Preface
How it started: while browsing Bilibili I came across the video 基于ESP32和AI大模型API的个人专属智能桌面小助手（目前接入的是ChatGPT）(a personal smart desktop assistant built on an ESP32 and a large-model API, currently hooked up to ChatGPT).
Even though I usually have plenty to keep me busy (definitely not gaming), it still caught the eye of someone whose life had gotten a bit dull, so why not DIY one myself?
Thanks to my laziness, even after consulting the video's creator, 时运君Jonathan, I dragged the project out for a full three months before finishing it.
Once again, my thanks to 时运君!
Choosing the hardware
I went with the M5Core2 from M5Stack.
The M5Core2 is the second-generation controller in the M5Stack development-kit series; it builds on the first-generation Core with stronger and more complete hardware. The main controller is an ESP32-D0WDQ6-V3 with two independently controllable Xtensa 32-bit LX6 cores running at up to 240 MHz, Wi-Fi support, 16 MB of on-board flash and 8 MB of PSRAM, and programs are downloaded over a USB Type-C port, so the board has plenty of headroom for demanding applications. The front carries a 2.0-inch integrated capacitive touch screen for smoother interaction. A built-in vibration motor provides haptic feedback and vibration alerts, and an on-board RTC keeps accurate time. Power is handled by an AXP192 power-management chip that keeps consumption under control, with a green power-indicator LED. The body also houses a TF-card (microSD) slot and a speaker; for better sound quality the speaker is driven by an amplifier over the I2S digital audio interface, which helps prevent signal distortion. A power button and a reset (RST) button sit on the left side and the bottom, the three dots on the front of the screen are part of the touch panel and can be mapped in software to three virtual buttons, and the expansion board on the back integrates a 6-axis IMU and a microphone.
In my tests the microphone, the speaker, and HTTP communication all work well, and ready-made libraries are already available in the Arduino IDE.
Choosing the APIs
For the text/speech conversions and for the core large model I used Baidu's APIs. The results are not ideal (ERNIE Bot is simply not as good as GPT, and Baidu's speech synthesis sounds quite stiff), so I plan to swap them out when I have time (in a hundred years or so).
Log in to 百度智能云 (Baidu AI Cloud, baidu.com), register an account, enable the short-text online speech synthesis and speech recognition services, enable the Qianfan large-model platform, and grab the corresponding apiKey and apiSecret; that is all you need.
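The sketches later in this post call a getAccessToken() helper that exchanges an apiKey/apiSecret pair for an access_token, but the helper itself is not listed. Here is a minimal sketch of one way to write it (not necessarily identical to the version in my full code), assuming Baidu AI Cloud's standard OAuth endpoint and skipping certificate validation for brevity:

```cpp
#include <HTTPClient.h>
#include <WiFiClientSecure.h>

// Hypothetical helper: exchange apiKey/apiSecret for an access_token via
// Baidu AI Cloud's OAuth endpoint. Parsing uses plain String methods to stay
// close to the rest of the sketch; error handling is minimal.
String getAccessToken(const String &apiKey, const String &apiSecret) {
  WiFiClientSecure client;
  client.setInsecure();  // skip certificate validation for brevity
  HTTPClient http;
  http.begin(client, "https://aip.baidubce.com/oauth/2.0/token"
                     "?grant_type=client_credentials"
                     "&client_id=" + apiKey +
                     "&client_secret=" + apiSecret);
  String token = "";
  if (http.POST("") == 200) {
    String body = http.getString();  // e.g. {"access_token":"24.xxx","expires_in":2592000,...}
    int start = body.indexOf("\"access_token\":\"");
    if (start >= 0) {
      start += strlen("\"access_token\":\"");
      int end = body.indexOf('"', start);
      if (end > start) token = body.substring(start, end);
    }
  }
  http.end();
  return token;
}
```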
Build walkthrough
Required libraries:

```cpp
#include <M5Unified.h>
#include <Arduino.h>
#include <AudioFileStream.h>
#include <HTTPClient.h>
#include <AudioOutput.h>
#include <AudioFileSourceHTTPStream.h>
#include <AudioGeneratorWAV.h>
#include <AudioOutputI2S.h>
#include <Avatar.h>
#include <base64.h>
```
Speech output (the mouth)
Definitions needed for speech output:
```cpp
// Triple-buffered AudioOutput implementation that hands decoded samples to
// M5Unified's Speaker_Class (M5.Speaker).
class AudioOutputM5Speaker : public AudioOutput {
public:
  AudioOutputM5Speaker(m5::Speaker_Class* m5sound, uint8_t virtual_sound_channel = 0) {
    _m5sound = m5sound;
    _virtual_ch = virtual_sound_channel;
  }
  virtual ~AudioOutputM5Speaker(void) {};

  virtual bool begin(void) override { return true; }

  virtual bool ConsumeSample(int16_t sample[2]) override {
    if (_tri_buffer_index < tri_buf_size) {
      _tri_buffer[_tri_index][_tri_buffer_index]     = sample[0];
      _tri_buffer[_tri_index][_tri_buffer_index + 1] = sample[1];
      _tri_buffer_index += 2;
      return true;
    }
    flush();  // buffer full: push what we have to the speaker
    return false;
  }

  virtual void flush(void) override {
    if (_tri_buffer_index) {
      _m5sound->playRaw(_tri_buffer[_tri_index], _tri_buffer_index, hertz, true, 1, _virtual_ch);
      _tri_index = _tri_index < 2 ? _tri_index + 1 : 0;
      _tri_buffer_index = 0;
      ++_update_count;
    }
  }

  virtual bool stop(void) override {
    flush();
    _m5sound->stop(_virtual_ch);
    for (size_t i = 0; i < 3; ++i) {
      memset(_tri_buffer[i], 0, tri_buf_size * sizeof(int16_t));
    }
    ++_update_count;
    return true;
  }

  const int16_t* getBuffer(void) const { return _tri_buffer[(_tri_index + 2) % 3]; }
  const uint32_t getUpdateCount(void) const { return _update_count; }

protected:
  m5::Speaker_Class* _m5sound;
  uint8_t _virtual_ch;
  static constexpr size_t tri_buf_size = 640;
  int16_t _tri_buffer[3][tri_buf_size];
  size_t _tri_buffer_index = 0;
  size_t _tri_index = 0;
  size_t _update_count = 0;
};

static AudioFileSourceHTTPStream *file = nullptr;
static AudioOutputM5Speaker out(&M5.Speaker, 0);
static AudioGenerator *decoder = nullptr;
```
Speaker output, merged with text-to-speech synthesis: this is the robot's mouth. Feed it text and the robot reads it out loud.
```cpp
// The robot's mouth: request Baidu TTS over HTTP and play the returned WAV
// stream, wiggling the avatar's mouth while audio is being decoded.
void s2vAnswer(String answer) {
  file = new AudioFileSourceHTTPStream((S2V_URL + access_to2 + "&tex=" + UrlEncode(answer)).c_str());
  decoder = new AudioGeneratorWAV();
  M5.Speaker.begin();
  M5.Speaker.setVolume(254);
  decoder->begin(file, &out);
  while (decoder->loop()) {
    // Open the avatar's mouth by a random amount for a simple lip-sync effect.
    float random_number = random(0, 100) / 100.0;
    avatar.setMouthOpenRatio(random_number);
  }
  avatar.setMouthOpenRatio(0);
  decoder->stop();
  M5.Speaker.end();
  delete decoder;
  decoder = nullptr;
  file->close();
  delete file;
  file = nullptr;
}
```
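s2vAnswer() also leans on a UrlEncode() helper that is not listed above. A simple percent-encoding version could look like this (illustrative, not necessarily the one in my full code):

```cpp
#include <Arduino.h>

// Hypothetical helper: percent-encode a String so it can be placed in a URL
// query parameter; unreserved characters pass through, everything else
// (including UTF-8 bytes of Chinese text) is escaped as %XX.
String UrlEncode(const String &text) {
  const char *hex = "0123456789ABCDEF";
  String encoded = "";
  for (size_t i = 0; i < text.length(); ++i) {
    unsigned char c = (unsigned char)text[i];
    if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
      encoded += (char)c;
    } else {
      encoded += '%';
      encoded += hex[(c >> 4) & 0x0F];
      encoded += hex[c & 0x0F];
    }
  }
  return encoded;
}
```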
Recording (the ears)
Definitions needed for recording:
```cpp
constexpr size_t record_number     = 250;
constexpr size_t record_length     = 250;
constexpr size_t record_size       = record_number * record_length;
constexpr size_t record_samplerate = 16000;
constexpr int    headerSize        = 44;
constexpr float  threshold         = 1000.0;
const auto size = record_size * sizeof(int16_t) + headerSize;
byte *record_buffer;

// Fill in a 44-byte WAV header (16-bit mono PCM at 16 kHz) and return a
// pointer to where the raw samples should be written.
int16_t *MakeHeader(byte *header) {
  const auto wavDataSize = record_number * record_length * 2;
  header[0] = 'R'; header[1] = 'I'; header[2] = 'F'; header[3] = 'F';
  unsigned int fileSizeMinus8 = wavDataSize + headerSize - 8;
  header[4] = (byte)(fileSizeMinus8 & 0xFF);
  header[5] = (byte)((fileSizeMinus8 >> 8) & 0xFF);
  header[6] = (byte)((fileSizeMinus8 >> 16) & 0xFF);
  header[7] = (byte)((fileSizeMinus8 >> 24) & 0xFF);
  header[8]  = 'W'; header[9]  = 'A'; header[10] = 'V'; header[11] = 'E';
  header[12] = 'f'; header[13] = 'm'; header[14] = 't'; header[15] = ' ';
  header[16] = 0x10; header[17] = 0x00; header[18] = 0x00; header[19] = 0x00;  // fmt chunk size = 16
  header[20] = 0x01; header[21] = 0x00;                                        // PCM
  header[22] = 0x01; header[23] = 0x00;                                        // 1 channel
  header[24] = 0x80; header[25] = 0x3E; header[26] = 0x00; header[27] = 0x00;  // sample rate 16000
  header[28] = 0x00; header[29] = 0x7D; header[30] = 0x00; header[31] = 0x00;  // byte rate 32000
  header[32] = 0x02; header[33] = 0x00;                                        // block align 2
  header[34] = 0x10; header[35] = 0x00;                                        // 16 bits per sample
  header[36] = 'd'; header[37] = 'a'; header[38] = 't'; header[39] = 'a';
  header[40] = (byte)(wavDataSize & 0xFF);
  header[41] = (byte)((wavDataSize >> 8) & 0xFF);
  header[42] = (byte)((wavDataSize >> 16) & 0xFF);
  header[43] = (byte)((wavDataSize >> 24) & 0xFF);
  return (int16_t *)&header[headerSize];
}
```
Recording from the microphone:
```cpp
void record() {
  M5.Mic.begin();
  auto *wavData = MakeHeader(record_buffer);
  for (int rec_record_idx = 0; rec_record_idx < record_number; ++rec_record_idx) {
    auto data = &wavData[rec_record_idx * record_length];
    M5.Mic.record(data, record_length, record_samplerate);
  }
  M5.Mic.end();
}
```
The robot's ears: get the text of the recording, which becomes the prompt sent up to the large model.
```cpp
// The robot's ears: record audio, base64-encode the WAV buffer, send it to
// Baidu's speech-recognition API, and return the recognized text.
String upQuestion() {
  record();
  http2.begin(V2S_URL);
  http2.addHeader("Content-Type", "application/json");
  http2.addHeader("Accept", "application/json");
  base64 Base64;
  String baseVoice = Base64.encode(record_buffer, record_size * sizeof(int16_t) + headerSize);
  String postload = "{"
                    "\"format\": \"wav\","
                    "\"rate\": 16000,"
                    "\"channel\": 1,"
                    "\"cuid\": \"cFpnMuV27HjkzJm2LlfpeFcsoHrDCTtt\","
                    "\"token\": \"" + access_to2 + "\","
                    "\"dev_pid\": 80001,"
                    "\"len\": " + String(record_size * sizeof(int16_t) + headerSize) + ","
                    "\"speech\": \"";
  postload.reserve(postload.length() + baseVoice.length() + 10);
  postload += baseVoice;
  postload += "\""
              "}";
  http2.addHeader("Content-Length", String(postload.length()));
  http2.POST(postload);
  String result = http2.getString();
  http2.end();
  String question = getResult(result);
  return question;
}
```
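getResult() is another helper omitted above; all it has to do is pull the first entry of the result array out of Baidu's JSON response. A rough sketch with plain String parsing (again illustrative rather than the exact version):

```cpp
// Hypothetical helper: extract the first entry of the "result" array from a
// Baidu ASR response such as {"err_no":0,"result":["hello"],"sn":"..."}.
// Returns an empty String when recognition failed or the field is missing.
String getResult(const String &response) {
  int start = response.indexOf("\"result\":[\"");
  if (start < 0) return "";
  start += strlen("\"result\":[\"");
  int end = response.indexOf("\"]", start);
  if (end < 0) return "";
  return response.substring(start, end);
}
```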
Connecting the model (the brain)
Wi-Fi connection:
```cpp
#include <WiFiMulti.h>  // needed in addition to the includes listed above

WiFiMulti wiFiMulti;

// Connect to the access point given by key (SSID) and secret (password),
// printing a message on the LCD if it still hasn't connected after 8 seconds.
void connectWifi(const char *key, const char *secret) {
  int sum = 0;
  wiFiMulti.addAP(key, secret);
  while (wiFiMulti.run() != WL_CONNECTED) {
    delay(1000);
    sum += 1;
    if (sum == 8) {
      M5.Lcd.print("Connect failed!");
    }
  }
  delay(500);
}
```
Initialization:
```cpp
void setup() {
  M5.begin();
  while (!WiFiInit()) {}
  // Fetch access tokens for the chat model and for the speech services.
  access_to  = getAccessToken(apiKey, apiSecret);
  access_to2 = getAccessToken(s2vKey, s2vSecret);
  // Allocate the recording buffer in PSRAM: 44-byte WAV header + 16-bit samples.
  record_buffer = static_cast<byte *>(::heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT));
  memset(record_buffer, 0, size);
  Serial.begin(9600);
}
```
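setup() calls WiFiInit(), which is not shown either. Assuming it just wraps connectWifi() with the network credentials and reports whether the connection came up, it could be as small as this (the SSID and password are placeholders):

```cpp
#include <WiFi.h>

// Hypothetical wrapper around connectWifi(): join the configured network and
// report whether the connection is up.
bool WiFiInit() {
  connectWifi("YOUR_SSID", "YOUR_PASSWORD");  // placeholder credentials
  return WiFi.status() == WL_CONNECTED;
}
```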
Finally, the core (the brain):
```cpp
void loop() {
  M5.update();
  if (M5.BtnA.wasClicked()) {
    // Button A: record a question, send it to the model, and speak the reply.
    String userInput = upQuestion();
    if (userInput != "") {
      http.begin(CHAT_URL + "?access_token=" + access_to);
      http.addHeader("Content-Type", "application/json");
      messages += message(0, userInput);  // append the user's message to the history
      String postPayload = "{"
                           "\"messages\":[" + messages + "],"
                           "\"stream\":true"
                           "}";
      int httpCode = http.POST(postPayload);
      if (httpCode > 0) {
        WiFiClient *stream = http.getStreamPtr();
        String total_content = "";
        String tmp = "";
        String result = "";
        // Read the streamed response line by line until the model reports it is done.
        while (true) {
          tmp = stream->readStringUntil('\n');
          if (getValue("is_end", tmp, 0) == "true") break;
          else {
            result = getValue("result", tmp, 1);
            if (result != "") {
              result.replace("\\n", "");
              s2vAnswer(result);        // speak each chunk as soon as it arrives
              total_content += result;
            }
          }
        }
        messages += message(1, total_content);  // append the assistant's reply to the history
      } else {
        // Request failed: roll the user's message back out of the history.
        messages = messages.substring(0, messages.length() - userInput.length());
      }
      http.end();
    }
  }
}
```
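Two more helpers appear here without being listed: message() formats one JSON chat message for the conversation history (role 0 for the user, 1 for the assistant), and getValue() pulls a field out of one line of the streamed response. The sketches below show one plausible way to write them; they are assumptions about the behavior, not the exact code:

```cpp
// Conversation history as a growing JSON fragment; in the sketch above this
// is the global `messages` used by loop().
String messages = "";

// Hypothetical helper: format one chat message for the Qianfan chat API.
// role 0 -> "user", anything else -> "assistant". A leading comma is added
// once the history is non-empty so the fragment stays valid JSON.
String message(int role, const String &content) {
  String item = "";
  if (messages.length() > 0) item += ",";
  item += "{\"role\":\"";
  item += (role == 0) ? "user" : "assistant";
  item += "\",\"content\":\"" + content + "\"}";
  return item;
}

// Hypothetical helper: extract the value of `key` from one streamed JSON line.
// quoted = 1 means the value is a quoted string, 0 means a bare token such as
// true/false; returns "" when the key is absent from the line.
String getValue(const String &key, const String &line, int quoted) {
  String pattern = "\"" + key + "\":";
  int start = line.indexOf(pattern);
  if (start < 0) return "";
  start += pattern.length();
  if (quoted) {
    start += 1;  // skip the opening quote
    int end = line.indexOf('"', start);
    if (end < 0) return "";
    return line.substring(start, end);
  }
  int end = line.indexOf(',', start);
  if (end < 0) end = line.indexOf('}', start);
  if (end < 0) return "";
  return line.substring(start, end);
}
```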
Afterword
What comes next is more than just swapping out the API.
The code should also be packaged into libraries of directly callable methods,
and then abstracted one step further into abstract classes or interfaces that different implementation classes can inherit from, adapting to different APIs under a common set of conventions (see the sketch below).
Of course, that will probably have to wait until I have a lot of free time.
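As a taste of what that abstraction could look like, here is a purely illustrative sketch of a text-to-speech interface with a Baidu-backed implementation; none of these class names exist in the current code:

```cpp
// Purely illustrative: one interface per capability, so the Baidu backend
// could later be swapped for another provider without touching loop().
class TextToSpeech {
public:
  virtual ~TextToSpeech() {}
  virtual void speak(const String &text) = 0;  // synthesize `text` and play it
};

class BaiduTextToSpeech : public TextToSpeech {
public:
  void speak(const String &text) override {
    s2vAnswer(text);  // delegate to the existing function for now
  }
};

// Callers would then hold a TextToSpeech* and never care which service sits
// behind it, e.g.:
//   TextToSpeech *tts = new BaiduTextToSpeech();
//   tts->speak("hello");
```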
The M5Stack Core2 is a powerful (and pricey) device whose features and technology stack go far beyond what is used here; I plan to explore and build on it further, but that is a story for another time.
Here is the official M5 documentation: m5-docs (m5stack.com)
If you need the full code, you can reach me through the WeChat QR code on my homepage.