Preface
How it started: while browsing Bilibili I came across the video 基于ESP32和AI大模型API的个人专属智能桌面小助手（目前接入的是ChatGPT）(a personal smart desktop assistant built on an ESP32 and a large-model API, currently hooked up to ChatGPT).
Even though I usually have plenty to keep me busy (definitely not gaming), it still caught the eye of someone whose life had gotten a bit dull, so why not DIY one myself?
Thanks to my laziness, even after consulting the video's creator, 时运君Jonathan, I dragged the project out for a full three months before finishing it.
Once again, my thanks to 时运君!
Choosing the hardware
I went with the M5Core2 from M5Stack.
The M5Core2 is the second-generation controller in the M5Stack development-kit series; it builds on the first-generation Core with stronger and more complete hardware. The main controller is an ESP32-D0WDQ6-V3 with two independently controllable Xtensa 32-bit LX6 cores running at up to 240 MHz, Wi-Fi support, 16 MB of on-board flash and 8 MB of PSRAM, and programs are downloaded over a USB Type-C port, so the board has plenty of headroom for demanding applications. The front carries a 2.0-inch integrated capacitive touch screen for smoother interaction. A built-in vibration motor provides haptic feedback and vibration alerts, and an on-board RTC keeps accurate time. Power is handled by an AXP192 power-management chip that keeps consumption under control, with a green power-indicator LED. The body also houses a TF-card (microSD) slot and a speaker; for better sound quality the speaker is driven by an amplifier over the I2S digital audio interface, which helps prevent signal distortion. A power button and a reset (RST) button sit on the left side and the bottom, the three dots on the front of the screen are part of the touch panel and can be mapped in software to three virtual buttons, and the expansion board on the back integrates a 6-axis IMU and a microphone.
In my tests the microphone, the speaker, and HTTP communication all work well, and ready-made libraries are already available in the Arduino IDE.
Choosing the APIs
For the text/speech conversions and for the core large model I used Baidu's APIs. The results are not ideal (ERNIE Bot is simply not as good as GPT, and Baidu's speech synthesis sounds quite stiff), so I plan to swap them out when I have time (in a hundred years or so).
Log in to 百度智能云 (Baidu AI Cloud, baidu.com), register an account, enable the short-text online speech synthesis and speech recognition services, enable the Qianfan large-model platform, and grab the corresponding apiKey and apiSecret; that is all you need.
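The sketches later in this post call a getAccessToken() helper that exchanges an apiKey/apiSecret pair for an access_token, but the helper itself is not listed. Here is a minimal sketch of one way to write it (not necessarily identical to the version in my full code), assuming Baidu AI Cloud's standard OAuth endpoint and skipping certificate validation for brevity:

```cpp
#include <HTTPClient.h>
#include <WiFiClientSecure.h>

// Hypothetical helper: exchange apiKey/apiSecret for an access_token via
// Baidu AI Cloud's OAuth endpoint. Parsing uses plain String methods to stay
// close to the rest of the sketch; error handling is minimal.
String getAccessToken(const String &apiKey, const String &apiSecret) {
  WiFiClientSecure client;
  client.setInsecure();  // skip certificate validation for brevity
  HTTPClient http;
  http.begin(client, "https://aip.baidubce.com/oauth/2.0/token"
                     "?grant_type=client_credentials"
                     "&client_id=" + apiKey +
                     "&client_secret=" + apiSecret);
  String token = "";
  if (http.POST("") == 200) {
    String body = http.getString();  // e.g. {"access_token":"24.xxx","expires_in":2592000,...}
    int start = body.indexOf("\"access_token\":\"");
    if (start >= 0) {
      start += strlen("\"access_token\":\"");
      int end = body.indexOf('"', start);
      if (end > start) token = body.substring(start, end);
    }
  }
  http.end();
  return token;
}
```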
Build walkthrough
Required libraries:

```cpp
#include <M5Unified.h>
#include <Arduino.h>
#include <AudioFileStream.h>
#include <HTTPClient.h>
#include <AudioOutput.h>
#include <AudioFileSourceHTTPStream.h>
#include <AudioGeneratorWAV.h>
#include <AudioOutputI2S.h>
#include <Avatar.h>
#include <base64.h>
```
Speech output (the mouth)
Definitions needed for speech output:
```cpp
// Triple-buffered AudioOutput implementation that hands decoded samples to
// M5Unified's Speaker_Class (M5.Speaker).
class AudioOutputM5Speaker : public AudioOutput {
public:
  AudioOutputM5Speaker(m5::Speaker_Class* m5sound, uint8_t virtual_sound_channel = 0) {
    _m5sound = m5sound;
    _virtual_ch = virtual_sound_channel;
  }
  virtual ~AudioOutputM5Speaker(void) {};

  virtual bool begin(void) override { return true; }

  virtual bool ConsumeSample(int16_t sample[2]) override {
    if (_tri_buffer_index < tri_buf_size) {
      _tri_buffer[_tri_index][_tri_buffer_index]     = sample[0];
      _tri_buffer[_tri_index][_tri_buffer_index + 1] = sample[1];
      _tri_buffer_index += 2;
      return true;
    }
    flush();  // buffer full: push what we have to the speaker
    return false;
  }

  virtual void flush(void) override {
    if (_tri_buffer_index) {
      _m5sound->playRaw(_tri_buffer[_tri_index], _tri_buffer_index, hertz, true, 1, _virtual_ch);
      _tri_index = _tri_index < 2 ? _tri_index + 1 : 0;
      _tri_buffer_index = 0;
      ++_update_count;
    }
  }

  virtual bool stop(void) override {
    flush();
    _m5sound->stop(_virtual_ch);
    for (size_t i = 0; i < 3; ++i) {
      memset(_tri_buffer[i], 0, tri_buf_size * sizeof(int16_t));
    }
    ++_update_count;
    return true;
  }

  const int16_t* getBuffer(void) const { return _tri_buffer[(_tri_index + 2) % 3]; }
  const uint32_t getUpdateCount(void) const { return _update_count; }

protected:
  m5::Speaker_Class* _m5sound;
  uint8_t _virtual_ch;
  static constexpr size_t tri_buf_size = 640;
  int16_t _tri_buffer[3][tri_buf_size];
  size_t _tri_buffer_index = 0;
  size_t _tri_index = 0;
  size_t _update_count = 0;
};

static AudioFileSourceHTTPStream *file = nullptr;
static AudioOutputM5Speaker out(&M5.Speaker, 0);
static AudioGenerator *decoder = nullptr;
```
Speaker output, merged with text-to-speech synthesis: this is the robot's mouth. Feed it text and the robot reads it out loud.
```cpp
// The robot's mouth: request Baidu TTS over HTTP and play the returned WAV
// stream, wiggling the avatar's mouth while audio is being decoded.
void s2vAnswer(String answer) {
  file = new AudioFileSourceHTTPStream((S2V_URL + access_to2 + "&tex=" + UrlEncode(answer)).c_str());
  decoder = new AudioGeneratorWAV();
  M5.Speaker.begin();
  M5.Speaker.setVolume(254);
  decoder->begin(file, &out);
  while (decoder->loop()) {
    // Open the avatar's mouth by a random amount for a simple lip-sync effect.
    float random_number = random(0, 100) / 100.0;
    avatar.setMouthOpenRatio(random_number);
  }
  avatar.setMouthOpenRatio(0);
  decoder->stop();
  M5.Speaker.end();
  delete decoder;
  decoder = nullptr;
  file->close();
  delete file;
  file = nullptr;
}
```
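s2vAnswer() also leans on a UrlEncode() helper that is not listed above. A simple percent-encoding version could look like this (illustrative, not necessarily the one in my full code):

```cpp
#include <Arduino.h>

// Hypothetical helper: percent-encode a String so it can be placed in a URL
// query parameter; unreserved characters pass through, everything else
// (including UTF-8 bytes of Chinese text) is escaped as %XX.
String UrlEncode(const String &text) {
  const char *hex = "0123456789ABCDEF";
  String encoded = "";
  for (size_t i = 0; i < text.length(); ++i) {
    unsigned char c = (unsigned char)text[i];
    if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
      encoded += (char)c;
    } else {
      encoded += '%';
      encoded += hex[(c >> 4) & 0x0F];
      encoded += hex[c & 0x0F];
    }
  }
  return encoded;
}
```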
Recording (the ears)
Definitions needed for recording:
```cpp
constexpr size_t record_number     = 250;
constexpr size_t record_length     = 250;
constexpr size_t record_size       = record_number * record_length;
constexpr size_t record_samplerate = 16000;
constexpr int    headerSize        = 44;
constexpr float  threshold         = 1000.0;
const auto size = record_size * sizeof(int16_t) + headerSize;
byte *record_buffer;

// Fill in a 44-byte WAV header (16-bit mono PCM at 16 kHz) and return a
// pointer to where the raw samples should be written.
int16_t *MakeHeader(byte *header) {
  const auto wavDataSize = record_number * record_length * 2;
  header[0] = 'R'; header[1] = 'I'; header[2] = 'F'; header[3] = 'F';
  unsigned int fileSizeMinus8 = wavDataSize + headerSize - 8;
  header[4] = (byte)(fileSizeMinus8 & 0xFF);
  header[5] = (byte)((fileSizeMinus8 >> 8) & 0xFF);
  header[6] = (byte)((fileSizeMinus8 >> 16) & 0xFF);
  header[7] = (byte)((fileSizeMinus8 >> 24) & 0xFF);
  header[8]  = 'W'; header[9]  = 'A'; header[10] = 'V'; header[11] = 'E';
  header[12] = 'f'; header[13] = 'm'; header[14] = 't'; header[15] = ' ';
  header[16] = 0x10; header[17] = 0x00; header[18] = 0x00; header[19] = 0x00;  // fmt chunk size = 16
  header[20] = 0x01; header[21] = 0x00;                                        // PCM
  header[22] = 0x01; header[23] = 0x00;                                        // 1 channel
  header[24] = 0x80; header[25] = 0x3E; header[26] = 0x00; header[27] = 0x00;  // sample rate 16000
  header[28] = 0x00; header[29] = 0x7D; header[30] = 0x00; header[31] = 0x00;  // byte rate 32000
  header[32] = 0x02; header[33] = 0x00;                                        // block align 2
  header[34] = 0x10; header[35] = 0x00;                                        // 16 bits per sample
  header[36] = 'd'; header[37] = 'a'; header[38] = 't'; header[39] = 'a';
  header[40] = (byte)(wavDataSize & 0xFF);
  header[41] = (byte)((wavDataSize >> 8) & 0xFF);
  header[42] = (byte)((wavDataSize >> 16) & 0xFF);
  header[43] = (byte)((wavDataSize >> 24) & 0xFF);
  return (int16_t *)&header[headerSize];
}
```
Recording from the microphone:
```cpp
void record() {
  M5.Mic.begin();
  auto *wavData = MakeHeader(record_buffer);
  for (int rec_record_idx = 0; rec_record_idx < record_number; ++rec_record_idx) {
    auto data = &wavData[rec_record_idx * record_length];
    M5.Mic.record(data, record_length, record_samplerate);
  }
  M5.Mic.end();
}
```
The robot's ears: get the text of the recording, which becomes the prompt sent up to the large model.
```cpp
// The robot's ears: record audio, base64-encode the WAV buffer, send it to
// Baidu's speech-recognition API, and return the recognized text.
String upQuestion() {
  record();
  http2.begin(V2S_URL);
  http2.addHeader("Content-Type", "application/json");
  http2.addHeader("Accept", "application/json");
  base64 Base64;
  String baseVoice = Base64.encode(record_buffer, record_size * sizeof(int16_t) + headerSize);
  String postload = "{"
                    "\"format\": \"wav\","
                    "\"rate\": 16000,"
                    "\"channel\": 1,"
                    "\"cuid\": \"cFpnMuV27HjkzJm2LlfpeFcsoHrDCTtt\","
                    "\"token\": \"" + access_to2 + "\","
                    "\"dev_pid\": 80001,"
                    "\"len\": " + String(record_size * sizeof(int16_t) + headerSize) + ","
                    "\"speech\": \"";
  postload.reserve(postload.length() + baseVoice.length() + 10);
  postload += baseVoice;
  postload += "\""
              "}";
  http2.addHeader("Content-Length", String(postload.length()));
  http2.POST(postload);
  String result = http2.getString();
  http2.end();
  String question = getResult(result);
  return question;
}
```
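getResult() is another helper omitted above; all it has to do is pull the first entry of the result array out of Baidu's JSON response. A rough sketch with plain String parsing (again illustrative rather than the exact version):

```cpp
// Hypothetical helper: extract the first entry of the "result" array from a
// Baidu ASR response such as {"err_no":0,"result":["hello"],"sn":"..."}.
// Returns an empty String when recognition failed or the field is missing.
String getResult(const String &response) {
  int start = response.indexOf("\"result\":[\"");
  if (start < 0) return "";
  start += strlen("\"result\":[\"");
  int end = response.indexOf("\"]", start);
  if (end < 0) return "";
  return response.substring(start, end);
}
```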
Connecting the model (the brain)
Wi-Fi connection:
```cpp
#include <WiFiMulti.h>  // needed in addition to the includes listed above

WiFiMulti wiFiMulti;

// Connect to the access point given by key (SSID) and secret (password),
// printing a message on the LCD if it still hasn't connected after 8 seconds.
void connectWifi(const char *key, const char *secret) {
  int sum = 0;
  wiFiMulti.addAP(key, secret);
  while (wiFiMulti.run() != WL_CONNECTED) {
    delay(1000);
    sum += 1;
    if (sum == 8) {
      M5.Lcd.print("Connect failed!");
    }
  }
  delay(500);
}
```
Initialization:
```cpp
void setup() {
  M5.begin();
  while (!WiFiInit()) {}
  // Fetch access tokens for the chat model and for the speech services.
  access_to  = getAccessToken(apiKey, apiSecret);
  access_to2 = getAccessToken(s2vKey, s2vSecret);
  // Allocate the recording buffer in PSRAM: 44-byte WAV header + 16-bit samples.
  record_buffer = static_cast<byte *>(::heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT));
  memset(record_buffer, 0, size);
  Serial.begin(9600);
}
```
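setup() calls WiFiInit(), which is not shown either. Assuming it just wraps connectWifi() with the network credentials and reports whether the connection came up, it could be as small as this (the SSID and password are placeholders):

```cpp
#include <WiFi.h>

// Hypothetical wrapper around connectWifi(): join the configured network and
// report whether the connection is up.
bool WiFiInit() {
  connectWifi("YOUR_SSID", "YOUR_PASSWORD");  // placeholder credentials
  return WiFi.status() == WL_CONNECTED;
}
```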
Finally, the core (the brain):
```cpp
void loop() {
  M5.update();
  if (M5.BtnA.wasClicked()) {
    // Button A: record a question, send it to the model, and speak the reply.
    String userInput = upQuestion();
    if (userInput != "") {
      http.begin(CHAT_URL + "?access_token=" + access_to);
      http.addHeader("Content-Type", "application/json");
      messages += message(0, userInput);  // append the user's message to the history
      String postPayload = "{"
                           "\"messages\":[" + messages + "],"
                           "\"stream\":true"
                           "}";
      int httpCode = http.POST(postPayload);
      if (httpCode > 0) {
        WiFiClient *stream = http.getStreamPtr();
        String total_content = "";
        String tmp = "";
        String result = "";
        // Read the streamed response line by line until the model reports it is done.
        while (true) {
          tmp = stream->readStringUntil('\n');
          if (getValue("is_end", tmp, 0) == "true") break;
          else {
            result = getValue("result", tmp, 1);
            if (result != "") {
              result.replace("\\n", "");
              s2vAnswer(result);        // speak each chunk as soon as it arrives
              total_content += result;
            }
          }
        }
        messages += message(1, total_content);  // append the assistant's reply to the history
      } else {
        // Request failed: roll the user's message back out of the history.
        messages = messages.substring(0, messages.length() - userInput.length());
      }
      http.end();
    }
  }
}
```
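Two more helpers appear here without being listed: message() formats one JSON chat message for the conversation history (role 0 for the user, 1 for the assistant), and getValue() pulls a field out of one line of the streamed response. The sketches below show one plausible way to write them; they are assumptions about the behavior, not the exact code:

```cpp
// Conversation history as a growing JSON fragment; in the sketch above this
// is the global `messages` used by loop().
String messages = "";

// Hypothetical helper: format one chat message for the Qianfan chat API.
// role 0 -> "user", anything else -> "assistant". A leading comma is added
// once the history is non-empty so the fragment stays valid JSON.
String message(int role, const String &content) {
  String item = "";
  if (messages.length() > 0) item += ",";
  item += "{\"role\":\"";
  item += (role == 0) ? "user" : "assistant";
  item += "\",\"content\":\"" + content + "\"}";
  return item;
}

// Hypothetical helper: extract the value of `key` from one streamed JSON line.
// quoted = 1 means the value is a quoted string, 0 means a bare token such as
// true/false; returns "" when the key is absent from the line.
String getValue(const String &key, const String &line, int quoted) {
  String pattern = "\"" + key + "\":";
  int start = line.indexOf(pattern);
  if (start < 0) return "";
  start += pattern.length();
  if (quoted) {
    start += 1;  // skip the opening quote
    int end = line.indexOf('"', start);
    if (end < 0) return "";
    return line.substring(start, end);
  }
  int end = line.indexOf(',', start);
  if (end < 0) end = line.indexOf('}', start);
  if (end < 0) return "";
  return line.substring(start, end);
}
```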
Afterword
What comes next is more than just swapping out the API.
The code should also be packaged into libraries of directly callable methods,
and then abstracted one step further into abstract classes or interfaces that different implementation classes can inherit from, adapting to different APIs under a common set of conventions (see the sketch below).
Of course, that will probably have to wait until I have a lot of free time.
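As a taste of what that abstraction could look like, here is a purely illustrative sketch of a text-to-speech interface with a Baidu-backed implementation; none of these class names exist in the current code:

```cpp
// Purely illustrative: one interface per capability, so the Baidu backend
// could later be swapped for another provider without touching loop().
class TextToSpeech {
public:
  virtual ~TextToSpeech() {}
  virtual void speak(const String &text) = 0;  // synthesize `text` and play it
};

class BaiduTextToSpeech : public TextToSpeech {
public:
  void speak(const String &text) override {
    s2vAnswer(text);  // delegate to the existing function for now
  }
};

// Callers would then hold a TextToSpeech* and never care which service sits
// behind it, e.g.:
//   TextToSpeech *tts = new BaiduTextToSpeech();
//   tts->speak("hello");
```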
The M5Stack Core2 is a powerful (and pricey) device whose features and technology stack go far beyond what is used here; I plan to explore and build on it further, but that is a story for another time.
Here is the official M5 documentation: m5-docs (m5stack.com)
If you need the full code, you can reach me through the WeChat QR code on my homepage.