audio ok mais pas top

This commit is contained in:
ordinarthur 2026-04-09 03:14:43 +02:00
parent 28d5bd44e0
commit e941ad1b14
4 changed files with 204 additions and 282 deletions

View File

@ -104,8 +104,12 @@ export class RobotGateway implements OnGatewayConnection, OnGatewayDisconnect, I
@SubscribeMessage('wake_word_detected')
async handleWakeWord(@ConnectedSocket() client: AuthenticatedSocket) {
this.logger.log(`Wake word detected on device ${client.data.deviceId}`);
client.emit('status', { state: 'listening' as RobotState });
// IMPORTANT: open the STT stream FIRST, then tell the client we're
// listening. The client flushes its buffered audio as soon as it
// sees `listening`, so if Deepgram isn't open yet those chunks are
// silently dropped with a "No active STT stream" warning.
await this.conversationPort.startListening(client.data.deviceId);
client.emit('status', { state: 'listening' as RobotState });
}
@SubscribeMessage('audio_chunk')

View File

@ -13,6 +13,13 @@ interface ActiveSession {
finalTranscription: string;
interimTranscription: string;
sttStream: ISTTStream | null;
/**
* Audio chunks that arrived between session creation and the STT
* stream actually being open. Drained as soon as `sttStream` is
* assigned so we never lose the first few hundred milliseconds of
* speech.
*/
pendingChunks: Buffer[];
}
const SYSTEM_PROMPT = `Tu es Ti-Pote, un petit robot de bureau animatronique, chaleureux et serviable.
@ -84,6 +91,7 @@ export class ConversationService implements IConversationPort {
finalTranscription: '',
interimTranscription: '',
sttStream: null,
pendingChunks: [],
};
this.activeSessions.set(deviceId, session);
@ -102,15 +110,32 @@ export class ConversationService implements IConversationPort {
});
session.sttStream = sttStream;
// Flush anything that arrived while Deepgram was spinning up.
if (session.pendingChunks.length > 0) {
this.logger.debug(
`Flushing ${session.pendingChunks.length} buffered chunks for ${deviceId}`,
);
for (const chunk of session.pendingChunks) {
sttStream.sendAudio(chunk);
}
session.pendingChunks = [];
}
}
processAudioChunk(deviceId: string, chunk: Buffer, sampleRate: number): void {
const session = this.activeSessions.get(deviceId);
if (!session?.sttStream) {
this.logger.warn(`No active STT stream for device ${deviceId}, ignoring audio chunk`);
if (!session) {
// No session at all → user sent audio without wake_word_detected.
// Safe to ignore.
return;
}
if (!session.sttStream) {
// Session exists but Deepgram is still opening. Buffer the chunk
// so it gets replayed as soon as the stream is ready.
session.pendingChunks.push(chunk);
return;
}
session.sttStream.sendAudio(chunk);
}

View File

@ -141,14 +141,26 @@ export class OrchestratorService extends EventEmitter {
/**
* Basic Voice Activity Detection: check if audio chunk contains significant signal.
* Uses RMS (root mean square) amplitude threshold.
*
* Uses AC-RMS the mean is subtracted from each sample before squaring,
* so any DC offset from the microphone (the INMP441 has one in the hundreds)
* doesn't artificially inflate the energy and prevent silence detection.
*/
private isAudioSignificant(chunk: Buffer, threshold = 200): boolean {
let sumSquares = 0;
private isAudioSignificant(chunk: Buffer, threshold = 300): boolean {
const samples = chunk.length / 2; // 16-bit = 2 bytes per sample
if (samples === 0) return false;
// First pass: DC mean of the chunk.
let sum = 0;
for (let i = 0; i < chunk.length - 1; i += 2) {
const sample = chunk.readInt16LE(i);
sum += chunk.readInt16LE(i);
}
const mean = sum / samples;
// Second pass: variance (= AC power) around the mean.
let sumSquares = 0;
for (let i = 0; i < chunk.length - 1; i += 2) {
const sample = chunk.readInt16LE(i) - mean;
sumSquares += sample * sample;
}

View File

@ -1,310 +1,191 @@
// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32)
// Ti-Pote — Robot Hardware firmware (ESP32-WROOM-32)
//
// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end.
// The command stream lives on Serial2 (hardware UART2, pins RX=27
// TX=13) which is wired to the Raspberry Pi's UART0 (/dev/serial0).
// The USB Serial port is kept only for boot-time diagnostics — all
// the real traffic goes over the UART to the Pi.
// Binary-frame protocol over Serial2 (UART2 remapped to RX=27 TX=13)
// wired to the Pi's /dev/serial0. This is the "production" firmware
// that the robot-client talks to. It continuously streams mic audio
// as AUDIO_UP frames and plays back AUDIO_DOWN frames on the speaker.
//
// On the host side, the same two scripts we used with the USB link
// work unchanged — just pass `ESP_PORT=/dev/serial0`:
// Pipeline overview:
//
// ESP_PORT=/dev/serial0 pnpm esp:record out.wav 3000
// ESP_PORT=/dev/serial0 pnpm esp:play out.wav
// ┌──────────────┐ AUDIO_UP ┌──────────────┐
// │ INMP441 │ ────► (S16 mono PCM) ─►│ Pi client │
// │ (mic) │ │ - wake word │
// └──────────────┘ │ - STT cloud │
// │ - TTS cloud │
// ┌──────────────┐ AUDIO_DOWN │ │
// │ MAX98357A │ ◄──── (S16 mono PCM) ◄──│ │
// │ (speaker) │ └──────────────┘
// └──────────────┘
//
// Protocol (same as before, 921600 baud, line-based for commands,
// raw bytes for audio payload):
// Commands also carried over the same link:
// PING / PONG — latency probe
// STATUS — Pi heartbeat (keeps us out of idle mode)
// DISPLAY_EMOTION — OLED eyes
// LOG — firmware → host human-readable logs
//
// host → esp32
// "PING\n" ping
// "REC <ms>\n" start recording for <ms> milliseconds
// "PLAY <bytes>\n" next <bytes> bytes on the wire are raw
// S16 LE mono 16 kHz PCM, play them
//
// esp32 → host
// "READY\n" once at boot
// "PONG\n" reply to PING
// "LOG <text>\n" human-readable log line
// "ERR <text>\n" error message
// "BEGIN <bytes>\n" start of a REC response
// "<raw bytes>" raw PCM (S16 LE mono 16 kHz)
// "END\n" end of a REC response
// "OK\n" command completed
//
// Wiring (shared I2S bus on I2S_NUM_0):
// BCLK = GPIO 32 (mic SCK + speaker BCLK)
// LRCLK = GPIO 33 (mic WS + speaker LRC)
// MIC = GPIO 34 (INMP441 SD → ESP32 data-in, input-only pin)
// SPK = GPIO 22 (ESP32 data-out → MAX98357A DIN)
// USB Serial is kept only as a boot logger (`pio device monitor -b 115200`).
#include <Arduino.h>
#include <driver/i2s.h>
#include <string.h>
#include "Protocol.h"
#include "Eyes.h"
#include "Audio.h"
// ──────────────────────────────────────────────────────────
// Comms config — UART2 to the Raspberry Pi
// ──────────────────────────────────────────────────────────
#ifndef HW_SERIAL_BAUD
#define HW_SERIAL_BAUD 921600
#endif
// Hardware UART2 remapped to pins that don't clash with anything
// else on the devkit. TX = GPIO 13, RX = GPIO 27.
#ifndef HW_HEARTBEAT_TIMEOUT_MS
#define HW_HEARTBEAT_TIMEOUT_MS 5000
#endif
// UART2 pin remap. Default UART2 pins (16/17) are already taken by
// the OLED, so we move Serial2 onto GPIO 27 (RX) and GPIO 13 (TX).
static constexpr int HW_UART_RX_PIN = 27;
static constexpr int HW_UART_TX_PIN = 13;
static constexpr long HW_UART_BAUD = 921600;
// HW_COMM is the Stream that carries the command/audio protocol.
// Changing this single #define lets us swap between USB (Serial)
// and the Pi-facing UART (Serial2).
// How many bytes of mic audio we stage per loop tick before emitting
// an AUDIO_UP frame. 640 B = 20 ms @ 16 kHz S16 mono.
static constexpr size_t AUDIO_UP_CHUNK_BYTES = 640;
#define HW_COMM Serial2
// ──────────────────────────────────────────────────────────
// Audio config
// ──────────────────────────────────────────────────────────
using namespace tipote;
static constexpr int SAMPLE_RATE = 16000;
static constexpr int PIN_BCLK = 32;
static constexpr int PIN_LRCLK = 33;
static constexpr int PIN_MIC_DIN = 34;
static constexpr int PIN_SPK_DOUT = 22;
static Eyes eyes;
static Audio audio;
static FrameDecoder decoder;
static constexpr int DMA_COUNT = 4;
static constexpr int DMA_LEN = 256;
static uint32_t lastHeartbeatMs = 0;
static bool idleMode = false;
static uint8_t micBuffer[AUDIO_UP_CHUNK_BYTES];
// Staging buffers — keep them outside of functions so we don't eat
// stack on every tick.
static constexpr size_t OUT_S16_SAMPLES = 320; // 20 ms of S16 mono
static int32_t g_rawStereo[OUT_S16_SAMPLES * 2];
static int16_t g_micMono [OUT_S16_SAMPLES];
static int32_t g_spkStereo[OUT_S16_SAMPLES * 2];
static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2]; // 640 bytes of S16 mono
// ──────────────────────────────────────────────────────────
// Line buffer for incoming text commands.
// ──────────────────────────────────────────────────────────
static char g_line[64];
static size_t g_lineLen = 0;
static void sendLog(const char* msg) {
HW_COMM.print("LOG ");
HW_COMM.println(msg);
}
static void sendErr(const char* msg) {
HW_COMM.print("ERR ");
HW_COMM.println(msg);
}
// ──────────────────────────────────────────────────────────
// I2S init — single port, full duplex, shared BCLK/WS.
// ──────────────────────────────────────────────────────────
static bool audioBegin() {
i2s_config_t cfg = {};
cfg.mode = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
I2S_MODE_RX |
I2S_MODE_TX);
cfg.sample_rate = SAMPLE_RATE;
cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT;
cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT;
cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
cfg.dma_buf_count = DMA_COUNT;
cfg.dma_buf_len = DMA_LEN;
cfg.use_apll = false;
cfg.tx_desc_auto_clear = true;
cfg.fixed_mclk = 0;
if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false;
i2s_pin_config_t pins = {};
pins.bck_io_num = PIN_BCLK;
pins.ws_io_num = PIN_LRCLK;
pins.data_out_num = PIN_SPK_DOUT;
pins.data_in_num = PIN_MIC_DIN;
if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
i2s_driver_uninstall(I2S_NUM_0);
return false;
}
i2s_zero_dma_buffer(I2S_NUM_0);
return true;
}
// Convert one batch of stereo 32-bit mic samples to S16 mono by
// taking the left slot and shifting the 24-bit-aligned data down.
// Returns the number of S16 samples written into `out`.
static size_t micReadMono(int16_t* out, size_t maxSamples) {
size_t wantPairs = maxSamples;
if (wantPairs > OUT_S16_SAMPLES) wantPairs = OUT_S16_SAMPLES;
size_t bytesRead = 0;
const esp_err_t err = i2s_read(
I2S_NUM_0,
g_rawStereo,
wantPairs * 2 * sizeof(int32_t),
&bytesRead,
portMAX_DELAY // block — we're in a dedicated REC loop
);
if (err != ESP_OK || bytesRead == 0) return 0;
const size_t pairs = bytesRead / (2 * sizeof(int32_t));
for (size_t i = 0; i < pairs; ++i) {
int32_t L = g_rawStereo[2 * i];
int32_t s = L >> 14;
if (s > INT16_MAX) s = INT16_MAX;
if (s < INT16_MIN) s = INT16_MIN;
out[i] = static_cast<int16_t>(s);
}
return pairs;
}
// Write one batch of S16 mono PCM to the speaker by duplicating each
// sample into both stereo slots and shifting into the high half of
// the 32-bit word (what the MAX98357A expects on a shared bus).
static void spkWriteMono(const int16_t* samples, size_t count) {
if (count == 0) return;
if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES;
for (size_t i = 0; i < count; ++i) {
const int32_t s32 = static_cast<int32_t>(samples[i]) << 16;
g_spkStereo[2 * i] = s32;
g_spkStereo[2 * i + 1] = s32;
}
size_t bytesWritten = 0;
i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t),
&bytesWritten, portMAX_DELAY);
}
// ──────────────────────────────────────────────────────────
// Command handlers
// ──────────────────────────────────────────────────────────
static void handleRec(uint32_t durationMs) {
const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000;
const uint32_t totalBytes = totalSamples * sizeof(int16_t);
HW_COMM.print("BEGIN ");
HW_COMM.println(totalBytes);
// Flush whatever old noise is in the mic DMA first.
i2s_zero_dma_buffer(I2S_NUM_0);
uint32_t sent = 0;
while (sent < totalSamples) {
size_t want = totalSamples - sent;
if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES;
const size_t got = micReadMono(g_micMono, want);
if (got == 0) continue;
HW_COMM.write(reinterpret_cast<const uint8_t*>(g_micMono),
got * sizeof(int16_t));
sent += got;
}
HW_COMM.println();
HW_COMM.println("END");
}
static void handlePlay(uint32_t totalBytes) {
// Drain any pending crap from the speaker DMA so we don't start
// with a pop.
i2s_zero_dma_buffer(I2S_NUM_0);
// Give readBytes a generous timeout so a jittery host doesn't
// abort us mid-playback.
HW_COMM.setTimeout(2000);
uint32_t remaining = totalBytes;
while (remaining > 0) {
size_t want = remaining;
if (want > sizeof(g_spkInBuf)) want = sizeof(g_spkInBuf);
// Force an even count so we always have complete S16 samples.
if (want & 1) want -= 1;
if (want == 0) want = 2;
const size_t got = HW_COMM.readBytes(g_spkInBuf, want);
if (got == 0) {
sendErr("PLAY read timeout");
return;
}
const size_t samples = got / sizeof(int16_t);
spkWriteMono(reinterpret_cast<const int16_t*>(g_spkInBuf), samples);
remaining -= got;
}
// Let the last frames actually reach the speaker, then clear.
delay(50);
i2s_zero_dma_buffer(I2S_NUM_0);
HW_COMM.println("OK");
}
static void handleLine(const char* line) {
if (strcmp(line, "PING") == 0) {
HW_COMM.println("PONG");
return;
}
if (strncmp(line, "REC ", 4) == 0) {
const long ms = atol(line + 4);
if (ms <= 0 || ms > 60000) { sendErr("REC bad duration"); return; }
handleRec(static_cast<uint32_t>(ms));
return;
}
if (strncmp(line, "PLAY ", 5) == 0) {
const long bytes = atol(line + 5);
if (bytes <= 0 || bytes > 16 * 1024 * 1024) {
sendErr("PLAY bad size");
return;
}
handlePlay(static_cast<uint32_t>(bytes));
return;
}
sendErr("unknown command");
}
// ──────────────────────────────────────────────────────────
// Arduino entry points
// ──────────────────────────────────────────────────────────
static void handleFrame(const Frame& frame, void* userData);
static void logLine(const char* line);
void setup() {
// USB Serial is kept as a *boot-time logger only*. It gives
// you something to look at via `pio device monitor` when the
// board is plugged into a laptop, without interfering with
// the Pi link on Serial2.
// USB Serial: boot-time diagnostics only.
Serial.begin(115200);
Serial.println("[boot] USB logger up, real comms on Serial2");
Serial.println("[boot] USB logger up (Serial2 is the real comms)");
// Bump the UART2 RX buffer WAY above the 256-byte default so we
// can absorb a full PLAY payload (up to a few tens of KB) without
// losing bytes if the host floods us.
// UART2 to the Pi, with a large RX buffer so AUDIO_DOWN bursts
// never overflow while we're busy pushing to I2S.
HW_COMM.setRxBufferSize(16 * 1024);
HW_COMM.begin(HW_UART_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN);
HW_COMM.begin(HW_SERIAL_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN);
HW_COMM.setTimeout(2000);
delay(50);
if (!audioBegin()) {
sendErr("I2S init failed");
eyes.begin();
if (!audio.begin()) {
logLine("audio: I2S init FAILED");
Serial.println("[boot] I2S init FAILED");
} else {
sendLog("I2S ready");
logLine("audio: I2S ready (mic + speaker)");
Serial.println("[boot] I2S ready");
}
HW_COMM.println("READY");
Serial.println("[boot] READY sent on Serial2");
decoder.onFrame(handleFrame);
lastHeartbeatMs = millis();
logLine("robot-hardware ready");
Serial.println("[boot] robot-hardware ready on Serial2");
}
void loop() {
// 1. Drain whatever the Pi has sent since the last tick and let
// the frame decoder emit complete frames to handleFrame().
while (HW_COMM.available() > 0) {
const int c = HW_COMM.read();
if (c < 0) break;
if (c == '\r') continue;
if (c == '\n') {
g_line[g_lineLen] = 0;
if (g_lineLen > 0) handleLine(g_line);
g_lineLen = 0;
continue;
}
if (g_lineLen < sizeof(g_line) - 1) {
g_line[g_lineLen++] = static_cast<char>(c);
} else {
g_lineLen = 0;
sendErr("line overflow");
const int b = HW_COMM.read();
if (b < 0) break;
decoder.feed(static_cast<uint8_t>(b));
}
// 2. Heartbeat watchdog — slip into sleepy idle if the Pi goes
// away. This also mutes the speaker DMA so an old tail
// doesn't loop forever on loss of heartbeat.
const uint32_t now = millis();
if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) {
idleMode = true;
eyes.show(Emotion::SLEEPY);
audio.flushSpeaker();
}
// 3. Continuous mic streaming. We skip this while in idle mode
// to keep the UART and the Pi CPU quiet when nobody is home.
if (!idleMode) {
const size_t bytes = audio.readMicChunk(micBuffer, sizeof(micBuffer));
if (bytes > 0) {
FrameEncoder::writeTo(HW_COMM, MsgType::AUDIO_UP,
micBuffer, static_cast<uint16_t>(bytes));
}
}
}
// ---------------------------------------------------------------
// Frame dispatcher
// ---------------------------------------------------------------
static void handleFrame(const Frame& frame, void* /*userData*/) {
lastHeartbeatMs = millis();
if (idleMode) {
idleMode = false;
}
switch (frame.type) {
case MsgType::DISPLAY_EMOTION: {
if (frame.length < 1) {
logLine("DISPLAY_EMOTION: empty payload");
return;
}
const uint8_t code = frame.payload[0];
if (code >= static_cast<uint8_t>(Emotion::COUNT)) {
logLine("DISPLAY_EMOTION: out-of-range code");
return;
}
eyes.show(static_cast<Emotion>(code));
const uint8_t ackPayload[1] = {code};
FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1);
return;
}
case MsgType::DISPLAY_CLEAR: {
eyes.clear();
FrameEncoder::writeTo(HW_COMM, MsgType::ACK);
return;
}
case MsgType::AUDIO_DOWN: {
// Raw PCM S16 mono 16 kHz — push it straight to the speaker
// DMA. i2s_write blocks until it has room; at 20 ms chunks
// the DMA drains fast enough that this stays responsive.
if (frame.length > 0) {
audio.writeSpeakerChunk(frame.payload, frame.length);
}
return;
}
case MsgType::PING: {
FrameEncoder::writeTo(HW_COMM, MsgType::PONG,
frame.payload, frame.length);
return;
}
case MsgType::STATUS: {
// Heartbeat handled at the top of the dispatcher.
return;
}
// TODO: SERVO_CMD / LED_CMD / SENSOR_DATA
default:
logLine("unknown frame type");
return;
}
}
static void logLine(const char* line) {
const size_t len = strnlen(line, MAX_PAYLOAD_SIZE);
FrameEncoder::writeTo(HW_COMM, MsgType::LOG,
reinterpret_cast<const uint8_t*>(line),
static_cast<uint16_t>(len));
}