diff --git a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts index 0e9a806..2899537 100644 --- a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts +++ b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts @@ -104,8 +104,12 @@ export class RobotGateway implements OnGatewayConnection, OnGatewayDisconnect, I @SubscribeMessage('wake_word_detected') async handleWakeWord(@ConnectedSocket() client: AuthenticatedSocket) { this.logger.log(`Wake word detected on device ${client.data.deviceId}`); - client.emit('status', { state: 'listening' as RobotState }); + // IMPORTANT: open the STT stream FIRST, then tell the client we're + // listening. The client flushes its buffered audio as soon as it + // sees `listening`, so if Deepgram isn't open yet those chunks are + // silently dropped with a "No active STT stream" warning. await this.conversationPort.startListening(client.data.deviceId); + client.emit('status', { state: 'listening' as RobotState }); } @SubscribeMessage('audio_chunk') diff --git a/apps/backend/src/core/services/conversation.service.ts b/apps/backend/src/core/services/conversation.service.ts index f265a78..a5a5b73 100644 --- a/apps/backend/src/core/services/conversation.service.ts +++ b/apps/backend/src/core/services/conversation.service.ts @@ -13,6 +13,13 @@ interface ActiveSession { finalTranscription: string; interimTranscription: string; sttStream: ISTTStream | null; + /** + * Audio chunks that arrived between session creation and the STT + * stream actually being open. Drained as soon as `sttStream` is + * assigned so we never lose the first few hundred milliseconds of + * speech. + */ + pendingChunks: Buffer[]; } const SYSTEM_PROMPT = `Tu es Ti-Pote, un petit robot de bureau animatronique, chaleureux et serviable. @@ -84,6 +91,7 @@ export class ConversationService implements IConversationPort { finalTranscription: '', interimTranscription: '', sttStream: null, + pendingChunks: [], }; this.activeSessions.set(deviceId, session); @@ -102,15 +110,32 @@ export class ConversationService implements IConversationPort { }); session.sttStream = sttStream; + + // Flush anything that arrived while Deepgram was spinning up. + if (session.pendingChunks.length > 0) { + this.logger.debug( + `Flushing ${session.pendingChunks.length} buffered chunks for ${deviceId}`, + ); + for (const chunk of session.pendingChunks) { + sttStream.sendAudio(chunk); + } + session.pendingChunks = []; + } } processAudioChunk(deviceId: string, chunk: Buffer, sampleRate: number): void { const session = this.activeSessions.get(deviceId); - if (!session?.sttStream) { - this.logger.warn(`No active STT stream for device ${deviceId}, ignoring audio chunk`); + if (!session) { + // No session at all → user sent audio without wake_word_detected. + // Safe to ignore. + return; + } + if (!session.sttStream) { + // Session exists but Deepgram is still opening. Buffer the chunk + // so it gets replayed as soon as the stream is ready. + session.pendingChunks.push(chunk); return; } - session.sttStream.sendAudio(chunk); } diff --git a/apps/robot-client/src/services/orchestrator.service.ts b/apps/robot-client/src/services/orchestrator.service.ts index c85c8d9..c06e00f 100644 --- a/apps/robot-client/src/services/orchestrator.service.ts +++ b/apps/robot-client/src/services/orchestrator.service.ts @@ -141,14 +141,26 @@ export class OrchestratorService extends EventEmitter { /** * Basic Voice Activity Detection: check if audio chunk contains significant signal. - * Uses RMS (root mean square) amplitude threshold. + * + * Uses AC-RMS — the mean is subtracted from each sample before squaring, + * so any DC offset from the microphone (the INMP441 has one in the hundreds) + * doesn't artificially inflate the energy and prevent silence detection. */ - private isAudioSignificant(chunk: Buffer, threshold = 200): boolean { - let sumSquares = 0; + private isAudioSignificant(chunk: Buffer, threshold = 300): boolean { const samples = chunk.length / 2; // 16-bit = 2 bytes per sample + if (samples === 0) return false; + // First pass: DC mean of the chunk. + let sum = 0; for (let i = 0; i < chunk.length - 1; i += 2) { - const sample = chunk.readInt16LE(i); + sum += chunk.readInt16LE(i); + } + const mean = sum / samples; + + // Second pass: variance (= AC power) around the mean. + let sumSquares = 0; + for (let i = 0; i < chunk.length - 1; i += 2) { + const sample = chunk.readInt16LE(i) - mean; sumSquares += sample * sample; } diff --git a/apps/robot-hardware/src/main.cpp b/apps/robot-hardware/src/main.cpp index 14426e5..8dc97c1 100644 --- a/apps/robot-hardware/src/main.cpp +++ b/apps/robot-hardware/src/main.cpp @@ -1,310 +1,191 @@ -// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32) +// Ti-Pote — Robot Hardware firmware (ESP32-WROOM-32) // -// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end. -// The command stream lives on Serial2 (hardware UART2, pins RX=27 -// TX=13) which is wired to the Raspberry Pi's UART0 (/dev/serial0). -// The USB Serial port is kept only for boot-time diagnostics — all -// the real traffic goes over the UART to the Pi. +// Binary-frame protocol over Serial2 (UART2 remapped to RX=27 TX=13) +// wired to the Pi's /dev/serial0. This is the "production" firmware +// that the robot-client talks to. It continuously streams mic audio +// as AUDIO_UP frames and plays back AUDIO_DOWN frames on the speaker. // -// On the host side, the same two scripts we used with the USB link -// work unchanged — just pass `ESP_PORT=/dev/serial0`: +// Pipeline overview: // -// ESP_PORT=/dev/serial0 pnpm esp:record out.wav 3000 -// ESP_PORT=/dev/serial0 pnpm esp:play out.wav +// ┌──────────────┐ AUDIO_UP ┌──────────────┐ +// │ INMP441 │ ────► (S16 mono PCM) ─►│ Pi client │ +// │ (mic) │ │ - wake word │ +// └──────────────┘ │ - STT cloud │ +// │ - TTS cloud │ +// ┌──────────────┐ AUDIO_DOWN │ │ +// │ MAX98357A │ ◄──── (S16 mono PCM) ◄──│ │ +// │ (speaker) │ └──────────────┘ +// └──────────────┘ // -// Protocol (same as before, 921600 baud, line-based for commands, -// raw bytes for audio payload): +// Commands also carried over the same link: +// PING / PONG — latency probe +// STATUS — Pi heartbeat (keeps us out of idle mode) +// DISPLAY_EMOTION — OLED eyes +// LOG — firmware → host human-readable logs // -// host → esp32 -// "PING\n" ping -// "REC \n" start recording for milliseconds -// "PLAY \n" next bytes on the wire are raw -// S16 LE mono 16 kHz PCM, play them -// -// esp32 → host -// "READY\n" once at boot -// "PONG\n" reply to PING -// "LOG \n" human-readable log line -// "ERR \n" error message -// "BEGIN \n" start of a REC response -// "" raw PCM (S16 LE mono 16 kHz) -// "END\n" end of a REC response -// "OK\n" command completed -// -// Wiring (shared I2S bus on I2S_NUM_0): -// BCLK = GPIO 32 (mic SCK + speaker BCLK) -// LRCLK = GPIO 33 (mic WS + speaker LRC) -// MIC = GPIO 34 (INMP441 SD → ESP32 data-in, input-only pin) -// SPK = GPIO 22 (ESP32 data-out → MAX98357A DIN) +// USB Serial is kept only as a boot logger (`pio device monitor -b 115200`). #include -#include -#include +#include "Protocol.h" +#include "Eyes.h" +#include "Audio.h" -// ────────────────────────────────────────────────────────── -// Comms config — UART2 to the Raspberry Pi -// ────────────────────────────────────────────────────────── +#ifndef HW_SERIAL_BAUD +#define HW_SERIAL_BAUD 921600 +#endif -// Hardware UART2 remapped to pins that don't clash with anything -// else on the devkit. TX = GPIO 13, RX = GPIO 27. +#ifndef HW_HEARTBEAT_TIMEOUT_MS +#define HW_HEARTBEAT_TIMEOUT_MS 5000 +#endif + +// UART2 pin remap. Default UART2 pins (16/17) are already taken by +// the OLED, so we move Serial2 onto GPIO 27 (RX) and GPIO 13 (TX). static constexpr int HW_UART_RX_PIN = 27; static constexpr int HW_UART_TX_PIN = 13; -static constexpr long HW_UART_BAUD = 921600; -// HW_COMM is the Stream that carries the command/audio protocol. -// Changing this single #define lets us swap between USB (Serial) -// and the Pi-facing UART (Serial2). +// How many bytes of mic audio we stage per loop tick before emitting +// an AUDIO_UP frame. 640 B = 20 ms @ 16 kHz S16 mono. +static constexpr size_t AUDIO_UP_CHUNK_BYTES = 640; + #define HW_COMM Serial2 -// ────────────────────────────────────────────────────────── -// Audio config -// ────────────────────────────────────────────────────────── +using namespace tipote; -static constexpr int SAMPLE_RATE = 16000; -static constexpr int PIN_BCLK = 32; -static constexpr int PIN_LRCLK = 33; -static constexpr int PIN_MIC_DIN = 34; -static constexpr int PIN_SPK_DOUT = 22; +static Eyes eyes; +static Audio audio; +static FrameDecoder decoder; -static constexpr int DMA_COUNT = 4; -static constexpr int DMA_LEN = 256; +static uint32_t lastHeartbeatMs = 0; +static bool idleMode = false; +static uint8_t micBuffer[AUDIO_UP_CHUNK_BYTES]; -// Staging buffers — keep them outside of functions so we don't eat -// stack on every tick. -static constexpr size_t OUT_S16_SAMPLES = 320; // 20 ms of S16 mono -static int32_t g_rawStereo[OUT_S16_SAMPLES * 2]; -static int16_t g_micMono [OUT_S16_SAMPLES]; -static int32_t g_spkStereo[OUT_S16_SAMPLES * 2]; -static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2]; // 640 bytes of S16 mono - -// ────────────────────────────────────────────────────────── -// Line buffer for incoming text commands. -// ────────────────────────────────────────────────────────── - -static char g_line[64]; -static size_t g_lineLen = 0; - -static void sendLog(const char* msg) { - HW_COMM.print("LOG "); - HW_COMM.println(msg); -} - -static void sendErr(const char* msg) { - HW_COMM.print("ERR "); - HW_COMM.println(msg); -} - -// ────────────────────────────────────────────────────────── -// I2S init — single port, full duplex, shared BCLK/WS. -// ────────────────────────────────────────────────────────── - -static bool audioBegin() { - i2s_config_t cfg = {}; - cfg.mode = static_cast(I2S_MODE_MASTER | - I2S_MODE_RX | - I2S_MODE_TX); - cfg.sample_rate = SAMPLE_RATE; - cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT; - cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT; - cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S; - cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1; - cfg.dma_buf_count = DMA_COUNT; - cfg.dma_buf_len = DMA_LEN; - cfg.use_apll = false; - cfg.tx_desc_auto_clear = true; - cfg.fixed_mclk = 0; - - if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false; - - i2s_pin_config_t pins = {}; - pins.bck_io_num = PIN_BCLK; - pins.ws_io_num = PIN_LRCLK; - pins.data_out_num = PIN_SPK_DOUT; - pins.data_in_num = PIN_MIC_DIN; - if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) { - i2s_driver_uninstall(I2S_NUM_0); - return false; - } - i2s_zero_dma_buffer(I2S_NUM_0); - return true; -} - -// Convert one batch of stereo 32-bit mic samples to S16 mono by -// taking the left slot and shifting the 24-bit-aligned data down. -// Returns the number of S16 samples written into `out`. -static size_t micReadMono(int16_t* out, size_t maxSamples) { - size_t wantPairs = maxSamples; - if (wantPairs > OUT_S16_SAMPLES) wantPairs = OUT_S16_SAMPLES; - - size_t bytesRead = 0; - const esp_err_t err = i2s_read( - I2S_NUM_0, - g_rawStereo, - wantPairs * 2 * sizeof(int32_t), - &bytesRead, - portMAX_DELAY // block — we're in a dedicated REC loop - ); - if (err != ESP_OK || bytesRead == 0) return 0; - - const size_t pairs = bytesRead / (2 * sizeof(int32_t)); - for (size_t i = 0; i < pairs; ++i) { - int32_t L = g_rawStereo[2 * i]; - int32_t s = L >> 14; - if (s > INT16_MAX) s = INT16_MAX; - if (s < INT16_MIN) s = INT16_MIN; - out[i] = static_cast(s); - } - return pairs; -} - -// Write one batch of S16 mono PCM to the speaker by duplicating each -// sample into both stereo slots and shifting into the high half of -// the 32-bit word (what the MAX98357A expects on a shared bus). -static void spkWriteMono(const int16_t* samples, size_t count) { - if (count == 0) return; - if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES; - for (size_t i = 0; i < count; ++i) { - const int32_t s32 = static_cast(samples[i]) << 16; - g_spkStereo[2 * i] = s32; - g_spkStereo[2 * i + 1] = s32; - } - size_t bytesWritten = 0; - i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t), - &bytesWritten, portMAX_DELAY); -} - -// ────────────────────────────────────────────────────────── -// Command handlers -// ────────────────────────────────────────────────────────── - -static void handleRec(uint32_t durationMs) { - const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000; - const uint32_t totalBytes = totalSamples * sizeof(int16_t); - - HW_COMM.print("BEGIN "); - HW_COMM.println(totalBytes); - - // Flush whatever old noise is in the mic DMA first. - i2s_zero_dma_buffer(I2S_NUM_0); - - uint32_t sent = 0; - while (sent < totalSamples) { - size_t want = totalSamples - sent; - if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES; - const size_t got = micReadMono(g_micMono, want); - if (got == 0) continue; - HW_COMM.write(reinterpret_cast(g_micMono), - got * sizeof(int16_t)); - sent += got; - } - - HW_COMM.println(); - HW_COMM.println("END"); -} - -static void handlePlay(uint32_t totalBytes) { - // Drain any pending crap from the speaker DMA so we don't start - // with a pop. - i2s_zero_dma_buffer(I2S_NUM_0); - - // Give readBytes a generous timeout so a jittery host doesn't - // abort us mid-playback. - HW_COMM.setTimeout(2000); - - uint32_t remaining = totalBytes; - while (remaining > 0) { - size_t want = remaining; - if (want > sizeof(g_spkInBuf)) want = sizeof(g_spkInBuf); - // Force an even count so we always have complete S16 samples. - if (want & 1) want -= 1; - if (want == 0) want = 2; - - const size_t got = HW_COMM.readBytes(g_spkInBuf, want); - if (got == 0) { - sendErr("PLAY read timeout"); - return; - } - const size_t samples = got / sizeof(int16_t); - spkWriteMono(reinterpret_cast(g_spkInBuf), samples); - remaining -= got; - } - - // Let the last frames actually reach the speaker, then clear. - delay(50); - i2s_zero_dma_buffer(I2S_NUM_0); - HW_COMM.println("OK"); -} - -static void handleLine(const char* line) { - if (strcmp(line, "PING") == 0) { - HW_COMM.println("PONG"); - return; - } - if (strncmp(line, "REC ", 4) == 0) { - const long ms = atol(line + 4); - if (ms <= 0 || ms > 60000) { sendErr("REC bad duration"); return; } - handleRec(static_cast(ms)); - return; - } - if (strncmp(line, "PLAY ", 5) == 0) { - const long bytes = atol(line + 5); - if (bytes <= 0 || bytes > 16 * 1024 * 1024) { - sendErr("PLAY bad size"); - return; - } - handlePlay(static_cast(bytes)); - return; - } - sendErr("unknown command"); -} - -// ────────────────────────────────────────────────────────── -// Arduino entry points -// ────────────────────────────────────────────────────────── +static void handleFrame(const Frame& frame, void* userData); +static void logLine(const char* line); void setup() { - // USB Serial is kept as a *boot-time logger only*. It gives - // you something to look at via `pio device monitor` when the - // board is plugged into a laptop, without interfering with - // the Pi link on Serial2. + // USB Serial: boot-time diagnostics only. Serial.begin(115200); - Serial.println("[boot] USB logger up, real comms on Serial2"); + Serial.println("[boot] USB logger up (Serial2 is the real comms)"); - // Bump the UART2 RX buffer WAY above the 256-byte default so we - // can absorb a full PLAY payload (up to a few tens of KB) without - // losing bytes if the host floods us. + // UART2 to the Pi, with a large RX buffer so AUDIO_DOWN bursts + // never overflow while we're busy pushing to I2S. HW_COMM.setRxBufferSize(16 * 1024); - HW_COMM.begin(HW_UART_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN); + HW_COMM.begin(HW_SERIAL_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN); + HW_COMM.setTimeout(2000); delay(50); - if (!audioBegin()) { - sendErr("I2S init failed"); + eyes.begin(); + + if (!audio.begin()) { + logLine("audio: I2S init FAILED"); Serial.println("[boot] I2S init FAILED"); } else { - sendLog("I2S ready"); + logLine("audio: I2S ready (mic + speaker)"); Serial.println("[boot] I2S ready"); } - HW_COMM.println("READY"); - Serial.println("[boot] READY sent on Serial2"); + decoder.onFrame(handleFrame); + + lastHeartbeatMs = millis(); + logLine("robot-hardware ready"); + Serial.println("[boot] robot-hardware ready on Serial2"); } void loop() { + // 1. Drain whatever the Pi has sent since the last tick and let + // the frame decoder emit complete frames to handleFrame(). while (HW_COMM.available() > 0) { - const int c = HW_COMM.read(); - if (c < 0) break; - if (c == '\r') continue; - if (c == '\n') { - g_line[g_lineLen] = 0; - if (g_lineLen > 0) handleLine(g_line); - g_lineLen = 0; - continue; - } - if (g_lineLen < sizeof(g_line) - 1) { - g_line[g_lineLen++] = static_cast(c); - } else { - g_lineLen = 0; - sendErr("line overflow"); + const int b = HW_COMM.read(); + if (b < 0) break; + decoder.feed(static_cast(b)); + } + + // 2. Heartbeat watchdog — slip into sleepy idle if the Pi goes + // away. This also mutes the speaker DMA so an old tail + // doesn't loop forever on loss of heartbeat. + const uint32_t now = millis(); + if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) { + idleMode = true; + eyes.show(Emotion::SLEEPY); + audio.flushSpeaker(); + } + + // 3. Continuous mic streaming. We skip this while in idle mode + // to keep the UART and the Pi CPU quiet when nobody is home. + if (!idleMode) { + const size_t bytes = audio.readMicChunk(micBuffer, sizeof(micBuffer)); + if (bytes > 0) { + FrameEncoder::writeTo(HW_COMM, MsgType::AUDIO_UP, + micBuffer, static_cast(bytes)); } } } + +// --------------------------------------------------------------- +// Frame dispatcher +// --------------------------------------------------------------- + +static void handleFrame(const Frame& frame, void* /*userData*/) { + lastHeartbeatMs = millis(); + if (idleMode) { + idleMode = false; + } + + switch (frame.type) { + case MsgType::DISPLAY_EMOTION: { + if (frame.length < 1) { + logLine("DISPLAY_EMOTION: empty payload"); + return; + } + const uint8_t code = frame.payload[0]; + if (code >= static_cast(Emotion::COUNT)) { + logLine("DISPLAY_EMOTION: out-of-range code"); + return; + } + eyes.show(static_cast(code)); + + const uint8_t ackPayload[1] = {code}; + FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1); + return; + } + + case MsgType::DISPLAY_CLEAR: { + eyes.clear(); + FrameEncoder::writeTo(HW_COMM, MsgType::ACK); + return; + } + + case MsgType::AUDIO_DOWN: { + // Raw PCM S16 mono 16 kHz — push it straight to the speaker + // DMA. i2s_write blocks until it has room; at 20 ms chunks + // the DMA drains fast enough that this stays responsive. + if (frame.length > 0) { + audio.writeSpeakerChunk(frame.payload, frame.length); + } + return; + } + + case MsgType::PING: { + FrameEncoder::writeTo(HW_COMM, MsgType::PONG, + frame.payload, frame.length); + return; + } + + case MsgType::STATUS: { + // Heartbeat handled at the top of the dispatcher. + return; + } + + // TODO: SERVO_CMD / LED_CMD / SENSOR_DATA + default: + logLine("unknown frame type"); + return; + } +} + +static void logLine(const char* line) { + const size_t len = strnlen(line, MAX_PAYLOAD_SIZE); + FrameEncoder::writeTo(HW_COMM, MsgType::LOG, + reinterpret_cast(line), + static_cast(len)); +}