audio ok mais pas top

2026-04-09 03:14:43 +02:00 · 2026-04-09 03:14:43 +02:00 · e941ad1b14
commit e941ad1b14
parent 28d5bd44e0
4 changed files with 204 additions and 282 deletions
--- a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts
+++ b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts
@ -104,8 +104,12 @@ export class RobotGateway implements OnGatewayConnection, OnGatewayDisconnect, I
  @SubscribeMessage('wake_word_detected')
  async handleWakeWord(@ConnectedSocket() client: AuthenticatedSocket) {
    this.logger.log(`Wake word detected on device ${client.data.deviceId}`);
-    client.emit('status', { state: 'listening' as RobotState });
+    // IMPORTANT: open the STT stream FIRST, then tell the client we're
+    // listening. The client flushes its buffered audio as soon as it
+    // sees `listening`, so if Deepgram isn't open yet those chunks are
+    // silently dropped with a "No active STT stream" warning.
    await this.conversationPort.startListening(client.data.deviceId);
+    client.emit('status', { state: 'listening' as RobotState });
  }

  @SubscribeMessage('audio_chunk')
--- a/apps/backend/src/core/services/conversation.service.ts
+++ b/apps/backend/src/core/services/conversation.service.ts
@ -13,6 +13,13 @@ interface ActiveSession {
  finalTranscription: string;
  interimTranscription: string;
  sttStream: ISTTStream | null;
+  /**
+   * Audio chunks that arrived between session creation and the STT
+   * stream actually being open. Drained as soon as `sttStream` is
+   * assigned so we never lose the first few hundred milliseconds of
+   * speech.
+   */
+  pendingChunks: Buffer[];
 }

 const SYSTEM_PROMPT = `Tu es Ti-Pote, un petit robot de bureau animatronique, chaleureux et serviable.
@ -84,6 +91,7 @@ export class ConversationService implements IConversationPort {
      finalTranscription: '',
      interimTranscription: '',
      sttStream: null,
+      pendingChunks: [],
    };

    this.activeSessions.set(deviceId, session);
@ -102,15 +110,32 @@ export class ConversationService implements IConversationPort {
    });

    session.sttStream = sttStream;
+
+    // Flush anything that arrived while Deepgram was spinning up.
+    if (session.pendingChunks.length > 0) {
+      this.logger.debug(
+        `Flushing ${session.pendingChunks.length} buffered chunks for ${deviceId}`,
+      );
+      for (const chunk of session.pendingChunks) {
+        sttStream.sendAudio(chunk);
+      }
+      session.pendingChunks = [];
+    }
  }

  processAudioChunk(deviceId: string, chunk: Buffer, sampleRate: number): void {
    const session = this.activeSessions.get(deviceId);
-    if (!session?.sttStream) {
-      this.logger.warn(`No active STT stream for device ${deviceId}, ignoring audio chunk`);
+    if (!session) {
+      // No session at all → user sent audio without wake_word_detected.
+      // Safe to ignore.
+      return;
+    }
+    if (!session.sttStream) {
+      // Session exists but Deepgram is still opening. Buffer the chunk
+      // so it gets replayed as soon as the stream is ready.
+      session.pendingChunks.push(chunk);
      return;
    }
-
    session.sttStream.sendAudio(chunk);
  }

--- a/apps/robot-client/src/services/orchestrator.service.ts
+++ b/apps/robot-client/src/services/orchestrator.service.ts
@ -141,14 +141,26 @@ export class OrchestratorService extends EventEmitter {

  /**
   * Basic Voice Activity Detection: check if audio chunk contains significant signal.
-   * Uses RMS (root mean square) amplitude threshold.
+   *
+   * Uses AC-RMS — the mean is subtracted from each sample before squaring,
+   * so any DC offset from the microphone (the INMP441 has one in the hundreds)
+   * doesn't artificially inflate the energy and prevent silence detection.
   */
-  private isAudioSignificant(chunk: Buffer, threshold = 200): boolean {
-    let sumSquares = 0;
+  private isAudioSignificant(chunk: Buffer, threshold = 300): boolean {
    const samples = chunk.length / 2; // 16-bit = 2 bytes per sample
+    if (samples === 0) return false;

+    // First pass: DC mean of the chunk.
+    let sum = 0;
    for (let i = 0; i < chunk.length - 1; i += 2) {
-      const sample = chunk.readInt16LE(i);
+      sum += chunk.readInt16LE(i);
+    }
+    const mean = sum / samples;
+
+    // Second pass: variance (= AC power) around the mean.
+    let sumSquares = 0;
+    for (let i = 0; i < chunk.length - 1; i += 2) {
+      const sample = chunk.readInt16LE(i) - mean;
      sumSquares += sample * sample;
    }

--- a/apps/robot-hardware/src/main.cpp
+++ b/apps/robot-hardware/src/main.cpp
@ -1,310 +1,191 @@
-// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32)
+// Ti-Pote — Robot Hardware firmware (ESP32-WROOM-32)
 //
-// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end.
-// The command stream lives on Serial2 (hardware UART2, pins RX=27
-// TX=13) which is wired to the Raspberry Pi's UART0 (/dev/serial0).
-// The USB Serial port is kept only for boot-time diagnostics — all
-// the real traffic goes over the UART to the Pi.
+// Binary-frame protocol over Serial2 (UART2 remapped to RX=27 TX=13)
+// wired to the Pi's /dev/serial0. This is the "production" firmware
+// that the robot-client talks to. It continuously streams mic audio
+// as AUDIO_UP frames and plays back AUDIO_DOWN frames on the speaker.
 //
-// On the host side, the same two scripts we used with the USB link
-// work unchanged — just pass `ESP_PORT=/dev/serial0`:
+// Pipeline overview:
 //
-//   ESP_PORT=/dev/serial0 pnpm esp:record out.wav 3000
-//   ESP_PORT=/dev/serial0 pnpm esp:play  out.wav
+//   ┌──────────────┐       AUDIO_UP         ┌──────────────┐
+//   │   INMP441    │ ────► (S16 mono PCM) ─►│   Pi client  │
+//   │   (mic)      │                        │  - wake word │
+//   └──────────────┘                        │  - STT cloud │
+//                                            │  - TTS cloud │
+//   ┌──────────────┐       AUDIO_DOWN        │              │
+//   │  MAX98357A   │ ◄──── (S16 mono PCM) ◄──│              │
+//   │  (speaker)   │                        └──────────────┘
+//   └──────────────┘
 //
-// Protocol (same as before, 921600 baud, line-based for commands,
-// raw bytes for audio payload):
+// Commands also carried over the same link:
+//   PING / PONG              — latency probe
+//   STATUS                   — Pi heartbeat (keeps us out of idle mode)
+//   DISPLAY_EMOTION          — OLED eyes
+//   LOG                      — firmware → host human-readable logs
 //
-//   host → esp32
-//     "PING\n"              ping
-//     "REC <ms>\n"           start recording for <ms> milliseconds
-//     "PLAY <bytes>\n"       next <bytes> bytes on the wire are raw
-//                            S16 LE mono 16 kHz PCM, play them
-//
-//   esp32 → host
-//     "READY\n"              once at boot
-//     "PONG\n"               reply to PING
-//     "LOG <text>\n"         human-readable log line
-//     "ERR <text>\n"         error message
-//     "BEGIN <bytes>\n"      start of a REC response
-//     "<raw bytes>"          raw PCM (S16 LE mono 16 kHz)
-//     "END\n"                end of a REC response
-//     "OK\n"                 command completed
-//
-// Wiring (shared I2S bus on I2S_NUM_0):
-//   BCLK  = GPIO 32   (mic SCK + speaker BCLK)
-//   LRCLK = GPIO 33   (mic WS  + speaker LRC)
-//   MIC   = GPIO 34   (INMP441 SD → ESP32 data-in, input-only pin)
-//   SPK   = GPIO 22   (ESP32 data-out → MAX98357A DIN)
+// USB Serial is kept only as a boot logger (`pio device monitor -b 115200`).

 #include <Arduino.h>
-#include <driver/i2s.h>
-#include <string.h>
+#include "Protocol.h"
+#include "Eyes.h"
+#include "Audio.h"

-// ──────────────────────────────────────────────────────────
-// Comms config — UART2 to the Raspberry Pi
-// ──────────────────────────────────────────────────────────
+#ifndef HW_SERIAL_BAUD
+#define HW_SERIAL_BAUD 921600
+#endif

-// Hardware UART2 remapped to pins that don't clash with anything
-// else on the devkit. TX = GPIO 13, RX = GPIO 27.
+#ifndef HW_HEARTBEAT_TIMEOUT_MS
+#define HW_HEARTBEAT_TIMEOUT_MS 5000
+#endif
+
+// UART2 pin remap. Default UART2 pins (16/17) are already taken by
+// the OLED, so we move Serial2 onto GPIO 27 (RX) and GPIO 13 (TX).
 static constexpr int HW_UART_RX_PIN = 27;
 static constexpr int HW_UART_TX_PIN = 13;
-static constexpr long HW_UART_BAUD  = 921600;

-// HW_COMM is the Stream that carries the command/audio protocol.
-// Changing this single #define lets us swap between USB (Serial)
-// and the Pi-facing UART (Serial2).
+// How many bytes of mic audio we stage per loop tick before emitting
+// an AUDIO_UP frame. 640 B = 20 ms @ 16 kHz S16 mono.
+static constexpr size_t AUDIO_UP_CHUNK_BYTES = 640;
+
 #define HW_COMM Serial2

-// ──────────────────────────────────────────────────────────
-// Audio config
-// ──────────────────────────────────────────────────────────
+using namespace tipote;

-static constexpr int SAMPLE_RATE      = 16000;
-static constexpr int PIN_BCLK         = 32;
-static constexpr int PIN_LRCLK        = 33;
-static constexpr int PIN_MIC_DIN      = 34;
-static constexpr int PIN_SPK_DOUT     = 22;
+static Eyes         eyes;
+static Audio        audio;
+static FrameDecoder decoder;

-static constexpr int DMA_COUNT        = 4;
-static constexpr int DMA_LEN          = 256;
+static uint32_t lastHeartbeatMs = 0;
+static bool     idleMode        = false;
+static uint8_t  micBuffer[AUDIO_UP_CHUNK_BYTES];

-// Staging buffers — keep them outside of functions so we don't eat
-// stack on every tick.
-static constexpr size_t OUT_S16_SAMPLES = 320;  // 20 ms of S16 mono
-static int32_t g_rawStereo[OUT_S16_SAMPLES * 2];
-static int16_t g_micMono  [OUT_S16_SAMPLES];
-static int32_t g_spkStereo[OUT_S16_SAMPLES * 2];
-static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2];  // 640 bytes of S16 mono
-
-// ──────────────────────────────────────────────────────────
-// Line buffer for incoming text commands.
-// ──────────────────────────────────────────────────────────
-
-static char     g_line[64];
-static size_t   g_lineLen = 0;
-
-static void sendLog(const char* msg) {
-    HW_COMM.print("LOG ");
-    HW_COMM.println(msg);
-}
-
-static void sendErr(const char* msg) {
-    HW_COMM.print("ERR ");
-    HW_COMM.println(msg);
-}
-
-// ──────────────────────────────────────────────────────────
-// I2S init — single port, full duplex, shared BCLK/WS.
-// ──────────────────────────────────────────────────────────
-
-static bool audioBegin() {
-    i2s_config_t cfg = {};
-    cfg.mode                 = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
-                                                       I2S_MODE_RX |
-                                                       I2S_MODE_TX);
-    cfg.sample_rate          = SAMPLE_RATE;
-    cfg.bits_per_sample      = I2S_BITS_PER_SAMPLE_32BIT;
-    cfg.channel_format       = I2S_CHANNEL_FMT_RIGHT_LEFT;
-    cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
-    cfg.intr_alloc_flags     = ESP_INTR_FLAG_LEVEL1;
-    cfg.dma_buf_count        = DMA_COUNT;
-    cfg.dma_buf_len          = DMA_LEN;
-    cfg.use_apll             = false;
-    cfg.tx_desc_auto_clear   = true;
-    cfg.fixed_mclk           = 0;
-
-    if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false;
-
-    i2s_pin_config_t pins = {};
-    pins.bck_io_num   = PIN_BCLK;
-    pins.ws_io_num    = PIN_LRCLK;
-    pins.data_out_num = PIN_SPK_DOUT;
-    pins.data_in_num  = PIN_MIC_DIN;
-    if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
-        i2s_driver_uninstall(I2S_NUM_0);
-        return false;
-    }
-    i2s_zero_dma_buffer(I2S_NUM_0);
-    return true;
-}
-
-// Convert one batch of stereo 32-bit mic samples to S16 mono by
-// taking the left slot and shifting the 24-bit-aligned data down.
-// Returns the number of S16 samples written into `out`.
-static size_t micReadMono(int16_t* out, size_t maxSamples) {
-    size_t wantPairs = maxSamples;
-    if (wantPairs > OUT_S16_SAMPLES) wantPairs = OUT_S16_SAMPLES;
-
-    size_t bytesRead = 0;
-    const esp_err_t err = i2s_read(
-        I2S_NUM_0,
-        g_rawStereo,
-        wantPairs * 2 * sizeof(int32_t),
-        &bytesRead,
-        portMAX_DELAY  // block — we're in a dedicated REC loop
-    );
-    if (err != ESP_OK || bytesRead == 0) return 0;
-
-    const size_t pairs = bytesRead / (2 * sizeof(int32_t));
-    for (size_t i = 0; i < pairs; ++i) {
-        int32_t L = g_rawStereo[2 * i];
-        int32_t s = L >> 14;
-        if (s >  INT16_MAX) s =  INT16_MAX;
-        if (s <  INT16_MIN) s =  INT16_MIN;
-        out[i] = static_cast<int16_t>(s);
-    }
-    return pairs;
-}
-
-// Write one batch of S16 mono PCM to the speaker by duplicating each
-// sample into both stereo slots and shifting into the high half of
-// the 32-bit word (what the MAX98357A expects on a shared bus).
-static void spkWriteMono(const int16_t* samples, size_t count) {
-    if (count == 0) return;
-    if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES;
-    for (size_t i = 0; i < count; ++i) {
-        const int32_t s32 = static_cast<int32_t>(samples[i]) << 16;
-        g_spkStereo[2 * i]     = s32;
-        g_spkStereo[2 * i + 1] = s32;
-    }
-    size_t bytesWritten = 0;
-    i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t),
-              &bytesWritten, portMAX_DELAY);
-}
-
-// ──────────────────────────────────────────────────────────
-// Command handlers
-// ──────────────────────────────────────────────────────────
-
-static void handleRec(uint32_t durationMs) {
-    const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000;
-    const uint32_t totalBytes   = totalSamples * sizeof(int16_t);
-
-    HW_COMM.print("BEGIN ");
-    HW_COMM.println(totalBytes);
-
-    // Flush whatever old noise is in the mic DMA first.
-    i2s_zero_dma_buffer(I2S_NUM_0);
-
-    uint32_t sent = 0;
-    while (sent < totalSamples) {
-        size_t want = totalSamples - sent;
-        if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES;
-        const size_t got = micReadMono(g_micMono, want);
-        if (got == 0) continue;
-        HW_COMM.write(reinterpret_cast<const uint8_t*>(g_micMono),
-                      got * sizeof(int16_t));
-        sent += got;
-    }
-
-    HW_COMM.println();
-    HW_COMM.println("END");
-}
-
-static void handlePlay(uint32_t totalBytes) {
-    // Drain any pending crap from the speaker DMA so we don't start
-    // with a pop.
-    i2s_zero_dma_buffer(I2S_NUM_0);
-
-    // Give readBytes a generous timeout so a jittery host doesn't
-    // abort us mid-playback.
-    HW_COMM.setTimeout(2000);
-
-    uint32_t remaining = totalBytes;
-    while (remaining > 0) {
-        size_t want = remaining;
-        if (want > sizeof(g_spkInBuf)) want = sizeof(g_spkInBuf);
-        // Force an even count so we always have complete S16 samples.
-        if (want & 1) want -= 1;
-        if (want == 0) want = 2;
-
-        const size_t got = HW_COMM.readBytes(g_spkInBuf, want);
-        if (got == 0) {
-            sendErr("PLAY read timeout");
-            return;
-        }
-        const size_t samples = got / sizeof(int16_t);
-        spkWriteMono(reinterpret_cast<const int16_t*>(g_spkInBuf), samples);
-        remaining -= got;
-    }
-
-    // Let the last frames actually reach the speaker, then clear.
-    delay(50);
-    i2s_zero_dma_buffer(I2S_NUM_0);
-    HW_COMM.println("OK");
-}
-
-static void handleLine(const char* line) {
-    if (strcmp(line, "PING") == 0) {
-        HW_COMM.println("PONG");
-        return;
-    }
-    if (strncmp(line, "REC ", 4) == 0) {
-        const long ms = atol(line + 4);
-        if (ms <= 0 || ms > 60000) { sendErr("REC bad duration"); return; }
-        handleRec(static_cast<uint32_t>(ms));
-        return;
-    }
-    if (strncmp(line, "PLAY ", 5) == 0) {
-        const long bytes = atol(line + 5);
-        if (bytes <= 0 || bytes > 16 * 1024 * 1024) {
-            sendErr("PLAY bad size");
-            return;
-        }
-        handlePlay(static_cast<uint32_t>(bytes));
-        return;
-    }
-    sendErr("unknown command");
-}
-
-// ──────────────────────────────────────────────────────────
-// Arduino entry points
-// ──────────────────────────────────────────────────────────
+static void handleFrame(const Frame& frame, void* userData);
+static void logLine(const char* line);

 void setup() {
-    // USB Serial is kept as a *boot-time logger only*. It gives
-    // you something to look at via `pio device monitor` when the
-    // board is plugged into a laptop, without interfering with
-    // the Pi link on Serial2.
+    // USB Serial: boot-time diagnostics only.
    Serial.begin(115200);
-    Serial.println("[boot] USB logger up, real comms on Serial2");
+    Serial.println("[boot] USB logger up (Serial2 is the real comms)");

-    // Bump the UART2 RX buffer WAY above the 256-byte default so we
-    // can absorb a full PLAY payload (up to a few tens of KB) without
-    // losing bytes if the host floods us.
+    // UART2 to the Pi, with a large RX buffer so AUDIO_DOWN bursts
+    // never overflow while we're busy pushing to I2S.
    HW_COMM.setRxBufferSize(16 * 1024);
-    HW_COMM.begin(HW_UART_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN);
+    HW_COMM.begin(HW_SERIAL_BAUD, SERIAL_8N1, HW_UART_RX_PIN, HW_UART_TX_PIN);
+    HW_COMM.setTimeout(2000);
    delay(50);

-    if (!audioBegin()) {
-        sendErr("I2S init failed");
+    eyes.begin();
+
+    if (!audio.begin()) {
+        logLine("audio: I2S init FAILED");
        Serial.println("[boot] I2S init FAILED");
    } else {
-        sendLog("I2S ready");
+        logLine("audio: I2S ready (mic + speaker)");
        Serial.println("[boot] I2S ready");
    }

-    HW_COMM.println("READY");
-    Serial.println("[boot] READY sent on Serial2");
+    decoder.onFrame(handleFrame);
+
+    lastHeartbeatMs = millis();
+    logLine("robot-hardware ready");
+    Serial.println("[boot] robot-hardware ready on Serial2");
 }

 void loop() {
+    // 1. Drain whatever the Pi has sent since the last tick and let
+    //    the frame decoder emit complete frames to handleFrame().
    while (HW_COMM.available() > 0) {
-        const int c = HW_COMM.read();
-        if (c < 0) break;
-        if (c == '\r') continue;
-        if (c == '\n') {
-            g_line[g_lineLen] = 0;
-            if (g_lineLen > 0) handleLine(g_line);
-            g_lineLen = 0;
-            continue;
-        }
-        if (g_lineLen < sizeof(g_line) - 1) {
-            g_line[g_lineLen++] = static_cast<char>(c);
-        } else {
-            g_lineLen = 0;
-            sendErr("line overflow");
+        const int b = HW_COMM.read();
+        if (b < 0) break;
+        decoder.feed(static_cast<uint8_t>(b));
+    }
+
+    // 2. Heartbeat watchdog — slip into sleepy idle if the Pi goes
+    //    away. This also mutes the speaker DMA so an old tail
+    //    doesn't loop forever on loss of heartbeat.
+    const uint32_t now = millis();
+    if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) {
+        idleMode = true;
+        eyes.show(Emotion::SLEEPY);
+        audio.flushSpeaker();
+    }
+
+    // 3. Continuous mic streaming. We skip this while in idle mode
+    //    to keep the UART and the Pi CPU quiet when nobody is home.
+    if (!idleMode) {
+        const size_t bytes = audio.readMicChunk(micBuffer, sizeof(micBuffer));
+        if (bytes > 0) {
+            FrameEncoder::writeTo(HW_COMM, MsgType::AUDIO_UP,
+                                  micBuffer, static_cast<uint16_t>(bytes));
        }
    }
 }
+
+// ---------------------------------------------------------------
+// Frame dispatcher
+// ---------------------------------------------------------------
+
+static void handleFrame(const Frame& frame, void* /*userData*/) {
+    lastHeartbeatMs = millis();
+    if (idleMode) {
+        idleMode = false;
+    }
+
+    switch (frame.type) {
+        case MsgType::DISPLAY_EMOTION: {
+            if (frame.length < 1) {
+                logLine("DISPLAY_EMOTION: empty payload");
+                return;
+            }
+            const uint8_t code = frame.payload[0];
+            if (code >= static_cast<uint8_t>(Emotion::COUNT)) {
+                logLine("DISPLAY_EMOTION: out-of-range code");
+                return;
+            }
+            eyes.show(static_cast<Emotion>(code));
+
+            const uint8_t ackPayload[1] = {code};
+            FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1);
+            return;
+        }
+
+        case MsgType::DISPLAY_CLEAR: {
+            eyes.clear();
+            FrameEncoder::writeTo(HW_COMM, MsgType::ACK);
+            return;
+        }
+
+        case MsgType::AUDIO_DOWN: {
+            // Raw PCM S16 mono 16 kHz — push it straight to the speaker
+            // DMA. i2s_write blocks until it has room; at 20 ms chunks
+            // the DMA drains fast enough that this stays responsive.
+            if (frame.length > 0) {
+                audio.writeSpeakerChunk(frame.payload, frame.length);
+            }
+            return;
+        }
+
+        case MsgType::PING: {
+            FrameEncoder::writeTo(HW_COMM, MsgType::PONG,
+                                  frame.payload, frame.length);
+            return;
+        }
+
+        case MsgType::STATUS: {
+            // Heartbeat handled at the top of the dispatcher.
+            return;
+        }
+
+        // TODO: SERVO_CMD / LED_CMD / SENSOR_DATA
+        default:
+            logLine("unknown frame type");
+            return;
+    }
+}
+
+static void logLine(const char* line) {
+    const size_t len = strnlen(line, MAX_PAYLOAD_SIZE);
+    FrameEncoder::writeTo(HW_COMM, MsgType::LOG,
+                          reinterpret_cast<const uint8_t*>(line),
+                          static_cast<uint16_t>(len));
+}