From c19d9a7cf40f729b61ffb91c4412d785ffe910be Mon Sep 17 00:00:00 2001
From: ordinarthur <arthur.barre@forgeron3.fr>
Date: Thu, 9 Apr 2026 02:47:53 +0200
Subject: [PATCH] ok script esp

---
 apps/robot-client/package.json                |   6 +-
 apps/robot-client/scripts/audio-beep.ts       |  99 +++++
 apps/robot-client/scripts/audio-loopback.ts   | 171 ++++++++
 apps/robot-client/scripts/wake_word.py        | 408 +++++++++++-------
 .../src/config/hardware.config.ts             |  23 +-
 .../src/hardware/hardware.service.ts          |  48 +++
 apps/robot-client/src/main.ts                 |  36 +-
 .../src/services/audio.service.ts             | 262 +++++++++--
 apps/robot-client/src/services/index.ts       |   8 +-
 .../src/services/wake-word.service.ts         | 149 +++++--
 apps/robot-hardware/lib/Audio/library.json    |   7 +
 apps/robot-hardware/lib/Audio/src/Audio.cpp   | 151 +++++++
 apps/robot-hardware/lib/Audio/src/Audio.h     |  84 ++++
 apps/robot-hardware/platformio.ini            |   5 +
 apps/robot-hardware/scripts/esp-play.ts       | 219 ++++++++++
 apps/robot-hardware/scripts/esp-record.ts     | 190 ++++++++
 apps/robot-hardware/src/main.cpp              | 384 +++++++++++------
 17 files changed, 1860 insertions(+), 390 deletions(-)
 create mode 100644 apps/robot-client/scripts/audio-beep.ts
 create mode 100644 apps/robot-client/scripts/audio-loopback.ts
 create mode 100644 apps/robot-hardware/lib/Audio/library.json
 create mode 100644 apps/robot-hardware/lib/Audio/src/Audio.cpp
 create mode 100644 apps/robot-hardware/lib/Audio/src/Audio.h
 create mode 100644 apps/robot-hardware/scripts/esp-play.ts
 create mode 100644 apps/robot-hardware/scripts/esp-record.ts

diff --git a/apps/robot-client/package.json b/apps/robot-client/package.json
index 19d467c..2f064c1 100644
--- a/apps/robot-client/package.json
+++ b/apps/robot-client/package.json
@@ -12,7 +12,11 @@
     "format": "prettier --write \"src/**/*.ts\"",
     "test": "vitest run",
     "test:watch": "vitest",
-    "hw:demo": "tsx scripts/hardware-demo.ts"
+    "hw:demo": "pnpm exec tsx scripts/hardware-demo.ts",
+    "audio:loopback": "pnpm exec tsx scripts/audio-loopback.ts",
+    "audio:beep": "pnpm exec tsx scripts/audio-beep.ts",
+    "esp:record": "pnpm exec tsx ../robot-hardware/scripts/esp-record.ts",
+    "esp:play": "pnpm exec tsx ../robot-hardware/scripts/esp-play.ts"
   },
   "dependencies": {
     "socket.io-client": "^4.8.3",
diff --git a/apps/robot-client/scripts/audio-beep.ts b/apps/robot-client/scripts/audio-beep.ts
new file mode 100644
index 0000000..c608dd2
--- /dev/null
+++ b/apps/robot-client/scripts/audio-beep.ts
@@ -0,0 +1,99 @@
+/**
+ * Ti-Pote — Pure tone speaker test.
+ *
+ * Generates a 440 Hz sine wave at ~70% of full scale and streams it
+ * to the ESP32 speaker via AUDIO_DOWN frames, then a second beep at
+ * 880 Hz. Completely independent of the microphone — if this does
+ * not produce audible sound, the problem is downstream of the ESP32
+ * on the speaker path (MAX98357A wiring, SD pin, VIN, speaker leads).
+ *
+ * Run with:
+ *   HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:beep
+ *
+ * Optional env:
+ *   BEEP_MS     — length of each beep in ms (default 1500)
+ *   BEEP_FREQ   — primary frequency in Hz (default 440)
+ *   BEEP_AMP    — amplitude 0.0..1.0 (default 0.7)
+ */
+
+import { HardwareService, Emotion } from '../src/hardware/index.js';
+import { Esp32AudioService } from '../src/services/audio.service.js';
+
+const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0'; // serial device passed to HardwareService
+const baudRate = parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10);
+const beepMs = parseInt(process.env.BEEP_MS ?? '1500', 10); // length of each beep, in ms
+const beepFreq = parseInt(process.env.BEEP_FREQ ?? '440', 10); // first tone in Hz; the second tone is 2×
+const beepAmp = parseFloat(process.env.BEEP_AMP ?? '0.7'); // 0..1, clamped inside generateSine
+
+const SAMPLE_RATE = 16000; // Hz; used for tone synthesis and the audio service config
+
+/** Builds `durationMs` of 16-bit LE mono sine at `freqHz`; `amplitude` is clamped to 0..1. */
+function generateSine(freqHz: number, durationMs: number, amplitude: number): Buffer {
+  const sampleCount = Math.floor((SAMPLE_RATE * durationMs) / 1000);
+  const buf = Buffer.alloc(sampleCount * 2);
+  const amp = Math.max(0, Math.min(1, amplitude)) * 32767;
+  const twoPiF = (2 * Math.PI * freqHz) / SAMPLE_RATE;
+  // 5 ms linear attack/release so the speaker doesn't click.
+  const rampSamples = Math.floor((SAMPLE_RATE * 5) / 1000);
+  for (let i = 0; i < sampleCount; i++) {
+    // Lower of the two ramps: a tone shorter than 2 × ramp still fades out at the end.
+    const env = Math.min(1, i / rampSamples, (sampleCount - i) / rampSamples);
+    const s = Math.round(Math.sin(i * twoPiF) * amp * env);
+    buf.writeInt16LE(Math.max(-32768, Math.min(32767, s)), i * 2);
+  }
+  return buf;
+}
+
+async function sleep(ms: number): Promise<void> {
+  await new Promise<void>((done) => setTimeout(done, ms)); // settles once `ms` milliseconds elapse
+}
+
+/** Plays two test tones on the ESP32 speaker; once connected, the port is closed on every exit path. */
+async function main(): Promise<void> {
+  const link = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 });
+  link.on('log', (line) => console.log(`[firmware] ${line}`));
+  link.on('error', (err) => console.error(`[firmware error] ${err.message}`));
+
+  console.log(`→ opening ${path} @ ${baudRate} baud`);
+  await link.connect();
+
+  try {
+    const rtt = await link.ping(Buffer.from('beep'));
+    console.log(`→ ping round-trip: ${rtt.toFixed(1)} ms`);
+
+    const speaker = new Esp32AudioService(
+      {
+        backend: 'esp32',
+        captureDevice: 'default',
+        playbackDevice: 'default',
+        sampleRate: SAMPLE_RATE,
+        bitDepth: 16,
+        channels: 1,
+        chunkDurationMs: 20,
+      },
+      link,
+    );
+
+    link.sendEmotion(Emotion.HAPPY);
+
+    console.log(`🔊 Beep 1: ${beepFreq} Hz · ${beepMs} ms · amp=${beepAmp}`);
+    await speaker.play(generateSine(beepFreq, beepMs, beepAmp));
+
+    await sleep(400);
+
+    console.log(`🔊 Beep 2: ${beepFreq * 2} Hz · ${beepMs} ms · amp=${beepAmp}`);
+    await speaker.play(generateSine(beepFreq * 2, beepMs, beepAmp));
+
+    console.log('✅ done — did you hear two beeps?');
+  } finally {
+    // Runs on success and failure alike: reset the face, pause 200 ms, close the port.
+    link.sendEmotion(Emotion.NEUTRAL);
+    await sleep(200);
+    await link.disconnect();
+  }
+}
+
+main().catch((reason) => {
+  console.error('beep failed:', reason);
+  process.exit(1); // non-zero status so callers can see the test failed
+});
diff --git a/apps/robot-client/scripts/audio-loopback.ts b/apps/robot-client/scripts/audio-loopback.ts
new file mode 100644
index 0000000..816bb66
--- /dev/null
+++ b/apps/robot-client/scripts/audio-loopback.ts
@@ -0,0 +1,171 @@
+/**
+ * Ti-Pote — End-to-end audio loopback test.
+ *
+ * What it proves: the whole Pi ↔ ESP32 ↔ mic/speaker chain works,
+ * without bringing the cloud/wake-word/orchestrator into the picture.
+ *
+ * What it does:
+ *   1. Opens the serial link to the ESP32.
+ *   2. Captures `CAPTURE_MS` (default 5000) of mic audio via
+ *      AUDIO_UP frames into a single in-memory buffer.
+ *   3. Pauses briefly.
+ *   4. Streams that buffer back to the ESP32 as AUDIO_DOWN frames
+ *      and waits for the speaker to finish playing.
+ *
+ * Expected result: you say "allô allô" during step 2 and hear your
+ * own voice played back on the robot's speaker a moment later.
+ *
+ * Run with:
+ *   HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:loopback
+ *
+ * Optional env:
+ *   CAPTURE_MS (capture ms, default 5000) · SKIP_PLAYBACK (capture only) · DEBUG (log
+ *   first frames, dump raw PCM to DUMP_PATH) · HARDWARE_SERIAL_PORT / HARDWARE_SERIAL_BAUD
+ */
+
+import { writeFileSync } from 'node:fs';
+import { HardwareService, Emotion } from '../src/hardware/index.js';
+import { Esp32AudioService } from '../src/services/audio.service.js';
+
+const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0'; // serial device passed to HardwareService
+const baudRate = parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10);
+const captureMs = parseInt(process.env.CAPTURE_MS ?? '5000', 10); // recording window, in ms
+const debug = !!process.env.DEBUG; // any non-empty value enables frame logging and the raw dump
+const dumpPath = process.env.DUMP_PATH ?? '/tmp/tipote-capture.raw'; // target of the debug dump
+const skipPlayback = !!process.env.SKIP_PLAYBACK; // any non-empty value stops after the capture
+
+const SAMPLE_RATE = 16000; // Hz
+const BYTES_PER_SAMPLE = 2; // 16-bit samples
+
+let debugFramesSeen = 0; // frames already printed by the debug listener (it stops at 3)
+
+async function sleep(ms: number): Promise<void> {
+  await new Promise<void>((done) => setTimeout(done, ms)); // settles once `ms` milliseconds elapse
+}
+
+/** Records `captureMs` of ESP32 mic audio, then plays it back unless SKIP_PLAYBACK is set. */
+async function main(): Promise<void> {
+  const hw = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 });
+  hw.on('log', (line) => console.log(`[firmware] ${line}`));
+  hw.on('error', (err) => console.error(`[firmware error] ${err.message}`));
+  if (debug) {
+    hw.on('audio_up', (chunk) => {
+      // Print first 8 int16 samples of the first few frames
+      // so we can see whether the wire carries zeros or real data.
+      if (debugFramesSeen < 3) {
+        const head: number[] = [];
+        for (let i = 0; i + 1 < Math.min(chunk.length, 16); i += 2) { // whole samples only
+          head.push(chunk.readInt16LE(i));
+        }
+        console.log(`[debug] frame ${debugFramesSeen} len=${chunk.length} head=${head.join(',')}`);
+        debugFramesSeen++;
+      }
+    });
+  }
+
+  console.log(`→ opening ${path} @ ${baudRate} baud`);
+  await hw.connect();
+
+  try {
+    const rtt = await hw.ping(Buffer.from('loopback'));
+    console.log(`→ ping round-trip: ${rtt.toFixed(1)} ms`);
+
+    hw.sendEmotion(Emotion.SURPRISED);
+
+    // ── 1. Capture ────────────────────────────────────────────────
+    const chunks: Buffer[] = [];
+
+    const collect = (chunk: Buffer): void => {
+      chunks.push(chunk);
+    };
+    hw.on('audio_up', collect);
+
+    console.log(`🎙️  Recording ${captureMs} ms — say something!`);
+    await sleep(captureMs);
+
+    hw.off('audio_up', collect);
+    const capture = Buffer.concat(chunks);
+    const samples = Math.floor(capture.length / BYTES_PER_SAMPLE);
+    const durationMs = (samples / SAMPLE_RATE) * 1000;
+    console.log(
+      `✅ captured ${capture.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)` +
+        ` across ${chunks.length} frames`,
+    );
+
+    if (capture.length === 0) {
+      console.error(
+        '❌ no audio received from the ESP32. Check the I2S wiring ' +
+          '(BCLK=32, LRCLK=33, DIN=34) and that the firmware got past `audio: I2S ready`.',
+      );
+      return;
+    }
+
+    // Quick RMS sanity check so we catch "mic muted" / "disconnected" early.
+    const rms = computeRms(capture);
+    console.log(`   RMS level: ${rms.toFixed(0)} (silence ≈ 10, speech ≳ 500)`);
+
+    if (debug) {
+      // Dump the raw capture so we can replay it offline:
+      //   aplay -r 16000 -f S16_LE -c 1 /tmp/tipote-capture.raw
+      writeFileSync(dumpPath, capture);
+      console.log(`[debug] raw capture written to ${dumpPath} (${capture.length} bytes)`);
+
+      const allZero = capture.every((b) => b === 0);
+      console.log(`[debug] capture.allZero=${allZero}`);
+
+      // Also print some distinct int16 values we saw, to spot patterns.
+      const seen = new Set<number>();
+      for (let i = 0; i < capture.length - 1 && seen.size < 10; i += 2) {
+        seen.add(capture.readInt16LE(i));
+      }
+      console.log(`[debug] first distinct samples: ${[...seen].join(',')}`);
+    }
+
+    if (skipPlayback) {
+      console.log('SKIP_PLAYBACK set — not sending AUDIO_DOWN');
+      return;
+    }
+
+    // ── 2. Playback ───────────────────────────────────────────────
+    await sleep(500);
+
+    const audio = new Esp32AudioService(
+      {
+        backend: 'esp32',
+        captureDevice: 'default',
+        playbackDevice: 'default',
+        sampleRate: SAMPLE_RATE,
+        bitDepth: 16,
+        channels: 1,
+        chunkDurationMs: 20,
+      },
+      hw,
+    );
+
+    hw.sendEmotion(Emotion.HAPPY);
+    console.log('🔊 Playing back on the ESP32 speaker...');
+    await audio.play(capture);
+    console.log('✅ playback done');
+  } finally {
+    // Also reached by the early returns above: reset the face, pause 200 ms, close the port.
+    hw.sendEmotion(Emotion.NEUTRAL);
+    await sleep(200);
+    await hw.disconnect();
+  }
+}
+
+/** RMS of the S16LE samples in `buf`; a trailing odd byte is ignored, no whole sample yields 0. */
+function computeRms(buf: Buffer): number {
+  const samples = Math.floor(buf.length / 2);
+  if (samples === 0) return 0;
+  let sumSquares = 0;
+  for (let i = 0; i < samples; i++) {
+    sumSquares += buf.readInt16LE(i * 2) ** 2;
+  }
+  return Math.sqrt(sumSquares / samples);
+}
+
+main().catch((reason) => {
+  console.error('loopback failed:', reason);
+  process.exit(1); // non-zero status so callers can see the test failed
+});
diff --git a/apps/robot-client/scripts/wake_word.py b/apps/robot-client/scripts/wake_word.py
index c91b589..bf0a74b 100755
--- a/apps/robot-client/scripts/wake_word.py
+++ b/apps/robot-client/scripts/wake_word.py
@@ -2,94 +2,175 @@
 """
 Ti-Pote Wake Word Detection Script.
 
-Runs OpenWakeWord model continuously, listening on the specified ALSA device.
-Prints "DETECTED" to stdout when the wake word is heard.
+Runs OpenWakeWord continuously and prints "DETECTED" to stdout when
+the wake word is heard.
 
-Supports PAUSE/RESUME commands on stdin to temporarily stop/start listening
-without reloading the model. When paused, the audio stream is closed so other
-processes (arecord) can use the device.
+Two input modes:
 
-Usage:
-    python3 wake_word.py --model hey_jarvis --threshold 0.5 --device default --sample-rate 16000
+1. --input alsa  (default, legacy)
+   Opens an ALSA capture device via PyAudio. PAUSE/RESUME/QUIT
+   commands are read from stdin.
 
-Requirements:
-    pip install openwakeword pyaudio numpy
+2. --input stdin
+   Reads raw S16 mono PCM audio from stdin (fd 0). This is used when
+   the Raspberry Pi is just an orchestrator and the microphone lives
+   on the ESP32 — the Node client forwards AUDIO_UP frames into this
+   script's stdin. Control commands are read from a separate file
+   descriptor specified by --control-fd (default: 3).
+
+Control commands (one per line, uppercase):
+   PAUSE   — stop emitting DETECTED events (audio keeps flowing so
+             we don't overflow the pipe, but predictions are ignored).
+   RESUME  — resume emitting and reset the model buffer.
+   RESET   — reset the model buffer without touching the pause flag.
+   QUIT    — exit cleanly.
+
+Usage (ALSA):
+    python3 wake_word.py --model hey_jarvis --device default
+
+Usage (stdin / ESP32 backend):
+    python3 wake_word.py --model hey_jarvis --input stdin --control-fd 3
 """
 
 import argparse
-import sys
 import os
 import signal
-import select
+import sys
 import threading
+import time
+
 import numpy as np
 
 
-def main():
-    parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection')
-    parser.add_argument('--model', type=str, default='hey_jarvis',
-                        help='Wake word model name (default: hey_jarvis as placeholder)')
-    parser.add_argument('--threshold', type=float, default=0.5,
-                        help='Detection threshold (0.0-1.0)')
-    parser.add_argument('--device', type=str, default='default',
-                        help='ALSA audio capture device')
-    parser.add_argument('--sample-rate', type=int, default=16000,
-                        help='Audio sample rate in Hz')
-    args = parser.parse_args()
+CHUNK_SAMPLES = 1280  # ≈ 80 ms @ 16 kHz (OpenWakeWord's preferred size)
 
+
+def load_model(model_name: str):
     try:
         from openwakeword.model import Model
     except ImportError:
-        print("ERROR: openwakeword not installed. Run: pip install openwakeword", file=sys.stderr)
+        print("ERROR: openwakeword not installed. Run: pip install openwakeword",
+              file=sys.stderr)
         sys.exit(1)
 
-    try:
-        import pyaudio
-    except ImportError:
-        print("ERROR: pyaudio not installed. Run: pip install pyaudio", file=sys.stderr)
-        sys.exit(1)
-
-    # ── Load the wake word model (one time only) ──
-
-    print(f"Loading wake word model: {args.model}...", file=sys.stderr)
-
     import openwakeword
-    pretrained_paths = openwakeword.get_pretrained_model_paths()
-    model_path = None
-    for p in pretrained_paths:
-        basename = os.path.basename(p)
-        if basename.startswith(args.model):
-            model_path = p
-            break
-
+    pretrained = openwakeword.get_pretrained_model_paths()
+    model_path = next(
+        (p for p in pretrained if os.path.basename(p).startswith(model_name)),
+        None,
+    )
     if model_path is None:
-        if os.path.isfile(args.model):
-            model_path = args.model
+        if os.path.isfile(model_name):
+            model_path = model_name
         else:
-            print(f"ERROR: model '{args.model}' not found in pretrained models", file=sys.stderr)
-            print(f"Available models:", file=sys.stderr)
-            for p in pretrained_paths:
+            print(f"ERROR: model '{model_name}' not found", file=sys.stderr)
+            for p in pretrained:
                 print(f"  - {os.path.basename(p)}", file=sys.stderr)
             sys.exit(1)
 
-    print(f"Resolved model path: {model_path}", file=sys.stderr)
-
+    print(f"Loading wake word model: {model_name}...", file=sys.stderr)
     try:
-        oww_model = Model(wakeword_model_paths=[model_path])
+        return Model(wakeword_model_paths=[model_path])
     except Exception as e:
-        print(f"ERROR loading model '{args.model}': {e}", file=sys.stderr)
+        print(f"ERROR loading model '{model_name}': {e}", file=sys.stderr)
         sys.exit(1)
 
-    print(f"Wake word model loaded: {args.model}", file=sys.stderr)
-    print(f"Threshold: {args.threshold}", file=sys.stderr)
-    print(f"Listening on device: {args.device}", file=sys.stderr)
 
-    # ── Initialize PyAudio ──
+class State:
+    """Flags shared by the audio loop and the control-command thread."""
+    def __init__(self):
+        self.lock = threading.Lock()  # taken by the control reader and the predict loop
+        self.running = True  # cleared to make the loops exit
+        self.paused = False  # True while DETECTED output is suppressed
+        self.reset_requested = False  # asks the predict loop to reset the model
+
+
+def start_control_reader(state: State, fd: int):
+    """Start a daemon thread applying PAUSE/RESUME/RESET/QUIT lines read from `fd`; EOF stops detection."""
+    try:
+        f = os.fdopen(fd, 'r', buffering=1)
+    except OSError as e:
+        print(f"ERROR opening control fd {fd}: {e}", file=sys.stderr)
+        return
+
+    def reader():
+        while state.running:
+            try:
+                line = f.readline()
+            except Exception:
+                break
+            if not line:
+                state.running = False  # EOF on the control channel ends detection
+                break
+            cmd = line.strip().upper()
+            with state.lock:
+                if cmd == 'PAUSE' and not state.paused:
+                    state.paused = True
+                    print("PAUSED", file=sys.stderr, flush=True)
+                elif cmd == 'RESUME' and state.paused:
+                    state.paused = False
+                    state.reset_requested = True
+                    print("RESUMED", file=sys.stderr, flush=True)
+                elif cmd == 'RESET':
+                    state.reset_requested = True
+                elif cmd == 'QUIT':
+                    state.running = False
+                    break
+
+    threading.Thread(target=reader, daemon=True).start()
+
+
+def run_predict_loop(oww_model, read_chunk, state: State, threshold: float):
+    """
+    Shared loop: pull a chunk from `read_chunk()`, feed the model and
+    print DETECTED when a score exceeds `threshold`. Chunks read while
+    paused are drained but never scored. Exits when `read_chunk()`
+    returns None or state.running is False.
+    """
+    print("READY", file=sys.stderr, flush=True)
+    try:
+        while state.running:
+            audio_data = read_chunk()
+            if audio_data is None:
+                # EOF / error; exit cleanly
+                break
+
+            # Sample the flags after the (possibly blocking) read so a
+            # RESUME/RESET that arrived meanwhile applies to this chunk.
+            with state.lock:
+                is_paused = state.paused
+                needs_reset = state.reset_requested
+                state.reset_requested = False
+            if needs_reset:
+                oww_model.reset()
+            if is_paused:
+                continue  # drained only; placeholder silence is not scored
+
+            oww_model.predict(np.frombuffer(audio_data, dtype=np.int16))
+            for _, score in oww_model.prediction_buffer.items():
+                if len(score) > 0 and score[-1] > threshold:
+                    print("DETECTED", flush=True)
+                    oww_model.reset()
+                    break
+    except KeyboardInterrupt:
+        pass
+
+
+# ─────────────────────────────────────────────────────────────────
+# ALSA input (legacy backend)
+# ─────────────────────────────────────────────────────────────────
+
+def run_alsa_mode(args, oww_model, state: State):
+    import re
+    try:
+        import pyaudio
+    except ImportError:
+        print("ERROR: pyaudio not installed. Run: pip install pyaudio",
+              file=sys.stderr)
+        sys.exit(1)
 
     pa = pyaudio.PyAudio()
 
-    # Find the device index
-    import re
     device_index = None
     if args.device != 'default':
         try:
@@ -97,14 +178,14 @@ def main():
             info = pa.get_device_info_by_index(idx)
             if info.get('maxInputChannels', 0) > 0:
                 device_index = idx
-                print(f"Using device by index: [{idx}] {info['name']}", file=sys.stderr)
+                print(f"Using device by index: [{idx}] {info['name']}",
+                      file=sys.stderr)
         except (ValueError, IOError):
             pass
 
         if device_index is None:
             hw_match = re.search(r'(\d+),(\d+)', args.device)
             hw_pattern = f"hw:{hw_match.group(1)},{hw_match.group(2)}" if hw_match else None
-
             for i in range(pa.get_device_count()):
                 info = pa.get_device_info_by_index(i)
                 if info.get('maxInputChannels', 0) <= 0:
@@ -115,133 +196,134 @@ def main():
                     print(f"Matched device: [{i}] {name}", file=sys.stderr)
                     break
 
-        if device_index is None:
-            print(f"WARNING: Device '{args.device}' not found, listing available inputs:", file=sys.stderr)
-            for i in range(pa.get_device_count()):
-                info = pa.get_device_info_by_index(i)
-                if info.get('maxInputChannels', 0) > 0:
-                    print(f"  [{i}] {info['name']}", file=sys.stderr)
-            print("Falling back to default device", file=sys.stderr)
-
-    # ── Audio stream helpers ──
-
-    chunk_size = 1280  # ~80ms at 16kHz (OpenWakeWord expects this)
-    stream = None
+    stream = {'handle': None}
 
     def open_stream():
-        nonlocal stream
-        stream = pa.open(
+        stream['handle'] = pa.open(
             format=pyaudio.paInt16,
             channels=1,
             rate=args.sample_rate,
             input=True,
-            frames_per_buffer=chunk_size,
+            frames_per_buffer=CHUNK_SAMPLES,
             input_device_index=device_index,
         )
 
     def close_stream():
-        nonlocal stream
-        if stream is not None:
+        h = stream['handle']
+        if h is not None:
             try:
-                stream.stop_stream()
-                stream.close()
+                h.stop_stream()
+                h.close()
             except Exception:
                 pass
-            stream = None
+            stream['handle'] = None
 
-    # ── Stdin command reader (PAUSE / RESUME) ──
-
-    paused = False
-    running = True
-    lock = threading.Lock()
-
-    def stdin_reader():
-        nonlocal paused, running
-        while running:
-            try:
-                line = sys.stdin.readline()
-                if not line:  # EOF
-                    running = False
-                    break
-                cmd = line.strip().upper()
-                with lock:
-                    if cmd == 'PAUSE':
-                        if not paused:
-                            paused = True
-                            print("PAUSED", file=sys.stderr, flush=True)
-                    elif cmd == 'RESUME':
-                        if paused:
-                            paused = False
-                            print("RESUMED", file=sys.stderr, flush=True)
-                    elif cmd == 'QUIT':
-                        running = False
-                        break
-            except Exception:
-                break
-
-    stdin_thread = threading.Thread(target=stdin_reader, daemon=True)
-    stdin_thread.start()
-
-    # ── Signal handling ──
-
-    def handle_signal(sig, frame):
-        nonlocal running
-        running = False
-    signal.signal(signal.SIGTERM, handle_signal)
-    signal.signal(signal.SIGINT, handle_signal)
-
-    # ── Main loop ──
+    def read_chunk():
+        with state.lock:
+            is_paused = state.paused
+        # In ALSA mode, pausing means physically releasing the device.
+        if is_paused:
+            if stream['handle'] is not None:
+                close_stream()
+                print("STREAM_CLOSED", file=sys.stderr, flush=True)
+            time.sleep(0.1)
+            return b'\x00' * (CHUNK_SAMPLES * 2)  # dummy silence; won't be predicted
+        if stream['handle'] is None:
+            open_stream()
+            oww_model.reset()
+            print("STREAM_REOPENED", file=sys.stderr, flush=True)
+        try:
+            return stream['handle'].read(CHUNK_SAMPLES, exception_on_overflow=False)
+        except Exception as e:
+            print(f"Audio read error: {e}", file=sys.stderr)
+            close_stream()
+            time.sleep(0.5)
+            return b'\x00' * (CHUNK_SAMPLES * 2)
 
     open_stream()
-    print("READY", file=sys.stderr, flush=True)
-
     try:
-        while running:
-            with lock:
-                is_paused = paused
-
-            if is_paused:
-                # Close the audio stream so arecord can use the device
-                if stream is not None:
-                    close_stream()
-                    print("STREAM_CLOSED", file=sys.stderr, flush=True)
-                # Wait a bit before checking again
-                import time
-                time.sleep(0.1)
-                continue
-
-            # Reopen stream if it was closed (after resume)
-            if stream is None:
-                open_stream()
-                oww_model.reset()
-                print("STREAM_REOPENED", file=sys.stderr, flush=True)
-
-            try:
-                audio_data = stream.read(chunk_size, exception_on_overflow=False)
-            except Exception as e:
-                print(f"Audio read error: {e}", file=sys.stderr)
-                close_stream()
-                import time
-                time.sleep(0.5)
-                continue
-
-            audio_array = np.frombuffer(audio_data, dtype=np.int16)
-
-            oww_model.predict(audio_array)
-
-            for model_name, score in oww_model.prediction_buffer.items():
-                if len(score) > 0 and score[-1] > args.threshold:
-                    print("DETECTED", flush=True)
-                    oww_model.reset()
-                    break
-
-    except KeyboardInterrupt:
-        pass
+        run_predict_loop(oww_model, read_chunk, state, args.threshold)
     finally:
         close_stream()
         pa.terminate()
         print("Wake word detection stopped", file=sys.stderr)
 
 
+# ─────────────────────────────────────────────────────────────────
+# Stdin input (ESP32 backend)
+# ─────────────────────────────────────────────────────────────────
+
+def run_stdin_mode(args, oww_model, state: State):
+    """
+    Feed the model from raw PCM on stdin (fd 0): 16-bit signed LE mono
+    at `args.sample_rate`. Bytes are accumulated until one complete
+    CHUNK_SAMPLES chunk is buffered, and that chunk goes to the model.
+    """
+    print("Listening on stdin for raw S16LE mono PCM", file=sys.stderr)
+    needed = CHUNK_SAMPLES * 2
+    source = sys.stdin.buffer
+    pending = bytearray()
+
+    def read_chunk():
+        # None means "stop": EOF, a read failure, or shutdown mid-chunk.
+        while state.running and len(pending) < needed:
+            try:
+                piece = source.read(needed - len(pending))
+            except Exception as e:
+                print(f"stdin read error: {e}", file=sys.stderr)
+                return None
+            if not piece:
+                return None
+            pending.extend(piece)
+        if len(pending) < needed:
+            return None
+        chunk = bytes(pending[:needed])
+        del pending[:needed]
+        return chunk
+
+    try:
+        run_predict_loop(oww_model, read_chunk, state, args.threshold)
+    finally:
+        print("Wake word detection stopped", file=sys.stderr)
+
+
+# ─────────────────────────────────────────────────────────────────
+# Entrypoint
+# ─────────────────────────────────────────────────────────────────
+
+def main():
+    """Parse CLI options, load the model, start the control reader, then run the selected input mode."""
+    parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection')
+    parser.add_argument('--model', type=str, default='hey_jarvis')
+    parser.add_argument('--threshold', type=float, default=0.5)
+    parser.add_argument('--input', type=str, choices=['alsa', 'stdin'], default='alsa',
+                        help="Audio source. 'alsa' opens PyAudio, 'stdin' reads from fd 0.")
+    parser.add_argument('--device', type=str, default='default',
+                        help='ALSA audio capture device (only used with --input alsa).')
+    parser.add_argument('--control-fd', type=int, default=0,
+                        help='File descriptor to read control commands from. '
+                             'Default 0 (stdin) for ALSA; stdin mode uses 3 instead of 0.')
+    parser.add_argument('--sample-rate', type=int, default=16000)
+    args = parser.parse_args()
+    if args.input == 'stdin' and args.control_fd == 0:  # fd 0 carries the PCM stream there
+        args.control_fd = 3
+
+    state = State()
+    def handle_signal(_sig, _frame):
+        state.running = False
+    signal.signal(signal.SIGTERM, handle_signal)
+    signal.signal(signal.SIGINT, handle_signal)
+
+    oww_model = load_model(args.model)
+    print(f"Wake word model loaded: {args.model}", file=sys.stderr)
+    print(f"Threshold: {args.threshold}", file=sys.stderr)
+    start_control_reader(state, args.control_fd)
+    if args.input == 'stdin':
+        run_stdin_mode(args, oww_model, state)
+    else:
+        print(f"Listening on device: {args.device}", file=sys.stderr)
+        run_alsa_mode(args, oww_model, state)
+
+
 if __name__ == '__main__':
     main()
diff --git a/apps/robot-client/src/config/hardware.config.ts b/apps/robot-client/src/config/hardware.config.ts
index c071238..e0c330a 100644
--- a/apps/robot-client/src/config/hardware.config.ts
+++ b/apps/robot-client/src/config/hardware.config.ts
@@ -1,8 +1,11 @@
 export interface AudioConfig {
-  /** ALSA device for capture (e.g., 'plughw:1,0' or 'default') */
+  /** Which audio backend to use: 'esp32' (default) or 'alsa' (legacy). */
+  backend: 'esp32' | 'alsa';
+
+  /** ALSA device for capture (only used when backend='alsa'). */
   captureDevice: string;
 
-  /** ALSA device for playback (e.g., 'plughw:0,0' or 'default') */
+  /** ALSA device for playback (only used when backend='alsa'). */
   playbackDevice: string;
 
   /** Sample rate in Hz */
@@ -53,8 +56,13 @@ export interface HardwareConfig {
 }
 
 export function loadHardwareConfig(): HardwareConfig {
+  const backend = (process.env.AUDIO_BACKEND || 'esp32').toLowerCase() as
+    | 'esp32'
+    | 'alsa';
+
   return {
     audio: {
+      backend,
       captureDevice: process.env.AUDIO_CAPTURE_DEVICE || 'default',
       playbackDevice: process.env.AUDIO_PLAYBACK_DEVICE || 'default',
       sampleRate: parseInt(process.env.AUDIO_SAMPLE_RATE || '16000', 10),
@@ -69,8 +77,15 @@ export function loadHardwareConfig(): HardwareConfig {
       threshold: parseFloat(process.env.WAKEWORD_THRESHOLD || '0.5'),
     },
     serial: {
-      enabled: (process.env.HARDWARE_SERIAL_ENABLED || 'false').toLowerCase() === 'true',
-      path: process.env.HARDWARE_SERIAL_PORT || '/dev/ttyUSB0',
+      // The ESP32 is now the mic/speaker front-end — serial link is
+      // enabled by default. Set HARDWARE_SERIAL_ENABLED=false only
+      // when intentionally falling back to the ALSA backend.
+      enabled:
+        (process.env.HARDWARE_SERIAL_ENABLED || (backend === 'esp32' ? 'true' : 'false'))
+          .toLowerCase() === 'true',
+      // Default to /dev/serial0 (the Pi's hardware UART once the
+      // console has been freed via raspi-config).
+      path: process.env.HARDWARE_SERIAL_PORT || '/dev/serial0',
       baudRate: parseInt(process.env.HARDWARE_SERIAL_BAUD || '921600', 10),
       heartbeatIntervalMs: parseInt(process.env.HARDWARE_HEARTBEAT_MS || '1000', 10),
     },
diff --git a/apps/robot-client/src/hardware/hardware.service.ts b/apps/robot-client/src/hardware/hardware.service.ts
index f243dfc..fd8852a 100644
--- a/apps/robot-client/src/hardware/hardware.service.ts
+++ b/apps/robot-client/src/hardware/hardware.service.ts
@@ -27,8 +27,17 @@ export interface HardwareServiceEvents {
   log: (message: string) => void;
   frame: (frame: DecodedFrame) => void;
   ack: (payload: Buffer) => void;
+  /** Emitted for each AUDIO_UP frame received from the ESP32 (raw S16 mono PCM). */
+  audio_up: (chunk: Buffer) => void;
 }
 
+/**
+ * Max bytes we put in a single AUDIO_DOWN frame. Must stay below
+ * MAX_PAYLOAD_SIZE (1024) and should map to a whole number of
+ * 20 ms @ 16 kHz chunks: 640 bytes = 20 ms, 320 samples.
+ */
+const AUDIO_DOWN_CHUNK_BYTES = 640;
+
 /**
  * HardwareService — the robot-client's only direct link to the ESP32.
  *
@@ -136,6 +145,42 @@ export class HardwareService extends EventEmitter {
     this.writeFrame(MsgType.DISPLAY_CLEAR);
   }
 
+  /**
+   * Send a PCM S16 mono 16 kHz buffer to the ESP32 speaker as one or
+   * more AUDIO_DOWN frames. The buffer is automatically split into
+   * chunks of `AUDIO_DOWN_CHUNK_BYTES` so each frame fits within the
+   * UART protocol's MAX_PAYLOAD_SIZE.
+   *
+   * Back-pressure note: `SerialPort.write` buffers in user-space, so
+   * this method is best-effort. For long TTS playbacks, call
+   * `drainAudioDown()` between chunks or space them with a `setTimeout`
+   * to avoid unbounded growth.
+   */
+  sendAudioDown(chunk: Buffer): void {
+    if (!this.port?.isOpen) {
+      this.log.warn('Dropping AUDIO_DOWN — serial port not open');
+      return;
+    }
+    for (let offset = 0; offset < chunk.length; offset += AUDIO_DOWN_CHUNK_BYTES) {
+      const slice = chunk.subarray(offset, offset + AUDIO_DOWN_CHUNK_BYTES);
+      this.writeFrame(MsgType.AUDIO_DOWN, slice);
+    }
+  }
+
+  /**
+   * Wait for the kernel-side serial buffer to drain. Useful between
+   * large AUDIO_DOWN bursts to keep latency bounded.
+   */
+  drainAudioDown(): Promise<void> {
+    return new Promise((resolve, reject) => {
+      if (!this.port?.isOpen) {
+        resolve();
+        return;
+      }
+      this.port.drain((err) => (err ? reject(err) : resolve()));
+    });
+  }
+
   /**
    * Round-trip PING → PONG used for bring-up and latency checks.
    * Resolves with the measured RTT in ms.
@@ -187,6 +232,9 @@ export class HardwareService extends EventEmitter {
       case MsgType.ERROR:
         this.log.error({ payload: frame.payload.toString('utf8') }, 'firmware error');
         return;
+      case MsgType.AUDIO_UP:
+        this.emit('audio_up', frame.payload);
+        return;
       default:
         return;
     }
diff --git a/apps/robot-client/src/main.ts b/apps/robot-client/src/main.ts
index e271a3b..034cb9a 100644
--- a/apps/robot-client/src/main.ts
+++ b/apps/robot-client/src/main.ts
@@ -1,7 +1,7 @@
 import { loadRobotConfig, loadHardwareConfig } from './config/index.js';
 import { CloudSocket } from './transport/index.js';
 import {
-  AudioService,
+  createAudioService,
   WakeWordService,
   KeyboardTriggerService,
   HealthService,
@@ -72,15 +72,16 @@ async function main(): Promise<void> {
   const resolvedConfig = { ...robotConfig, deviceId, deviceToken };
 
   const cloudSocket = new CloudSocket(resolvedConfig as Required<typeof resolvedConfig>);
-  const audioService = new AudioService(hardwareConfig.audio);
   const healthService = new HealthService(cloudSocket);
 
-  // ── Optional: hardware bridge (ESP32 firmware) ──
-  // The serial link is opt-in via HARDWARE_SERIAL_ENABLED=true. We
-  // treat failures here as non-fatal: even without a face, the
-  // robot can still converse with the cloud.
+  // ── Hardware bridge (ESP32 firmware) ──
+  // With AUDIO_BACKEND=esp32 the ESP32 owns the mic AND the speaker,
+  // so the serial link is mandatory. With AUDIO_BACKEND=alsa we can
+  // still run without it (face will be missing, but audio works).
 
+  const audioBackend = hardwareConfig.audio.backend;
   let hardwareService: HardwareService | null = null;
+
   if (hardwareConfig.serial.enabled) {
     hardwareService = new HardwareService({
       path: hardwareConfig.serial.path,
@@ -93,19 +94,40 @@ async function main(): Promise<void> {
       hardwareService.sendEmotion(Emotion.HAPPY);
       logger.info('Hardware bridge connected');
     } catch (err) {
+      if (audioBackend === 'esp32') {
+        logger.fatal(
+          { err, path: hardwareConfig.serial.path },
+          'Hardware bridge required for AUDIO_BACKEND=esp32 — check the UART wiring or set AUDIO_BACKEND=alsa',
+        );
+        process.exit(1);
+      }
       logger.warn({ err }, 'Hardware bridge unavailable — continuing without face');
       hardwareService = null;
     }
+  } else if (audioBackend === 'esp32') {
+    logger.fatal(
+      'AUDIO_BACKEND=esp32 requires HARDWARE_SERIAL_ENABLED=true. Either enable the serial link or switch to AUDIO_BACKEND=alsa.',
+    );
+    process.exit(1);
   } else {
     logger.info('Hardware bridge disabled (set HARDWARE_SERIAL_ENABLED=true to enable)');
   }
 
+  // Audio service — pick a backend now that we know whether the
+  // hardware bridge is alive.
+  const audioService = createAudioService(hardwareConfig.audio, hardwareService);
+  logger.info({ backend: audioBackend }, 'Audio service initialised');
+
   // Choose trigger based on TRIGGER_MODE
   let trigger: ITriggerService;
 
   if (resolvedConfig.triggerMode === 'wakeword') {
     logger.info('Trigger: wake word (OpenWakeWord)');
-    trigger = new WakeWordService(hardwareConfig.wakeWord, hardwareConfig.audio);
+    trigger = new WakeWordService(
+      hardwareConfig.wakeWord,
+      hardwareConfig.audio,
+      audioBackend === 'esp32' ? hardwareService : null,
+    );
   } else {
     logger.info('Trigger: keyboard (press Enter to talk)');
     trigger = new KeyboardTriggerService();
diff --git a/apps/robot-client/src/services/audio.service.ts b/apps/robot-client/src/services/audio.service.ts
index c44bc73..e3c3dbc 100644
--- a/apps/robot-client/src/services/audio.service.ts
+++ b/apps/robot-client/src/services/audio.service.ts
@@ -1,30 +1,48 @@
 import { ChildProcess, spawn } from 'node:child_process';
 import { EventEmitter } from 'node:events';
 import { type AudioConfig } from '../config/index.js';
+import { type HardwareService } from '../hardware/index.js';
 import { createLogger, type Logger } from '../utils/index.js';
 
 export interface AudioServiceEvents {
-  /** Emitted when a raw PCM audio chunk is captured from the microphone */
+  /** Emitted when a raw PCM audio chunk is captured from the microphone. */
   audio_chunk: (chunk: Buffer) => void;
-  /** Emitted when playback of a response finishes */
+  /** Emitted when playback of a response finishes. */
   playback_done: () => void;
-  /** Emitted on audio errors */
+  /** Emitted on audio errors. */
   error: (error: Error) => void;
 }
 
 /**
- * Audio service for Raspberry Pi.
+ * Common audio interface used by the orchestrator, wake word service,
+ * and test scripts. Two backends implement it:
  *
- * Uses ALSA tools (arecord/aplay) via child processes.
- * Works with any ALSA-compatible audio device:
- * - I2S (INMP441 mic, MAX98357 amp) connected directly to Pi GPIO
- * - USB audio devices
- * - Default system audio
+ *   - `AlsaAudioService` — arecord/aplay child processes, for dev on a
+ *     machine with a USB mic or when the Pi owns the I2S mic/speaker
+ *     directly. Selected with `AUDIO_BACKEND=alsa`.
  *
- * Audio format: PCM signed 16-bit little-endian, mono, 16kHz
+ *   - `Esp32AudioService` — mic and speaker live on the ESP32; audio
+ *     flows over UART via `HardwareService`. Selected with
+ *     `AUDIO_BACKEND=esp32` (the default in production).
  */
-export class AudioService extends EventEmitter {
+export abstract class AudioService extends EventEmitter {
+  abstract get isCapturing(): boolean;
+  abstract get isPlaying(): boolean;
+  abstract startCapture(): void;
+  abstract stopCapture(): void;
+  abstract play(audioBuffer: Buffer): Promise<void>;
+  abstract stopPlayback(): void;
+  abstract destroy(): Promise<void>;
+}
+
+// ─────────────────────────────────────────────────────────────────
+// ALSA backend — kept for dev on laptops and for Pi setups where
+// the mic/speaker hang off ALSA directly (USB sound card, HAT…).
+// ─────────────────────────────────────────────────────────────────
+
+export class AlsaAudioService extends AudioService {
   private captureProcess: ChildProcess | null = null;
+  private playProcess: ChildProcess | null = null;
   private readonly logger: Logger;
   private _isCapturing = false;
   private _isPlaying = false;
@@ -32,7 +50,7 @@ export class AudioService extends EventEmitter {
 
   constructor(private readonly config: AudioConfig) {
     super();
-    this.logger = createLogger('audio', 'info');
+    this.logger = createLogger('audio:alsa', 'info');
   }
 
   get isCapturing(): boolean {
@@ -43,10 +61,6 @@ export class AudioService extends EventEmitter {
     return this._isPlaying;
   }
 
-  /**
-   * Start capturing audio from the microphone.
-   * Emits 'audio_chunk' events with raw PCM buffers.
-   */
   startCapture(): void {
     if (this._isCapturing) {
       this.logger.warn('Already capturing audio');
@@ -58,13 +72,6 @@ export class AudioService extends EventEmitter {
       'Starting audio capture',
     );
 
-    // arecord outputs raw PCM to stdout
-    // -D: ALSA device
-    // -f: format (S16_LE = signed 16-bit little-endian)
-    // -r: sample rate
-    // -c: channels
-    // -t: type (raw = no header)
-    // --buffer-size: in frames, controls latency
     const bufferFrames = Math.floor(this.config.sampleRate * (this.config.chunkDurationMs / 1000));
 
     this.captureProcess = spawn('arecord', [
@@ -112,9 +119,6 @@ export class AudioService extends EventEmitter {
     });
   }
 
-  /**
-   * Stop capturing audio from the microphone.
-   */
   stopCapture(): void {
     if (!this.captureProcess) return;
 
@@ -125,12 +129,6 @@ export class AudioService extends EventEmitter {
     this._isCapturing = false;
   }
 
-  /**
-   * Play audio through the speaker.
-   * Accepts either raw PCM or WAV (with RIFF header) data.
-   *
-   * @returns Promise that resolves when playback is complete
-   */
   async play(audioBuffer: Buffer): Promise<void> {
     if (this._isPlaying) {
       this.logger.warn('Already playing audio, queueing...');
@@ -152,24 +150,26 @@ export class AudioService extends EventEmitter {
             '-',
           ];
 
-      const playProcess = spawn('aplay', args, {
+      this.playProcess = spawn('aplay', args, {
         stdio: ['pipe', 'ignore', 'pipe'],
       });
 
-      playProcess.stderr?.on('data', (data: Buffer) => {
+      this.playProcess.stderr?.on('data', (data: Buffer) => {
         const msg = data.toString().trim();
         if (msg && !msg.startsWith('Playing') && !msg.startsWith('Warning')) {
           this.logger.error({ msg }, 'aplay stderr');
         }
       });
 
-      playProcess.on('error', (err) => {
+      this.playProcess.on('error', (err) => {
         this._isPlaying = false;
+        this.playProcess = null;
         reject(new Error(`Audio playback failed: ${err.message}`));
       });
 
-      playProcess.on('exit', (code) => {
+      this.playProcess.on('exit', (code) => {
         this._isPlaying = false;
+        this.playProcess = null;
         if (code === 0 || code === null) {
           this.emit('playback_done');
           resolve();
@@ -178,26 +178,194 @@ export class AudioService extends EventEmitter {
         }
       });
 
-      // Write audio data to aplay's stdin and close it
-      playProcess.stdin?.write(audioBuffer);
-      playProcess.stdin?.end();
+      this.playProcess.stdin?.write(audioBuffer);
+      this.playProcess.stdin?.end();
     });
   }
 
-  /**
-   * Stop any currently playing audio.
-   */
   stopPlayback(): void {
-    // aplay is spawned per-play, so we can't easily stop it here
-    // For interrupt support, we'd track the play process
+    if (this.playProcess) {
+      this.playProcess.kill('SIGTERM');
+      this.playProcess = null;
+    }
     this._isPlaying = false;
   }
 
-  /**
-   * Clean up resources.
-   */
   async destroy(): Promise<void> {
     this.stopCapture();
+    this.stopPlayback();
     this.removeAllListeners();
   }
 }
+
+// ─────────────────────────────────────────────────────────────────
+// ESP32 backend — the mic and speaker live on the firmware side and
+// audio flows over the UART link owned by HardwareService.
+// ─────────────────────────────────────────────────────────────────
+
+/**
+ * Bytes-per-chunk written to the ESP32 per AUDIO_DOWN frame. Must
+ * match `AUDIO_DOWN_CHUNK_BYTES` in HardwareService. 640 bytes =
+ * 20 ms of 16 kHz S16 mono audio.
+ */
+const ESP32_CHUNK_BYTES = 640;
+
+/** Milliseconds we wait between two AUDIO_DOWN frames during playback. */
+const ESP32_PACING_MS = 18;
+
+export class Esp32AudioService extends AudioService {
+  private readonly logger: Logger;
+  private _isCapturing = false;
+  private _isPlaying = false;
+  private _playbackAbort = false;
+
+  /** Stable listener reference so `stopCapture()` can detach exactly this handler. */
+  private readonly forwardAudioUp = (chunk: Buffer): void => {
+    if (!this._isCapturing) return;
+    this.emit('audio_chunk', chunk);
+  };
+
+  constructor(
+    _config: AudioConfig,
+    private readonly hardware: HardwareService,
+  ) {
+    super();
+    void _config;
+    this.logger = createLogger('audio:esp32', 'info');
+  }
+
+  get isCapturing(): boolean {
+    return this._isCapturing;
+  }
+
+  get isPlaying(): boolean {
+    return this._isPlaying;
+  }
+
+  startCapture(): void {
+    if (this._isCapturing) {
+      this.logger.warn('Already capturing audio');
+      return;
+    }
+    this.logger.info('Subscribing to ESP32 AUDIO_UP stream');
+    this._isCapturing = true;
+    // Attach exactly once per capture session — removed in stopCapture.
+    this.hardware.on('audio_up', this.forwardAudioUp);
+  }
+
+  stopCapture(): void {
+    if (!this._isCapturing) return;
+    this.logger.info('Unsubscribing from ESP32 AUDIO_UP stream');
+    this._isCapturing = false;
+    this.hardware.off('audio_up', this.forwardAudioUp);
+  }
+
+  /**
+   * Play a PCM S16 mono 16 kHz buffer on the ESP32 speaker. If
+   * `audioBuffer` carries a WAV header, it is stripped first (the firmware expects raw PCM).
+   *
+   * We pace the writes manually so the Node serial buffer and the
+   * ESP32 speaker DMA stay roughly in sync. Without pacing, the whole
+   * buffer would be pushed into the kernel at once and the robot would
+   * still be "speaking" long after the orchestrator thinks it's done.
+   */
+  async play(audioBuffer: Buffer): Promise<void> {
+    if (this._isPlaying) {
+      this.logger.warn('Already playing audio — ignoring new buffer');
+      return;
+    }
+
+    const pcm = stripWavHeader(audioBuffer);
+    if (pcm.length === 0) {
+      this.emit('playback_done');
+      return;
+    }
+
+    this._isPlaying = true;
+    this._playbackAbort = false;
+
+    try {
+      for (let offset = 0; offset < pcm.length; offset += ESP32_CHUNK_BYTES) {
+        if (this._playbackAbort) break;
+        const slice = pcm.subarray(offset, offset + ESP32_CHUNK_BYTES);
+        this.hardware.sendAudioDown(slice);
+        if (ESP32_PACING_MS > 0) {
+          await delay(ESP32_PACING_MS);
+        }
+      }
+      // Let the kernel TX buffer drain so we don't race on destroy.
+      try {
+        await this.hardware.drainAudioDown();
+      } catch (err) {
+        this.logger.warn({ err }, 'drain after playback failed');
+      }
+      this.emit('playback_done');
+    } finally {
+      this._isPlaying = false;
+      this._playbackAbort = false;
+    }
+  }
+
+  stopPlayback(): void {
+    if (!this._isPlaying) return;
+    this.logger.info('Aborting playback');
+    this._playbackAbort = true;
+  }
+
+  async destroy(): Promise<void> {
+    this.stopCapture();
+    this.stopPlayback();
+    this.removeAllListeners();
+  }
+}
+
+// ─────────────────────────────────────────────────────────────────
+// Helpers
+// ─────────────────────────────────────────────────────────────────
+
+function delay(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Strip a canonical 44-byte RIFF/WAVE header if present (extra chunks before
+ * `data` are not handled). The ESP32 I2S driver wants raw S16 mono PCM only.
+ */
+function stripWavHeader(buf: Buffer): Buffer {
+  if (buf.length > 44 && buf.toString('ascii', 0, 4) === 'RIFF' && buf.toString('ascii', 8, 12) === 'WAVE') {
+    return buf.subarray(44);
+  }
+  return buf;
+}
+
+// ─────────────────────────────────────────────────────────────────
+// Factory
+// ─────────────────────────────────────────────────────────────────
+
+export type AudioBackend = 'alsa' | 'esp32';
+
+/**
+ * Create the right AudioService for the current backend. The default
+ * is `esp32`; set `AUDIO_BACKEND=alsa` to fall back to the legacy
+ * arecord/aplay path (useful for laptop dev without an ESP32 wired in).
+ */
+export function createAudioService(
+  config: AudioConfig,
+  hardware: HardwareService | null,
+): AudioService {
+  const backend = (config.backend ?? 'esp32') as AudioBackend;
+  if (backend === 'alsa') {
+    return new AlsaAudioService(config);
+  }
+  if (backend === 'esp32') {
+    if (!hardware) {
+      throw new Error(
+        'AUDIO_BACKEND=esp32 requires a connected HardwareService — ' +
+          'set HARDWARE_SERIAL_ENABLED=true and make sure the ESP32 is reachable, ' +
+          'or switch to AUDIO_BACKEND=alsa for local development.',
+      );
+    }
+    return new Esp32AudioService(config, hardware);
+  }
+  throw new Error(`Unknown AUDIO_BACKEND: ${backend}`);
+}
diff --git a/apps/robot-client/src/services/index.ts b/apps/robot-client/src/services/index.ts
index 0a25fd7..b38ff48 100644
--- a/apps/robot-client/src/services/index.ts
+++ b/apps/robot-client/src/services/index.ts
@@ -1,4 +1,10 @@
-export { AudioService } from './audio.service.js';
+export {
+  AudioService,
+  AlsaAudioService,
+  Esp32AudioService,
+  createAudioService,
+  type AudioBackend,
+} from './audio.service.js';
 export { WakeWordService } from './wake-word.service.js';
 export { KeyboardTriggerService } from './keyboard-trigger.service.js';
 export { HealthService } from './health.service.js';
diff --git a/apps/robot-client/src/services/wake-word.service.ts b/apps/robot-client/src/services/wake-word.service.ts
index 228fb42..7d2b30a 100644
--- a/apps/robot-client/src/services/wake-word.service.ts
+++ b/apps/robot-client/src/services/wake-word.service.ts
@@ -1,24 +1,35 @@
 import { ChildProcess, spawn } from 'node:child_process';
 import { EventEmitter } from 'node:events';
 import { type WakeWordConfig, type AudioConfig } from '../config/index.js';
+import { type HardwareService } from '../hardware/index.js';
 import { createLogger, type Logger } from '../utils/index.js';
 
 export interface WakeWordServiceEvents {
-  /** Emitted when the wake word is detected */
   detected: () => void;
-  /** Emitted when the engine is ready */
   ready: () => void;
-  /** Emitted on errors */
   error: (error: Error) => void;
 }
 
 /**
  * Wake word detection service.
  *
- * Runs OpenWakeWord as a **long-lived** Python subprocess.
- * The model is loaded once at startup; pause/resume is handled via
- * PAUSE/RESUME commands on stdin, so the audio device is released
- * while arecord is capturing, then reclaimed when listening resumes.
+ * Two operating modes, selected by whether a HardwareService is passed
+ * to the constructor:
+ *
+ * 1. **ALSA mode** (no HardwareService)
+ *    The Python subprocess opens PyAudio on `audioConfig.captureDevice`
+ *    and reads the mic directly. Pause releases the ALSA device so
+ *    arecord (the AlsaAudioService) can use it during conversation.
+ *
+ * 2. **ESP32 mode** (HardwareService provided)
+ *    The Python subprocess reads raw S16 mono PCM from stdin. We
+ *    subscribe to `hardware.on('audio_up')` and pipe every mic chunk
+ *    coming off the UART straight into the Python process. Control
+ *    commands (PAUSE/RESUME/RESET/QUIT) go over a separate pipe at
+ *    fd 3 because stdin is busy carrying audio.
+ *
+ * The model is loaded once at startup; pause/resume is cheap and
+ * does not reload it.
  */
 export class WakeWordService extends EventEmitter {
   private process: ChildProcess | null = null;
@@ -26,51 +37,73 @@ export class WakeWordService extends EventEmitter {
   private _isListening = false;
   private _isPaused = false;
   private _streamClosed = false;
+  private readonly usesHardware: boolean;
+
+  /** Stable forwarder reference so it can be detached on stop / error / exit. */
+  private readonly forwardMicChunk = (chunk: Buffer): void => {
+    if (!this.process || !this.process.stdin || this.process.stdin.destroyed) return;
+    // Node buffers writes in memory when the pipe is full. We neither
+    // apply back-pressure nor drop chunks here: dropping wake-word audio
+    // would hurt detection accuracy for a few tens of ms.
+    this.process.stdin.write(chunk);
+  };
 
   constructor(
     private readonly wakeWordConfig: WakeWordConfig,
     private readonly audioConfig: AudioConfig,
+    private readonly hardware: HardwareService | null = null,
   ) {
     super();
     this.logger = createLogger('wake-word', 'info');
+    this.usesHardware = hardware !== null;
   }
 
   get isListening(): boolean {
     return this._isListening && !this._isPaused;
   }
 
-  /**
-   * Start the wake word Python subprocess.
-   * The model is loaded once; subsequent pause/resume cycles are fast.
-   */
   start(): void {
     if (this.process) {
-      // Process already running — just resume if paused
-      if (this._isPaused) {
-        this.resume();
-      }
+      if (this._isPaused) this.resume();
       return;
     }
 
     this.logger.info(
-      { model: this.wakeWordConfig.modelName, threshold: this.wakeWordConfig.threshold },
+      {
+        mode: this.usesHardware ? 'esp32' : 'alsa',
+        model: this.wakeWordConfig.modelName,
+        threshold: this.wakeWordConfig.threshold,
+      },
       'Starting wake word detection',
     );
 
-    this.process = spawn(this.wakeWordConfig.pythonPath, [
+    const args = [
       this.wakeWordConfig.scriptPath,
       '--model', this.wakeWordConfig.modelName,
       '--threshold', String(this.wakeWordConfig.threshold),
-      '--device', this.audioConfig.captureDevice,
       '--sample-rate', String(this.audioConfig.sampleRate),
-    ], {
-      stdio: ['pipe', 'pipe', 'pipe'],
-    });
+    ];
+
+    if (this.usesHardware) {
+      args.push('--input', 'stdin', '--control-fd', '3');
+    } else {
+      args.push('--input', 'alsa', '--device', this.audioConfig.captureDevice);
+    }
+
+    // stdio layout:
+    //   0: stdin  — audio in (ESP32 mode) or control (ALSA mode)
+    //   1: stdout — DETECTED events
+    //   2: stderr — status & log lines
+    //   3: extra  — control pipe (ESP32 mode only)
+    const stdio: ('pipe' | 'ignore')[] = this.usesHardware
+      ? ['pipe', 'pipe', 'pipe', 'pipe']
+      : ['pipe', 'pipe', 'pipe'];
+
+    this.process = spawn(this.wakeWordConfig.pythonPath, args, { stdio });
 
     this._isListening = true;
     this._isPaused = false;
 
-    // ── stdout: DETECTED events ──
     this.process.stdout?.on('data', (data: Buffer) => {
       const lines = data.toString().trim().split('\n');
       for (const line of lines) {
@@ -83,7 +116,6 @@ export class WakeWordService extends EventEmitter {
       }
     });
 
-    // ── stderr: status messages ──
     this.process.stderr?.on('data', (data: Buffer) => {
       const lines = data.toString().trim().split('\n');
       for (const line of lines) {
@@ -107,10 +139,9 @@ export class WakeWordService extends EventEmitter {
           this.logger.info('⏳ Loading wake word model...');
         } else if (msg.startsWith('Wake word model loaded')) {
           this.logger.info('✅ Wake word model loaded');
-        } else if (msg.startsWith('Matched device') || msg.startsWith('Using device')) {
+        } else if (msg.startsWith('Matched device') || msg.startsWith('Using device') || msg.startsWith('Listening')) {
           this.logger.info(`🔊 ${msg}`);
         } else {
-          // Log unknown stderr messages at warn level to catch errors
           this.logger.warn({ msg }, 'Wake word stderr');
         }
       }
@@ -119,29 +150,36 @@ export class WakeWordService extends EventEmitter {
     this.process.on('error', (err) => {
       this._isListening = false;
       this.logger.error({ err }, 'Wake word process error');
+      this.detachHardware();
       this.emit('error', new Error(`Wake word process failed: ${err.message}`));
     });
 
     this.process.on('exit', (code) => {
       this._isListening = false;
       this._isPaused = false;
+      this.detachHardware();
       this.process = null;
       if (code !== 0 && code !== null) {
         this.logger.warn({ code }, 'Wake word process exited unexpectedly');
-        // Auto-restart after a short delay
         setTimeout(() => {
           this.logger.info('Restarting wake word detection...');
           this.start();
         }, 2000);
       }
     });
+
+    // In ESP32 mode, start piping mic audio from the UART.
+    if (this.usesHardware && this.hardware) {
+      this.hardware.on('audio_up', this.forwardMicChunk);
+    }
   }
 
   /**
    * Pause wake word detection.
-   * Sends PAUSE command to Python subprocess which closes the audio stream,
-   * freeing the device for arecord. Returns a promise that resolves when
-   * the audio stream is confirmed closed.
+   *
+   * In ALSA mode we must wait for STREAM_CLOSED so arecord can reclaim
+   * the device. In ESP32 mode the audio flow never stops — we just
+   * tell the Python process to ignore detections.
    */
   pause(): Promise<void> {
     if (!this.process || this._isPaused) return Promise.resolve();
@@ -149,9 +187,13 @@ export class WakeWordService extends EventEmitter {
     this._isPaused = true;
     this._streamClosed = false;
 
-    this.process.stdin?.write('PAUSE\n');
+    this.writeControl('PAUSE');
+
+    if (this.usesHardware) {
+      // No physical device to release — resolve immediately.
+      return Promise.resolve();
+    }
 
-    // Wait for the stream to be closed (so arecord can use the device)
     return new Promise((resolve) => {
       const checkInterval = setInterval(() => {
         if (this._streamClosed || !this.process) {
@@ -160,7 +202,6 @@ export class WakeWordService extends EventEmitter {
         }
       }, 50);
 
-      // Safety timeout
       setTimeout(() => {
         clearInterval(checkInterval);
         resolve();
@@ -168,25 +209,18 @@ export class WakeWordService extends EventEmitter {
     });
   }
 
-  /**
-   * Resume wake word detection after pause.
-   * The Python subprocess reopens the audio stream (fast, no model reload).
-   */
   resume(): void {
     if (!this.process || !this._isPaused) return;
 
     this._isPaused = false;
-    this.process.stdin?.write('RESUME\n');
+    this.writeControl('RESUME');
     this.logger.info('🎤 Resuming wake word listening...');
   }
 
-  /**
-   * Stop wake word detection permanently.
-   */
   stop(): void {
     if (this.process) {
-      this.process.stdin?.write('QUIT\n');
-      // Give it a moment to exit cleanly, then force kill
+      this.writeControl('QUIT');
+      this.detachHardware();
       setTimeout(() => {
         if (this.process) {
           this.process.kill('SIGTERM');
@@ -198,4 +232,35 @@ export class WakeWordService extends EventEmitter {
     this._isPaused = false;
     this.removeAllListeners();
   }
+
+  // ──────────────────────────────────────────────────────────
+  // Internals
+  // ──────────────────────────────────────────────────────────
+
+  /**
+   * Write a text control command. In ALSA mode that goes to stdin;
+   * in ESP32 mode stdin carries audio so commands travel over the
+   * extra pipe at fd 3 (process.stdio[3]).
+   */
+  private writeControl(cmd: string): void {
+    if (!this.process) return;
+    const line = `${cmd}\n`;
+    if (this.usesHardware) {
+      // stdio[3] is our control pipe — a Node Writable (net.Socket) stream.
+      const control = this.process.stdio[3] as unknown as
+        | (NodeJS.WritableStream & { destroyed?: boolean })
+        | null;
+      if (control && !control.destroyed) {
+        control.write(line);
+      }
+    } else {
+      this.process.stdin?.write(line);
+    }
+  }
+
+  private detachHardware(): void {
+    if (this.usesHardware && this.hardware) {
+      this.hardware.off('audio_up', this.forwardMicChunk);
+    }
+  }
 }
diff --git a/apps/robot-hardware/lib/Audio/library.json b/apps/robot-hardware/lib/Audio/library.json
new file mode 100644
index 0000000..189e247
--- /dev/null
+++ b/apps/robot-hardware/lib/Audio/library.json
@@ -0,0 +1,7 @@
+{
+  "name": "Audio",
+  "version": "0.1.0",
+  "description": "Ti-Pote audio I/O — INMP441 mic + MAX98357A speaker via two I2S peripherals.",
+  "frameworks": "arduino",
+  "platforms": "espressif32"
+}
diff --git a/apps/robot-hardware/lib/Audio/src/Audio.cpp b/apps/robot-hardware/lib/Audio/src/Audio.cpp
new file mode 100644
index 0000000..2e4cf3b
--- /dev/null
+++ b/apps/robot-hardware/lib/Audio/src/Audio.cpp
@@ -0,0 +1,151 @@
+#include "Audio.h"
+#include <driver/i2s.h>
+
+namespace tipote {
+
+// ─────────────────────────────────────────────────────────────────
+// Shared I2S bus pin assignment — see the header for rationale.
+// ─────────────────────────────────────────────────────────────────
+static constexpr int PIN_BCLK     = 32;   // shared: mic SCK + speaker BCLK
+static constexpr int PIN_LRCLK    = 33;   // shared: mic WS  + speaker LRC
+static constexpr int PIN_MIC_DIN  = 34;   // INMP441 SD   → ESP32 data-in
+static constexpr int PIN_SPK_DOUT = 22;   // MAX98357A DIN ← ESP32 data-out
+
+// DMA buffers — 4 × 256 × 8 bytes (stereo 32-bit) ≈ 8 KB each for
+// RX and TX. That's ~64 ms of audio each way at 16 kHz, plenty of
+// room to absorb UART jitter.
+static constexpr int DMA_COUNT = 4;
+static constexpr int DMA_LEN   = 256;
+
+bool Audio::begin() {
+    // Installs the I2S_NUM_0 driver as a full-duplex master (RX + TX) with
+    // 32-bit stereo slots, routes the shared BCLK/WS plus both data pins and
+    // zeroes the DMA buffers. Returns false if the driver install or the pin
+    // routing fails (the driver is uninstalled again in the latter case).
+    // The INMP441 requires 32-bit slots; the MAX98357A reads those frames too.
+    i2s_config_t cfg = {};
+    cfg.mode                 = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
+                                                       I2S_MODE_RX |
+                                                       I2S_MODE_TX);
+    cfg.sample_rate          = SAMPLE_RATE;
+    cfg.bits_per_sample      = I2S_BITS_PER_SAMPLE_32BIT;
+    cfg.channel_format       = I2S_CHANNEL_FMT_RIGHT_LEFT;  // stereo frames
+    cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
+    cfg.intr_alloc_flags     = ESP_INTR_FLAG_LEVEL1;
+    cfg.dma_buf_count        = DMA_COUNT;
+    cfg.dma_buf_len          = DMA_LEN;
+    cfg.use_apll             = false;
+    cfg.tx_desc_auto_clear   = true;
+    cfg.fixed_mclk           = 0;
+
+    if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) {
+        return false;
+    }
+
+    i2s_pin_config_t pins = {};
+    pins.bck_io_num   = PIN_BCLK;
+    pins.ws_io_num    = PIN_LRCLK;
+    pins.data_out_num = PIN_SPK_DOUT;
+    pins.data_in_num  = PIN_MIC_DIN;
+    if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
+        i2s_driver_uninstall(I2S_NUM_0);
+        return false;
+    }
+
+    i2s_zero_dma_buffer(I2S_NUM_0);
+    micStarted_ = true;
+    spkStarted_ = true;
+    return true;
+}
+
+size_t Audio::readMicChunk(uint8_t* out, size_t outCapacity) {
+    if (!micStarted_ || outCapacity < 2) return 0;
+
+    // Stereo read: each frame is L + R, each a 32-bit slot = 8 bytes.
+    // Cap at 320 frames = 20 ms @ 16 kHz per call (2560-byte stack buffer).
+    constexpr size_t MAX_PAIRS = 320;
+    int32_t raw[MAX_PAIRS * 2];
+
+    size_t wantPairs = outCapacity / 2;  // 2 bytes out per mono sample
+    if (wantPairs > MAX_PAIRS) wantPairs = MAX_PAIRS;
+
+    size_t bytesRead = 0;
+    const esp_err_t err = i2s_read(
+        I2S_NUM_0,
+        reinterpret_cast<void*>(raw),
+        wantPairs * 2 * sizeof(int32_t),
+        &bytesRead,
+        0  // non-blocking
+    );
+    if (err != ESP_OK || bytesRead == 0) return 0;
+
+    const size_t pairs = bytesRead / (2 * sizeof(int32_t));
+    int16_t* dst = reinterpret_cast<int16_t*>(out);
+    // Per-batch min/max of both raw slots and of the S16 output (debug stats).
+    int32_t lMin = INT32_MAX, lMax = INT32_MIN;
+    int32_t rMin = INT32_MAX, rMax = INT32_MIN;
+    int16_t s16Min = INT16_MAX, s16Max = INT16_MIN;
+
+    const bool pickRight = (micChannel_ == MicChannel::Right);
+
+    for (size_t i = 0; i < pairs; ++i) {
+        const int32_t L = raw[2 * i];
+        const int32_t R = raw[2 * i + 1];
+        if (L < lMin) lMin = L;
+        if (L > lMax) lMax = L;
+        if (R < rMin) rMin = R;
+        if (R > rMax) rMax = R;
+
+        // INMP441 is 24-bit left-justified in a 32-bit slot, so the
+        // useful range lives in bits 31..8. A >> 14 gives a comfortable
+        // speech level; bump to >> 11 if the result is too quiet.
+        const int32_t src = pickRight ? R : L;
+        int32_t s = src >> 14;
+        if (s >  INT16_MAX) s =  INT16_MAX;
+        if (s <  INT16_MIN) s =  INT16_MIN;
+        const int16_t s16 = static_cast<int16_t>(s);
+        if (s16 < s16Min) s16Min = s16;
+        if (s16 > s16Max) s16Max = s16;
+        dst[i] = s16;
+    }
+    // Publish this batch's stats, then return the byte count written to `out`.
+    lastStats_ = {lMin, lMax, rMin, rMax, s16Min, s16Max, pairs};
+    return pairs * 2;
+}
+
+size_t Audio::writeSpeakerChunk(const uint8_t* data, size_t len) {
+    if (!spkStarted_ || len == 0) return 0;
+
+    // Input is S16 mono PCM; the bus runs 32-bit stereo, so each sample
+    // becomes a pair of 32-bit words. At most 320 samples are taken per
+    // call (2560 output bytes); the return value says how much was taken.
+    constexpr size_t MAX_IN_SAMPLES = 320;
+    const size_t inSamples = (len / 2 > MAX_IN_SAMPLES) ? MAX_IN_SAMPLES : len / 2;
+
+    int32_t stereo[MAX_IN_SAMPLES * 2];
+    const int16_t* src = reinterpret_cast<const int16_t*>(data);
+    for (size_t i = 0; i < inSamples; ++i) {
+        // Shift up to place the sample in the upper 16 bits of the
+        // 32-bit slot (matches what the MAX98357A expects).
+        const int32_t s32 = static_cast<int32_t>(src[i]) << 16;
+        stereo[2 * i]     = s32;  // left
+        stereo[2 * i + 1] = s32;  // right duplicated
+    }
+    // Waits up to 50 ms for DMA room; the i2s_write() status is not checked.
+    size_t bytesWritten = 0;
+    i2s_write(I2S_NUM_0, stereo, inSamples * 2 * sizeof(int32_t),
+              &bytesWritten, pdMS_TO_TICKS(50));
+
+    // Convert the accepted byte count back to *caller units* (S16 mono)
+    // so the outside world doesn't need to know our internal format.
+    const size_t pairsWritten = bytesWritten / (2 * sizeof(int32_t));
+    return pairsWritten * 2;
+}
+
+void Audio::flushSpeaker() {
+    // Nothing to clear until begin() has brought the I2S driver up.
+    if (!spkStarted_) return;
+    i2s_zero_dma_buffer(I2S_NUM_0);
+}
+
+}  // namespace tipote
diff --git a/apps/robot-hardware/lib/Audio/src/Audio.h b/apps/robot-hardware/lib/Audio/src/Audio.h
new file mode 100644
index 0000000..50b1f9c
--- /dev/null
+++ b/apps/robot-hardware/lib/Audio/src/Audio.h
@@ -0,0 +1,84 @@
+// Ti-Pote — Audio I/O via a single full-duplex I2S bus.
+//
+// I2S_NUM_0 is configured as MASTER in RX+TX mode. BCLK and WS are
+// shared between the INMP441 microphone (RX) and the MAX98357A
+// amplifier (TX), which is the standard I2S bus layout — exactly
+// what was working on the Raspberry Pi side.
+//
+// Pin map (single shared I2S bus):
+//   BCLK         = GPIO 32   shared mic SCK + speaker BCLK
+//   LRCLK / WS   = GPIO 33   shared mic WS  + speaker LRC
+//   Mic data in  = GPIO 34   INMP441 SD (input-only pin, perfect)
+//   Speaker DOUT = GPIO 22   MAX98357A DIN
+//
+// Mic L/R stays tied to GND → talks on the LEFT slot of the I2S frame.
+//
+// Format exchanged with the Pi on the UART:
+//   PCM signed 16-bit little-endian, mono, 16 kHz.
+//
+// Internally the bus runs at 32-bit stereo slots (INMP441 requires it).
+// readMicChunk() converts the 32-bit left slot down to S16 mono.
+// writeSpeakerChunk() expands S16 mono to 32-bit stereo frames before
+// handing them to i2s_write().
+
+#pragma once
+
+#include <Arduino.h>
+#include <stdint.h>
+#include <stddef.h>
+
+namespace tipote {
+
+class Audio {
+public:
+    static constexpr int       SAMPLE_RATE      = 16000;
+    static constexpr int       CHANNELS         = 1;
+    static constexpr int       BYTES_PER_SAMPLE = 2;  // S16
+
+    // Initialise the single full-duplex I2S port (I2S_NUM_0). Safe to call exactly once from setup().
+    bool begin();
+
+    // Pull whatever the mic DMA has ready. Writes S16 mono little-endian
+    // bytes into `out`, up to `outCapacity` bytes, and returns the number
+    // of bytes actually written (always even, possibly zero).
+    //
+    // Non-blocking (timeout = 0).
+    size_t readMicChunk(uint8_t* out, size_t outCapacity);
+
+    // Push S16 mono little-endian PCM to the speaker DMA. Blocks up to
+    // ~50 ms waiting for room. Returns bytes actually accepted.
+    size_t writeSpeakerChunk(const uint8_t* data, size_t len);
+
+    // Drop anything pending in the speaker DMA. Used on shutdown / reset.
+    void flushSpeaker();
+
+    // ─── Debug / bring-up ────────────────────────────────────────
+    //
+    // Stats updated on every readMicChunk() call, covering *this last
+    // batch only*. Handy to confirm the mic is actually clocking data
+    // into the ESP32 without blowing up the main audio path.
+    struct MicStats {
+        int32_t leftRawMin;     // raw int32 sample on left I2S slot
+        int32_t leftRawMax;
+        int32_t rightRawMin;    // raw int32 sample on right I2S slot
+        int32_t rightRawMax;
+        int16_t s16Min;         // post-shift S16 sample (output channel)
+        int16_t s16Max;
+        size_t  samples;        // sample pairs in the batch
+    };
+    const MicStats& lastMicStats() const { return lastStats_; }
+
+    // Which I2S slot to route into the S16 output. Flip at runtime if
+    // the mic's L/R pin doesn't land where we expect.
+    enum class MicChannel { Left, Right };
+    void setMicChannel(MicChannel ch) { micChannel_ = ch; }
+    MicChannel micChannel() const { return micChannel_; }
+
+private:
+    bool        micStarted_ = false;
+    bool        spkStarted_ = false;
+    MicChannel  micChannel_ = MicChannel::Left;
+    MicStats    lastStats_  = {0, 0, 0, 0, 0, 0, 0};
+};
+
+}  // namespace tipote
diff --git a/apps/robot-hardware/platformio.ini b/apps/robot-hardware/platformio.ini
index 3af807a..9e62092 100644
--- a/apps/robot-hardware/platformio.ini
+++ b/apps/robot-hardware/platformio.ini
@@ -30,6 +30,11 @@ build_flags =
     -DHW_SERIAL_BAUD=921600
     ; Idle timeout before the eyes fall back to the default animation (ms)
     -DHW_HEARTBEAT_TIMEOUT_MS=5000
+    ; Hardware UART2 pins used to talk to the Raspberry Pi.
+    ; The OLED eyes already claim GPIO 16/17 (UART2 default pins),
+    ; so Serial2 is remapped to these two free pins instead.
+    -DHW_UART_RX_PIN=27
+    -DHW_UART_TX_PIN=13
 build_unflags =
     -std=gnu++11
 
diff --git a/apps/robot-hardware/scripts/esp-play.ts b/apps/robot-hardware/scripts/esp-play.ts
new file mode 100644
index 0000000..f328d45
--- /dev/null
+++ b/apps/robot-hardware/scripts/esp-play.ts
@@ -0,0 +1,265 @@
+/**
+ * Ti-Pote — Play a PCM/WAV file on the ESP32 speaker over USB.
+ *
+ * Usage:
+ *   pnpm esp:play <file.wav|file.raw>
+ *
+ * Accepts:
+ *   - raw S16 LE mono 16 kHz PCM (.raw / .pcm)
+ *   - a WAV file already in that format (the RIFF header is stripped)
+ *   - any other extension: converted first with macOS `afconvert`
+ *
+ * Default port: auto-detected, override with ESP_PORT=/dev/cu.usbserial-XXX
+ */
+
+import { execFileSync } from 'node:child_process';
+import { existsSync, mkdtempSync, readFileSync, readdirSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join, extname } from 'node:path';
+import { SerialPort } from 'serialport';
+
+const SAMPLE_RATE = 16000;
+
+/**
+ * Resolve the serial device to open: `ESP_PORT` when set, otherwise the
+ * first `/dev` entry whose name starts with a known USB-serial prefix.
+ *
+ * @throws Error when `ESP_PORT` is unset and no matching entry exists.
+ */
+function findDefaultPort(): string {
+  const envPort = process.env.ESP_PORT;
+  if (envPort) return envPort;
+  const prefixes = ['cu.usbserial', 'cu.SLAB_', 'cu.wchusbserial'];
+  const candidates = readdirSync('/dev').filter((name) =>
+    prefixes.some((prefix) => name.startsWith(prefix)),
+  );
+  if (candidates.length === 0) {
+    throw new Error(
+      'No ESP32 serial port detected. Plug the board in, or set ESP_PORT=/dev/cu.usbserial-XXX',
+    );
+  }
+  return `/dev/${candidates[0]}`;
+}
+
+/**
+ * Return the PCM payload of a WAV buffer; anything that does not start
+ * with a RIFF/WAVE header is returned unchanged (treated as raw PCM).
+ *
+ * The payload is located by walking the RIFF chunk list up to the
+ * `data` chunk instead of assuming it starts at byte 44: a writer may
+ * place other chunks before or after it, and those bytes must not be
+ * played as audio.
+ */
+function stripWav(buf: Buffer): Buffer {
+  const tagAt = (pos: number): string => buf.toString('ascii', pos, pos + 4);
+  if (buf.length <= 44 || tagAt(0) !== 'RIFF' || tagAt(8) !== 'WAVE') {
+    return buf;
+  }
+  let pos = 12;
+  while (pos + 8 <= buf.length) {
+    const size = buf.readUInt32LE(pos + 4);
+    if (tagAt(pos) === 'data') {
+      // subarray() clamps the end, so a truncated file still plays.
+      return buf.subarray(pos + 8, pos + 8 + size);
+    }
+    pos += 8 + size + (size % 2); // chunk bodies are padded to even length
+  }
+  // No `data` chunk found: keep the previous fixed 44-byte strip.
+  return buf.subarray(44);
+}
+
+/**
+ * Convert any audio file macOS can decode (m4a, mp3, ogg, aiff, …) to
+ * S16 LE mono 16 kHz WAV using the built-in `afconvert` tool. Returns
+ * the path to a new .wav file in a temp dir which the caller is
+ * responsible for cleaning up through the returned `cleanup`. If the
+ * conversion fails, the temp dir is removed before the error is thrown.
+ */
+function convertToEsp32Wav(inputPath: string): { wavPath: string; cleanup: () => void } {
+  const dir = mkdtempSync(join(tmpdir(), 'tipote-'));
+  const cleanup = (): void => {
+    rmSync(dir, { recursive: true, force: true });
+  };
+  const wavPath = join(dir, 'converted.wav');
+  console.log(`→ converting ${inputPath} → 16 kHz mono S16LE WAV`);
+  try {
+    execFileSync(
+      'afconvert',
+      ['-f', 'WAVE', '-d', 'LEI16@16000', '-c', '1', inputPath, wavPath],
+      { stdio: 'inherit' },
+    );
+  } catch (err) {
+    cleanup();
+    throw new Error(`afconvert failed: ${(err as Error).message}`);
+  }
+  return { wavPath, cleanup };
+}
+
+/**
+ * Run one PLAY exchange on an already-open port: wait for the
+ * firmware's READY line, send `PLAY <bytes>`, stream the PCM in paced
+ * bursts, then wait for the final OK.
+ *
+ * @param durationMs - Playback length of `pcm`; sizes the OK timeout.
+ * @throws Error on an `ERR …` line from the firmware, on a port error,
+ *   or when READY / OK do not arrive in time.
+ */
+async function streamToFirmware(
+  port: SerialPort,
+  pcm: Buffer,
+  durationMs: number,
+): Promise<void> {
+  let ready = false;
+  const readyWaiters: Array<() => void> = [];
+
+  const finished = new Promise<void>((resolve, reject) => {
+    const timeout = setTimeout(
+      () => reject(new Error(`timeout waiting for OK after ${durationMs + 8000} ms`)),
+      durationMs + 8000,
+    );
+    let lineBuf = '';
+    port.on('data', (data: Buffer) => {
+      lineBuf += data.toString('utf8');
+      let idx: number;
+      while ((idx = lineBuf.indexOf('\n')) >= 0) {
+        const line = lineBuf.slice(0, idx).replace(/\r$/, '').trim();
+        lineBuf = lineBuf.slice(idx + 1);
+        if (!line) continue;
+        if (line === 'OK') {
+          clearTimeout(timeout);
+          resolve();
+          return;
+        }
+        if (line === 'READY') {
+          ready = true;
+          while (readyWaiters.length) readyWaiters.shift()!();
+          continue;
+        }
+        if (line.startsWith('ERR ')) {
+          clearTimeout(timeout);
+          reject(new Error(`firmware error: ${line.slice(4)}`));
+          return;
+        }
+        if (line.startsWith('LOG ')) console.log(`[esp] ${line.slice(4)}`);
+        else console.log(`[esp] ${line}`);
+      }
+    });
+    port.on('error', reject);
+  });
+  // Observe a rejection right away. Without a handler, an ERR line that
+  // arrives while we are still streaming is an unhandled rejection and
+  // ends the process before the caller can close the port and clean up.
+  let failure: unknown;
+  void finished.catch((err: unknown) => {
+    failure = err;
+  });
+
+  // Wait for READY so we don't send PLAY into the bootloader.
+  await new Promise<void>((resolve, reject) => {
+    if (ready) return resolve();
+    const timer = setTimeout(
+      () => reject(new Error('timeout waiting for READY from firmware')),
+      5000,
+    );
+    readyWaiters.push(() => {
+      clearTimeout(timer);
+      resolve();
+    });
+  });
+  await new Promise((r) => setTimeout(r, 50));
+
+  console.log(`→ PLAY ${pcm.length} bytes`);
+  port.write(`PLAY ${pcm.length}\n`);
+
+  // Pace the payload close to the rate the I2S side consumes it, so
+  // the ESP32 RX buffer stays roughly constant in size regardless of
+  // file length. I2S consumes 16 kHz × 2 bytes/sample = 32 KB/s of
+  // S16 mono, so a 1024-byte burst is 32 ms of audio.
+  //
+  // We pause 30 ms rather than 32 ms between bursts, i.e. feed
+  // slightly faster than playback, so the DMA never runs dry. The
+  // excess slowly fills the ~16 KB RX buffer on the firmware.
+  const CHUNK = 1024;
+  const PAUSE_MS = 30;
+  // Stop sending once the exchange has failed; `await finished` rethrows.
+  for (let off = 0; off < pcm.length && failure === undefined; off += CHUNK) {
+    const slice = pcm.subarray(off, off + CHUNK);
+    await new Promise<void>((resolve, reject) => {
+      port.write(slice, (err) => (err ? reject(err) : resolve()));
+    });
+    await new Promise<void>((resolve) => port.drain(() => resolve()));
+    if (off + CHUNK < pcm.length) {
+      await new Promise((r) => setTimeout(r, PAUSE_MS));
+    }
+  }
+
+  await finished;
+}
+
+/**
+ * Entry point: load the file named on the command line (converting it
+ * first unless it is .wav/.raw/.pcm), open the serial port and play
+ * it. The port is closed and any conversion temp directory is removed
+ * whether playback succeeds or throws.
+ */
+async function main(): Promise<void> {
+  const inPath = process.argv[2];
+  if (!inPath) {
+    console.error('Usage: esp-play.ts <file>  (wav, raw, m4a, mp3, …)');
+    process.exit(1);
+  }
+  if (!existsSync(inPath)) {
+    throw new Error(`file not found: ${inPath}`);
+  }
+
+  // Convert anything that isn't already a .wav or raw PCM blob. This
+  // covers m4a / mp3 / ogg / aiff / opus / flac via the built-in
+  // macOS `afconvert` tool.
+  const ext = extname(inPath).toLowerCase();
+  const needsConversion = ext !== '.wav' && ext !== '.raw' && ext !== '.pcm';
+
+  let cleanup: () => void = () => {};
+  try {
+    let loadPath = inPath;
+    if (needsConversion) {
+      const converted = convertToEsp32Wav(inPath);
+      loadPath = converted.wavPath;
+      cleanup = converted.cleanup;
+    }
+
+    const pcm = stripWav(readFileSync(loadPath));
+    const samples = pcm.length / 2;
+    const durationMs = (samples / SAMPLE_RATE) * 1000;
+    console.log(
+      `→ loaded ${loadPath}: ${pcm.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)`,
+    );
+    if (pcm.length === 0) {
+      throw new Error('empty PCM buffer');
+    }
+    if (pcm.length % 2 !== 0) {
+      throw new Error(
+        'PCM size must be a multiple of 2 (S16 mono). The source file is probably not 16-bit or not mono. If you passed a raw file, convert it first.',
+      );
+    }
+
+    const path = findDefaultPort();
+    console.log(`→ opening ${path} @ 921600 baud`);
+    const port = new SerialPort({ path, baudRate: 921600, autoOpen: false });
+    await new Promise<void>((resolve, reject) => {
+      port.open((err) => (err ? reject(err) : resolve()));
+    });
+    try {
+      await streamToFirmware(port, pcm, durationMs);
+    } finally {
+      await new Promise<void>((resolve) => port.close(() => resolve()));
+    }
+  } finally {
+    cleanup();
+  }
+  console.log('✅ playback done');
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/apps/robot-hardware/scripts/esp-record.ts b/apps/robot-hardware/scripts/esp-record.ts
new file mode 100644
index 0000000..4a54218
--- /dev/null
+++ b/apps/robot-hardware/scripts/esp-record.ts
@@ -0,0 +1,218 @@
+/**
+ * Ti-Pote — Record audio from the ESP32 over USB.
+ *
+ * Usage:
+ *   pnpm --filter @ti-pote/robot-client exec tsx \
+ *     ../robot-hardware/scripts/esp-record.ts <file.wav> [duration_ms]
+ *
+ * Or with the shortcut from robot-hardware:
+ *   pnpm esp:record out.wav 3000
+ *
+ * Defaults:
+ *   duration_ms = 3000
+ *   port        = auto-detected (first /dev/cu.usbserial-* or /dev/cu.SLAB_*)
+ *                 can be overridden with ESP_PORT=/dev/cu.usbserial-XXX
+ */
+
+import { readdirSync, writeFileSync } from 'node:fs';
+import { SerialPort } from 'serialport';
+
+const SAMPLE_RATE = 16000;
+const BYTES_PER_SAMPLE = 2;
+
+/**
+ * Resolve the serial device to open: `ESP_PORT` when set, otherwise the
+ * first `/dev` entry whose name starts with a known USB-serial prefix.
+ *
+ * @throws Error when `ESP_PORT` is unset and no matching entry exists.
+ */
+function findDefaultPort(): string {
+  const envPort = process.env.ESP_PORT;
+  if (envPort) return envPort;
+  const prefixes = ['cu.usbserial', 'cu.SLAB_', 'cu.wchusbserial'];
+  const candidates = readdirSync('/dev').filter((name) =>
+    prefixes.some((prefix) => name.startsWith(prefix)),
+  );
+  if (candidates.length === 0) {
+    throw new Error(
+      'No ESP32 serial port detected. Plug the board in, or set ESP_PORT=/dev/cu.usbserial-XXX',
+    );
+  }
+  return `/dev/${candidates[0]}`;
+}
+
+/**
+ * Build the canonical 44-byte RIFF/WAVE header for 16-bit mono PCM.
+ *
+ * @param pcmBytes - Size in bytes of the PCM payload that will follow.
+ * @param sampleRate - Sample rate in Hz.
+ */
+function wavHeader(pcmBytes: number, sampleRate: number): Buffer {
+  const header = Buffer.alloc(44);
+  header.write('RIFF', 0);
+  header.writeUInt32LE(36 + pcmBytes, 4);
+  header.write('WAVE', 8);
+  header.write('fmt ', 12);
+  header.writeUInt32LE(16, 16); // fmt chunk size
+  header.writeUInt16LE(1, 20); // PCM
+  header.writeUInt16LE(1, 22); // mono
+  header.writeUInt32LE(sampleRate, 24);
+  header.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate
+  header.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
+  header.writeUInt16LE(8 * BYTES_PER_SAMPLE, 34); // bits per sample
+  header.write('data', 36);
+  header.writeUInt32LE(pcmBytes, 40);
+  return header;
+}
+
+/**
+ * Entry point: ask the firmware to record `duration_ms` of audio, collect
+ * the raw PCM it streams back, print a quick RMS figure and write the
+ * result to the output path (with a WAV header when it ends in `.wav`).
+ */
+async function main(): Promise<void> {
+  const outPath = process.argv[2];
+  const durationMs = parseInt(process.argv[3] ?? '3000', 10);
+
+  if (!outPath) {
+    console.error('Usage: esp-record.ts <file.wav> [duration_ms]');
+    process.exit(1);
+  }
+  // parseInt yields NaN for non-numeric input; reject it here instead of
+  // letting NaN reach the timers below and the REC command.
+  if (!Number.isFinite(durationMs) || durationMs <= 0) {
+    throw new Error(`duration_ms must be a positive integer, got "${process.argv[3]}"`);
+  }
+
+  const path = findDefaultPort();
+  console.log(`→ opening ${path} @ 921600 baud`);
+
+  const port = new SerialPort({ path, baudRate: 921600, autoOpen: false });
+
+  await new Promise<void>((resolve, reject) => {
+    port.open((err) => (err ? reject(err) : resolve()));
+  });
+
+  // ── simple line-based state machine for stdout text ───────────
+  let phase: 'idle' | 'streaming' = 'idle';
+  let remaining = 0;
+  const chunks: Buffer[] = [];
+  let lineBuf = '';
+  let ready = false;
+  const readyWaiters: Array<() => void> = [];
+
+  const finished = new Promise<Buffer>((resolve, reject) => {
+    const timeout = setTimeout(
+      () => reject(new Error(`timeout waiting for audio after ${durationMs + 5000} ms`)),
+      durationMs + 5000,
+    );
+
+    port.on('data', (data: Buffer) => {
+      let offset = 0;
+      while (offset < data.length) {
+        if (phase === 'streaming') {
+          const take = Math.min(remaining, data.length - offset);
+          chunks.push(data.subarray(offset, offset + take));
+          offset += take;
+          remaining -= take;
+          if (remaining === 0) {
+            phase = 'idle';
+            lineBuf = '';
+          }
+          continue;
+        }
+
+        // text mode: accumulate until newline
+        const nl = data.indexOf(0x0a, offset);
+        if (nl === -1) {
+          lineBuf += data.subarray(offset).toString('utf8');
+          break;
+        }
+        lineBuf += data.subarray(offset, nl).toString('utf8');
+        offset = nl + 1;
+        const line = lineBuf.replace(/\r$/, '').trim();
+        lineBuf = '';
+        if (!line) continue;
+
+        if (line.startsWith('BEGIN ')) {
+          const announced = parseInt(line.slice(6), 10);
+          if (!Number.isFinite(announced) || announced < 0) {
+            // A NaN byte count would stall the streaming branch above.
+            clearTimeout(timeout);
+            reject(new Error(`malformed BEGIN line from firmware: "${line}"`));
+            return;
+          }
+          remaining = announced;
+          phase = 'streaming';
+          console.log(`→ BEGIN ${remaining} bytes`);
+        } else if (line === 'END') {
+          clearTimeout(timeout);
+          const pcm = Buffer.concat(chunks);
+          resolve(pcm);
+        } else if (line === 'READY') {
+          ready = true;
+          while (readyWaiters.length) readyWaiters.shift()!();
+        } else if (line.startsWith('LOG ')) {
+          console.log(`[esp] ${line.slice(4)}`);
+        } else if (line.startsWith('ERR ')) {
+          clearTimeout(timeout);
+          reject(new Error(`firmware error: ${line.slice(4)}`));
+        } else {
+          console.log(`[esp] ${line}`);
+        }
+      }
+    });
+
+    port.on('error', reject);
+  });
+
+  // The ESP32 resets on port open (DTR/RTS). Wait until it prints
+  // READY so we don't send commands into the bootloader.
+  await new Promise<void>((resolve, reject) => {
+    if (ready) return resolve();
+    const timer = setTimeout(
+      () => reject(new Error('timeout waiting for READY from firmware')),
+      5000,
+    );
+    readyWaiters.push(() => {
+      clearTimeout(timer);
+      resolve();
+    });
+  });
+  await new Promise((r) => setTimeout(r, 50));
+
+  console.log(`→ REC ${durationMs} ms — speak now!`);
+  port.write(`REC ${durationMs}\n`);
+
+  const pcm = await finished;
+
+  await new Promise<void>((resolve) => port.close(() => resolve()));
+
+  // Basic RMS sanity check.
+  let sumSq = 0;
+  const samples = pcm.length / BYTES_PER_SAMPLE;
+  for (let i = 0; i < pcm.length - 1; i += 2) {
+    const s = pcm.readInt16LE(i);
+    sumSq += s * s;
+  }
+  // An empty capture has no samples; report 0 rather than NaN (0 / 0).
+  const rms = samples > 0 ? Math.sqrt(sumSq / samples) : 0;
+  console.log(
+    `✅ captured ${pcm.length} bytes (${samples} samples, ${(
+      (samples / SAMPLE_RATE) *
+      1000
+    ).toFixed(0)} ms)   RMS=${rms.toFixed(0)}`,
+  );
+
+  if (outPath.toLowerCase().endsWith('.wav')) {
+    writeFileSync(outPath, Buffer.concat([wavHeader(pcm.length, SAMPLE_RATE), pcm]));
+  } else {
+    writeFileSync(outPath, pcm);
+  }
+  console.log(`→ wrote ${outPath}`);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/apps/robot-hardware/src/main.cpp b/apps/robot-hardware/src/main.cpp
index c347343..c7cdb9e 100644
--- a/apps/robot-hardware/src/main.cpp
+++ b/apps/robot-hardware/src/main.cpp
@@ -1,147 +1,281 @@
-// Ti-Pote — Robot Hardware firmware (ESP32)
+// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32)
 //
-// Responsibilities for v0:
-//   - Listen on UART0 (the USB-connected serial port while the ESP32
-//     is plugged into Arthur's laptop; on the real robot this will
-//     eventually be Serial2 wired to the Raspberry Pi).
-//   - Decode incoming binary frames (see include/protocol_types.h).
-//   - Dispatch commands to the Eyes renderer.
-//   - Reply to PING with PONG.
-//   - Fall back to a sleepy animation if no heartbeat is received
-//     for HW_HEARTBEAT_TIMEOUT_MS (set in platformio.ini).
+// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end
+// with nothing else in the loop — no Pi, no OLED, no protocol frames.
+// The ESP32 is plugged into a computer via USB and the host runs
+// two tiny scripts:
 //
-// Intentionally NOT yet implemented (Phase 2):
-//   - I2S audio up/down streaming
-//   - Servo / LED commands
+//   scripts/esp-record.ts <file.wav> [duration_ms]
+//   scripts/esp-play.ts   <file>
 //
-// The hook points for those are marked with TODO(phase2).
+// Protocol over USB Serial (921600 baud, line-based for commands,
+// raw bytes for audio):
+//
+//   host → esp32
+//     "PING\n"              ping
+//     "REC <ms>\n"           start recording for <ms> milliseconds
+//     "PLAY <bytes>\n"       next <bytes> bytes on the wire are raw
+//                            S16 LE mono 16 kHz PCM, play them
+//
+//   esp32 → host
+//     "READY\n"              once at boot
+//     "PONG\n"               reply to PING
+//     "LOG <text>\n"         human-readable log line
+//     "ERR <text>\n"         error message
+//     "BEGIN <bytes>\n"      start of a REC response
+//     "<raw bytes>"          raw PCM (S16 LE mono 16 kHz)
+//     "END\n"                end of a REC response
+//     "OK\n"                 command completed
+//
+// Wiring (shared I2S bus on I2S_NUM_0):
+//   BCLK  = GPIO 32   (mic SCK + speaker BCLK)
+//   LRCLK = GPIO 33   (mic WS  + speaker LRC)
+//   MIC   = GPIO 34   (INMP441 SD → ESP32 data-in, input-only pin)
+//   SPK   = GPIO 22   (ESP32 data-out → MAX98357A DIN)
 
 #include <Arduino.h>
-#include "Protocol.h"
-#include "Eyes.h"
+#include <driver/i2s.h>
+#include <string.h>
 
-#ifndef HW_SERIAL_BAUD
-#define HW_SERIAL_BAUD 921600
-#endif
+// ──────────────────────────────────────────────────────────
+// Audio config
+// ──────────────────────────────────────────────────────────
 
-#ifndef HW_HEARTBEAT_TIMEOUT_MS
-#define HW_HEARTBEAT_TIMEOUT_MS 5000
-#endif
+static constexpr int SAMPLE_RATE      = 16000;
+static constexpr int PIN_BCLK         = 32;
+static constexpr int PIN_LRCLK        = 33;
+static constexpr int PIN_MIC_DIN      = 34;
+static constexpr int PIN_SPK_DOUT     = 22;
 
-// The communication stream. When the ESP32 is plugged into a
-// computer, UART0 (Serial) is the USB-CDC port, which is exactly
-// what the robot-client will talk to during development. Later,
-// for the Pi wiring, change this to Serial2 and call
-// `Serial2.begin(HW_SERIAL_BAUD, SERIAL_8N1, RX_PIN, TX_PIN)`.
-#define HW_COMM Serial
+static constexpr int DMA_COUNT        = 4;
+static constexpr int DMA_LEN          = 256;
 
-using namespace tipote;
+// Staging buffers — keep them outside of functions so we don't eat
+// stack on every tick.
+static constexpr size_t OUT_S16_SAMPLES = 320;  // 20 ms of S16 mono
+static int32_t g_rawStereo[OUT_S16_SAMPLES * 2];
+static int16_t g_micMono  [OUT_S16_SAMPLES];
+static int32_t g_spkStereo[OUT_S16_SAMPLES * 2];
+static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2];  // 640 bytes of S16 mono
 
-static Eyes         eyes;
-static FrameDecoder decoder;
+// ──────────────────────────────────────────────────────────
+// Line buffer for incoming text commands.
+// ──────────────────────────────────────────────────────────
 
-static uint32_t     lastHeartbeatMs = 0;
-static bool         idleMode        = false;
+static char     g_line[64];
+static size_t   g_lineLen = 0;
 
-// Forward decl
-static void handleFrame(const Frame& frame, void* userData);
-static void logLine(const char* line);
+static void sendLog(const char* msg) {
+    Serial.print("LOG ");
+    Serial.println(msg);
+}
+
+static void sendErr(const char* msg) {
+    Serial.print("ERR ");
+    Serial.println(msg);
+}
+
+// ──────────────────────────────────────────────────────────
+// I2S init — single port, full duplex, shared BCLK/WS.
+// ──────────────────────────────────────────────────────────
+
+static bool audioBegin() {
+    i2s_config_t cfg = {};
+    cfg.mode                 = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
+                                                       I2S_MODE_RX |
+                                                       I2S_MODE_TX);
+    cfg.sample_rate          = SAMPLE_RATE;
+    cfg.bits_per_sample      = I2S_BITS_PER_SAMPLE_32BIT;
+    cfg.channel_format       = I2S_CHANNEL_FMT_RIGHT_LEFT;
+    cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
+    cfg.intr_alloc_flags     = ESP_INTR_FLAG_LEVEL1;
+    cfg.dma_buf_count        = DMA_COUNT;
+    cfg.dma_buf_len          = DMA_LEN;
+    cfg.use_apll             = false;
+    cfg.tx_desc_auto_clear   = true;
+    cfg.fixed_mclk           = 0;
+
+    if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false;
+
+    i2s_pin_config_t pins = {};
+    pins.bck_io_num   = PIN_BCLK;
+    pins.ws_io_num    = PIN_LRCLK;
+    pins.data_out_num = PIN_SPK_DOUT;
+    pins.data_in_num  = PIN_MIC_DIN;
+    if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
+        i2s_driver_uninstall(I2S_NUM_0);
+        return false;
+    }
+    i2s_zero_dma_buffer(I2S_NUM_0);
+    return true;
+}
+
+// Convert one batch of stereo 32-bit mic samples to S16 mono by
+// taking the left slot and shifting the 24-bit-aligned data down.
+// Returns the number of S16 samples written into `out`.
+static size_t micReadMono(int16_t* out, size_t maxSamples) {
+    size_t wantPairs = maxSamples;
+    if (wantPairs > OUT_S16_SAMPLES) wantPairs = OUT_S16_SAMPLES;
+
+    size_t bytesRead = 0;
+    const esp_err_t err = i2s_read(
+        I2S_NUM_0,
+        g_rawStereo,
+        wantPairs * 2 * sizeof(int32_t),
+        &bytesRead,
+        portMAX_DELAY  // block — we're in a dedicated REC loop
+    );
+    if (err != ESP_OK || bytesRead == 0) return 0;
+
+    const size_t pairs = bytesRead / (2 * sizeof(int32_t));
+    for (size_t i = 0; i < pairs; ++i) {
+        int32_t L = g_rawStereo[2 * i];
+        int32_t s = L >> 14;
+        if (s >  INT16_MAX) s =  INT16_MAX;
+        if (s <  INT16_MIN) s =  INT16_MIN;
+        out[i] = static_cast<int16_t>(s);
+    }
+    return pairs;
+}
+
+// Write one batch of S16 mono PCM to the speaker by duplicating each
+// sample into both stereo slots and shifting into the high half of
+// the 32-bit word (what the MAX98357A expects on a shared bus).
+static void spkWriteMono(const int16_t* samples, size_t count) {
+    if (count == 0) return;
+    if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES;
+    for (size_t i = 0; i < count; ++i) {
+        const int32_t s32 = static_cast<int32_t>(samples[i]) << 16;
+        g_spkStereo[2 * i]     = s32;
+        g_spkStereo[2 * i + 1] = s32;
+    }
+    size_t bytesWritten = 0;
+    i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t),
+              &bytesWritten, portMAX_DELAY);
+}
+
+// ──────────────────────────────────────────────────────────
+// Command handlers
+// ──────────────────────────────────────────────────────────
+
+static void handleRec(uint32_t durationMs) {
+    const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000;
+    const uint32_t totalBytes   = totalSamples * sizeof(int16_t);
+    // Announce the exact payload size before any raw byte goes out.
+    Serial.print("BEGIN ");
+    Serial.println(totalBytes);
+
+    // Flush whatever old noise is in the mic DMA first.
+    i2s_zero_dma_buffer(I2S_NUM_0);
+    // NOTE(review): micReadMono() returns 0 on an i2s_read error, so a persistent error would spin this loop forever.
+    uint32_t sent = 0;
+    while (sent < totalSamples) {
+        size_t want = totalSamples - sent;
+        if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES;
+        const size_t got = micReadMono(g_micMono, want);
+        if (got == 0) continue;
+        Serial.write(reinterpret_cast<const uint8_t*>(g_micMono),
+                     got * sizeof(int16_t));
+        sent += got;
+    }
+    // Empty line after the raw bytes, then the END marker.
+    Serial.println();
+    Serial.println("END");
+}
+
+static void handlePlay(uint32_t totalBytes) {
+    // Plays `totalBytes` of S16 LE mono PCM read from Serial, then
+    // replies "OK" (or "ERR PLAY read timeout" if the host stalls).
+    i2s_zero_dma_buffer(I2S_NUM_0);  // drop stale DMA data so we don't start with a pop
+
+    // Give Serial.readBytes a generous timeout so a jittery host
+    // doesn't abort us mid-playback.
+    Serial.setTimeout(2000);
+
+    uint32_t remaining = totalBytes;
+    while (remaining > 0) {
+        // Cap at what is left: reading past it would underflow `remaining`.
+        size_t want = remaining;
+        if (want > sizeof(g_spkInBuf)) want = sizeof(g_spkInBuf);
+
+        size_t got = Serial.readBytes(g_spkInBuf, want);
+        // A timeout can split a sample: fetch its second byte so the
+        // rest of the stream stays 16-bit aligned.
+        if ((got & 1) && got < want) got += Serial.readBytes(g_spkInBuf + got, 1);
+        if (got == 0) {
+            sendErr("PLAY read timeout");
+            return;
+        }
+        // An odd trailing byte (odd totalBytes) is consumed, not played.
+        spkWriteMono(reinterpret_cast<const int16_t*>(g_spkInBuf), got / sizeof(int16_t));
+        remaining -= got;
+    }
+
+    delay(50);  // let the last frames actually reach the speaker, then clear
+    i2s_zero_dma_buffer(I2S_NUM_0);
+    Serial.println("OK");
+}
+
+// Parse a base-10 integer that occupies all of `text`. Leading whitespace
+// and trailing spaces/tabs are tolerated. Returns false, leaving *out
+// untouched, when there are no digits or other characters follow the
+// number. Out-of-range input is clamped by strtol to LONG_MIN/LONG_MAX,
+// which the callers' range checks then reject.
+static bool parseWholeLong(const char* text, long* out) {
+    char* end = nullptr;
+    const long value = strtol(text, &end, 10);
+    if (end == text) return false;
+    while (*end == ' ' || *end == '\t') ++end;
+    if (*end != '\0') return false;
+    *out = value;
+    return true;
+}
+
+// Dispatch one command line received from the host (line terminator
+// already stripped).
+//
+//   PING          -> replies "PONG"
+//   REC <ms>      -> records <ms> milliseconds (1..60000) via handleRec
+//   PLAY <bytes>  -> plays <bytes> bytes (1..16 MiB) via handlePlay
+//
+// Anything else, or an argument that is not a clean in-range integer,
+// is answered through sendErr.
+static void handleLine(const char* line) {
+    if (strcmp(line, "PING") == 0) {
+        Serial.println("PONG");
+        return;
+    }
+    if (strncmp(line, "REC ", 4) == 0) {
+        long ms = 0;
+        if (!parseWholeLong(line + 4, &ms) || ms <= 0 || ms > 60000) {
+            sendErr("REC bad duration");
+            return;
+        }
+        handleRec(static_cast<uint32_t>(ms));
+        return;
+    }
+    if (strncmp(line, "PLAY ", 5) == 0) {
+        long bytes = 0;
+        if (!parseWholeLong(line + 5, &bytes) ||
+            bytes <= 0 || bytes > 16 * 1024 * 1024) {
+            sendErr("PLAY bad size");
+            return;
+        }
+        handlePlay(static_cast<uint32_t>(bytes));
+        return;
+    }
+    sendErr("unknown command");
+}
+
+// ──────────────────────────────────────────────────────────
+// Arduino entry points
+// ──────────────────────────────────────────────────────────
 
 void setup() {
-    HW_COMM.begin(HW_SERIAL_BAUD);
-    // Give the host a beat to open the port after auto-reset.
+    // Bump the UART RX buffer WAY above the 256-byte default (set here,
+    // before Serial.begin()) so bursts of PLAY payload are buffered
+    // rather than lost if the host floods us. The buffer is 16 KiB, so
+    // larger payloads still depend on the PLAY handler draining it.
+    Serial.setRxBufferSize(16 * 1024);
+    Serial.begin(921600);
     delay(50);
 
-    eyes.begin();
+    // Bring up the audio hardware and report the outcome to the host.
+    // A failure is reported but does not stop start-up.
+    if (!audioBegin()) {
+        sendErr("I2S init failed");
+    } else {
+        sendLog("I2S ready");
+    }
 
-    decoder.onFrame(handleFrame);
-
-    lastHeartbeatMs = millis();
-    logLine("robot-hardware ready");
+    // Printed unconditionally as the last step of start-up, i.e. also
+    // when the I2S init above failed.
+    Serial.println("READY");
 }
 
 void loop() {
-    // Drain whatever the host has sent since the last tick.
-    while (HW_COMM.available() > 0) {
-        int b = HW_COMM.read();
-        if (b < 0) break;
-        decoder.feed(static_cast<uint8_t>(b));
-    }
-
-    // Heartbeat watchdog: if we haven't heard from the host in a
-    // while, slip into a sleepy animation so the robot doesn't
-    // look frozen. Any incoming frame resets this.
-    const uint32_t now = millis();
-    if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) {
-        idleMode = true;
-        eyes.show(Emotion::SLEEPY);
+    // Assemble newline-terminated command lines from the serial port and
+    // hand each complete, non-empty line to handleLine(). '\r' is ignored
+    // so both "\n" and "\r\n" line endings work.
+    //
+    // A line that does not fit in g_line is reported once with
+    // "line overflow" and then skipped up to its terminating newline, so
+    // its tail is never mistaken for a new command.
+    static bool discarding = false;
+
+    while (Serial.available() > 0) {
+        const int c = Serial.read();
+        if (c < 0) break;
+        if (c == '\r') continue;
+        if (c == '\n') {
+            if (discarding) {
+                // End of the over-long line: resume normal parsing.
+                discarding = false;
+            } else if (g_lineLen > 0) {
+                g_line[g_lineLen] = 0;
+                handleLine(g_line);
+            }
+            g_lineLen = 0;
+            continue;
+        }
+        if (discarding) continue;
+        if (g_lineLen < sizeof(g_line) - 1) {
+            g_line[g_lineLen++] = static_cast<char>(c);
+        } else {
+            g_lineLen = 0;
+            discarding = true;
+            sendErr("line overflow");
+        }
     }
 }
-
-// ---------------------------------------------------------------
-// Frame dispatcher
-// ---------------------------------------------------------------
-
-static void handleFrame(const Frame& frame, void* /*userData*/) {
-    lastHeartbeatMs = millis();
-    if (idleMode) {
-        idleMode = false;
-    }
-
-    switch (frame.type) {
-        case MsgType::DISPLAY_EMOTION: {
-            if (frame.length < 1) {
-                logLine("DISPLAY_EMOTION: empty payload");
-                return;
-            }
-            const uint8_t code = frame.payload[0];
-            if (code >= static_cast<uint8_t>(Emotion::COUNT)) {
-                logLine("DISPLAY_EMOTION: out-of-range code");
-                return;
-            }
-            eyes.show(static_cast<Emotion>(code));
-
-            // ACK back so the host knows it was applied.
-            uint8_t ackPayload[1] = {code};
-            FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1);
-            return;
-        }
-
-        case MsgType::DISPLAY_CLEAR: {
-            eyes.clear();
-            FrameEncoder::writeTo(HW_COMM, MsgType::ACK);
-            return;
-        }
-
-        case MsgType::PING: {
-            // Echo the payload back as PONG. Useful for latency
-            // measurements and proving the link is symmetric.
-            FrameEncoder::writeTo(HW_COMM, MsgType::PONG,
-                                  frame.payload, frame.length);
-            return;
-        }
-
-        case MsgType::STATUS: {
-            // Heartbeat from host — lastHeartbeatMs was already
-            // bumped above. Nothing else to do for v0.
-            return;
-        }
-
-        // TODO(phase2): AUDIO_UP / AUDIO_DOWN / SERVO_CMD / LED_CMD
-        default:
-            logLine("unknown frame type");
-            return;
-    }
-}
-
-// ---------------------------------------------------------------
-// Diagnostic logging — wraps text in a LOG frame so the host
-// can parse it without getting confused by free text on the wire.
-// ---------------------------------------------------------------
-
-static void logLine(const char* line) {
-    const size_t len = strnlen(line, MAX_PAYLOAD_SIZE);
-    FrameEncoder::writeTo(HW_COMM, MsgType::LOG,
-                          reinterpret_cast<const uint8_t*>(line),
-                          static_cast<uint16_t>(len));
-}