From c19d9a7cf40f729b61ffb91c4412d785ffe910be Mon Sep 17 00:00:00 2001 From: ordinarthur Date: Thu, 9 Apr 2026 02:47:53 +0200 Subject: [PATCH] ok script esp --- apps/robot-client/package.json | 6 +- apps/robot-client/scripts/audio-beep.ts | 99 +++++ apps/robot-client/scripts/audio-loopback.ts | 171 ++++++++ apps/robot-client/scripts/wake_word.py | 408 +++++++++++------- .../src/config/hardware.config.ts | 23 +- .../src/hardware/hardware.service.ts | 48 +++ apps/robot-client/src/main.ts | 36 +- .../src/services/audio.service.ts | 262 +++++++++-- apps/robot-client/src/services/index.ts | 8 +- .../src/services/wake-word.service.ts | 149 +++++-- apps/robot-hardware/lib/Audio/library.json | 7 + apps/robot-hardware/lib/Audio/src/Audio.cpp | 151 +++++++ apps/robot-hardware/lib/Audio/src/Audio.h | 84 ++++ apps/robot-hardware/platformio.ini | 5 + apps/robot-hardware/scripts/esp-play.ts | 219 ++++++++++ apps/robot-hardware/scripts/esp-record.ts | 190 ++++++++ apps/robot-hardware/src/main.cpp | 384 +++++++++++------ 17 files changed, 1860 insertions(+), 390 deletions(-) create mode 100644 apps/robot-client/scripts/audio-beep.ts create mode 100644 apps/robot-client/scripts/audio-loopback.ts create mode 100644 apps/robot-hardware/lib/Audio/library.json create mode 100644 apps/robot-hardware/lib/Audio/src/Audio.cpp create mode 100644 apps/robot-hardware/lib/Audio/src/Audio.h create mode 100644 apps/robot-hardware/scripts/esp-play.ts create mode 100644 apps/robot-hardware/scripts/esp-record.ts diff --git a/apps/robot-client/package.json b/apps/robot-client/package.json index 19d467c..2f064c1 100644 --- a/apps/robot-client/package.json +++ b/apps/robot-client/package.json @@ -12,7 +12,11 @@ "format": "prettier --write \"src/**/*.ts\"", "test": "vitest run", "test:watch": "vitest", - "hw:demo": "tsx scripts/hardware-demo.ts" + "hw:demo": "pnpm exec tsx scripts/hardware-demo.ts", + "audio:loopback": "pnpm exec tsx scripts/audio-loopback.ts", + "audio:beep": "pnpm exec tsx 
scripts/audio-beep.ts", + "esp:record": "pnpm exec tsx ../robot-hardware/scripts/esp-record.ts", + "esp:play": "pnpm exec tsx ../robot-hardware/scripts/esp-play.ts" }, "dependencies": { "socket.io-client": "^4.8.3", diff --git a/apps/robot-client/scripts/audio-beep.ts b/apps/robot-client/scripts/audio-beep.ts new file mode 100644 index 0000000..c608dd2 --- /dev/null +++ b/apps/robot-client/scripts/audio-beep.ts @@ -0,0 +1,99 @@ +/** + * Ti-Pote — Pure tone speaker test. + * + * Generates a 440 Hz sine wave at ~70% of full scale and streams it + * to the ESP32 speaker via AUDIO_DOWN frames, then a second beep at + * 880 Hz. Completely independent of the microphone — if this does + * not produce audible sound, the problem is downstream of the ESP32 + * on the speaker path (MAX98357A wiring, SD pin, VIN, speaker leads). + * + * Run with: + * HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:beep + * + * Optional env: + * BEEP_MS — length of each beep in ms (default 1500) + * BEEP_FREQ — primary frequency in Hz (default 440) + * BEEP_AMP — amplitude 0.0..1.0 (default 0.7) + */ + +import { HardwareService, Emotion } from '../src/hardware/index.js'; +import { Esp32AudioService } from '../src/services/audio.service.js'; + +const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0'; +const baudRate = parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10); +const beepMs = parseInt(process.env.BEEP_MS ?? '1500', 10); +const beepFreq = parseInt(process.env.BEEP_FREQ ?? '440', 10); +const beepAmp = parseFloat(process.env.BEEP_AMP ?? '0.7'); + +const SAMPLE_RATE = 16000; + +function generateSine(freqHz: number, durationMs: number, amplitude: number): Buffer { + const sampleCount = Math.floor((SAMPLE_RATE * durationMs) / 1000); + const buf = Buffer.alloc(sampleCount * 2); + const amp = Math.max(0, Math.min(1, amplitude)) * 32767; + const twoPiF = (2 * Math.PI * freqHz) / SAMPLE_RATE; + // 5 ms linear attack/release so the speaker doesn't click. 
+ const rampSamples = Math.floor((SAMPLE_RATE * 5) / 1000); + for (let i = 0; i < sampleCount; i++) { + let env = 1; + if (i < rampSamples) env = i / rampSamples; + else if (i > sampleCount - rampSamples) env = (sampleCount - i) / rampSamples; + const s = Math.round(Math.sin(i * twoPiF) * amp * env); + buf.writeInt16LE(Math.max(-32768, Math.min(32767, s)), i * 2); + } + return buf; +} + +async function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +async function main(): Promise { + const hw = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 }); + hw.on('log', (line) => console.log(`[firmware] ${line}`)); + hw.on('error', (err) => console.error(`[firmware error] ${err.message}`)); + + console.log(`→ opening ${path} @ ${baudRate} baud`); + await hw.connect(); + + try { + const rtt = await hw.ping(Buffer.from('beep')); + console.log(`→ ping round-trip: ${rtt.toFixed(1)} ms`); + + const audio = new Esp32AudioService( + { + backend: 'esp32', + captureDevice: 'default', + playbackDevice: 'default', + sampleRate: SAMPLE_RATE, + bitDepth: 16, + channels: 1, + chunkDurationMs: 20, + }, + hw, + ); + + hw.sendEmotion(Emotion.HAPPY); + + console.log(`🔊 Beep 1: ${beepFreq} Hz · ${beepMs} ms · amp=${beepAmp}`); + const tone1 = generateSine(beepFreq, beepMs, beepAmp); + await audio.play(tone1); + + await sleep(400); + + console.log(`🔊 Beep 2: ${beepFreq * 2} Hz · ${beepMs} ms · amp=${beepAmp}`); + const tone2 = generateSine(beepFreq * 2, beepMs, beepAmp); + await audio.play(tone2); + + console.log('✅ done — did you hear two beeps?'); + } finally { + hw.sendEmotion(Emotion.NEUTRAL); + await sleep(200); + await hw.disconnect(); + } +} + +main().catch((err) => { + console.error('beep failed:', err); + process.exit(1); +}); diff --git a/apps/robot-client/scripts/audio-loopback.ts b/apps/robot-client/scripts/audio-loopback.ts new file mode 100644 index 0000000..816bb66 --- /dev/null +++ b/apps/robot-client/scripts/audio-loopback.ts 
@@ -0,0 +1,171 @@ +/** + * Ti-Pote — End-to-end audio loopback test. + * + * What it proves: the whole Pi ↔ ESP32 ↔ mic/speaker chain works, + * without bringing the cloud/wake-word/orchestrator into the picture. + * + * What it does: + * 1. Opens the serial link to the ESP32. + * 2. Captures `CAPTURE_MS` (default 5000) of mic audio via + * AUDIO_UP frames into a single in-memory buffer. + * 3. Pauses briefly. + * 4. Streams that buffer back to the ESP32 as AUDIO_DOWN frames + * and waits for the speaker to finish playing. + * + * Expected result: you say "allô allô" during step 2 and hear your + * own voice played back on the robot's speaker a moment later. + * + * Run with: + * HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:loopback + * + * Optional env: + * CAPTURE_MS — capture duration in ms (default 5000) + * HARDWARE_SERIAL_PORT / HARDWARE_SERIAL_BAUD + */ + +import { writeFileSync } from 'node:fs'; +import { HardwareService, Emotion } from '../src/hardware/index.js'; +import { Esp32AudioService } from '../src/services/audio.service.js'; + +const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0'; +const baudRate = parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10); +const captureMs = parseInt(process.env.CAPTURE_MS ?? '5000', 10); +const debug = !!process.env.DEBUG; +const dumpPath = process.env.DUMP_PATH ?? 
'/tmp/tipote-capture.raw'; +const skipPlayback = !!process.env.SKIP_PLAYBACK; + +const SAMPLE_RATE = 16000; +const BYTES_PER_SAMPLE = 2; + +let debugFramesSeen = 0; + +async function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +async function main(): Promise { + const hw = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 }); + hw.on('log', (line) => console.log(`[firmware] ${line}`)); + hw.on('error', (err) => console.error(`[firmware error] ${err.message}`)); + if (debug) { + hw.on('audio_up', (chunk) => { + // Print first 8 int16 samples of the first few frames + // so we can see whether the wire carries zeros or real data. + if (debugFramesSeen < 3) { + const head: number[] = []; + for (let i = 0; i < Math.min(chunk.length, 16); i += 2) { + head.push(chunk.readInt16LE(i)); + } + console.log(`[debug] frame ${debugFramesSeen} len=${chunk.length} head=${head.join(',')}`); + debugFramesSeen++; + } + }); + } + + console.log(`→ opening ${path} @ ${baudRate} baud`); + await hw.connect(); + + try { + const rtt = await hw.ping(Buffer.from('loopback')); + console.log(`→ ping round-trip: ${rtt.toFixed(1)} ms`); + + hw.sendEmotion(Emotion.SURPRISED); + + // ── 1. Capture ──────────────────────────────────────────────── + const chunks: Buffer[] = []; + let bytesCaptured = 0; + + const collect = (chunk: Buffer): void => { + chunks.push(chunk); + bytesCaptured += chunk.length; + }; + hw.on('audio_up', collect); + + console.log(`🎙️ Recording ${captureMs} ms — say something!`); + await sleep(captureMs); + + hw.off('audio_up', collect); + const capture = Buffer.concat(chunks); + const samples = capture.length / BYTES_PER_SAMPLE; + const durationMs = (samples / SAMPLE_RATE) * 1000; + console.log( + `✅ captured ${capture.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)` + + ` across ${chunks.length} frames`, + ); + + if (capture.length === 0) { + console.error( + '❌ no audio received from the ESP32. 
Check the I2S wiring ' + + '(BCLK=32, LRCLK=33, DIN=34) and that the firmware got past `audio: I2S ready`.', + ); + return; + } + + // Quick RMS sanity check so we catch "mic muted" / "disconnected" early. + const rms = computeRms(capture); + console.log(` RMS level: ${rms.toFixed(0)} (silence ≈ 10, speech ≳ 500)`); + + if (debug) { + // Dump the raw capture so we can replay it offline: + // aplay -r 16000 -f S16_LE -c 1 /tmp/tipote-capture.raw + writeFileSync(dumpPath, capture); + console.log(`[debug] raw capture written to ${dumpPath} (${capture.length} bytes)`); + + const allZero = capture.every((b) => b === 0); + console.log(`[debug] capture.allZero=${allZero}`); + + // Also print some distinct int16 values we saw, to spot patterns. + const seen = new Set(); + for (let i = 0; i < capture.length - 1 && seen.size < 10; i += 2) { + seen.add(capture.readInt16LE(i)); + } + console.log(`[debug] first distinct samples: ${[...seen].join(',')}`); + } + + if (skipPlayback) { + console.log('SKIP_PLAYBACK set — not sending AUDIO_DOWN'); + return; + } + + // ── 2. 
Playback ─────────────────────────────────────────────── + await sleep(500); + + const audio = new Esp32AudioService( + { + backend: 'esp32', + captureDevice: 'default', + playbackDevice: 'default', + sampleRate: SAMPLE_RATE, + bitDepth: 16, + channels: 1, + chunkDurationMs: 20, + }, + hw, + ); + + hw.sendEmotion(Emotion.HAPPY); + console.log('🔊 Playing back on the ESP32 speaker...'); + await audio.play(capture); + console.log('✅ playback done'); + } finally { + hw.sendEmotion(Emotion.NEUTRAL); + await sleep(200); + await hw.disconnect(); + } +} + +function computeRms(buf: Buffer): number { + if (buf.length < 2) return 0; + let sumSquares = 0; + const samples = buf.length / 2; + for (let i = 0; i < buf.length - 1; i += 2) { + const s = buf.readInt16LE(i); + sumSquares += s * s; + } + return Math.sqrt(sumSquares / samples); +} + +main().catch((err) => { + console.error('loopback failed:', err); + process.exit(1); +}); diff --git a/apps/robot-client/scripts/wake_word.py b/apps/robot-client/scripts/wake_word.py index c91b589..bf0a74b 100755 --- a/apps/robot-client/scripts/wake_word.py +++ b/apps/robot-client/scripts/wake_word.py @@ -2,94 +2,175 @@ """ Ti-Pote Wake Word Detection Script. -Runs OpenWakeWord model continuously, listening on the specified ALSA device. -Prints "DETECTED" to stdout when the wake word is heard. +Runs OpenWakeWord continuously and prints "DETECTED" to stdout when +the wake word is heard. -Supports PAUSE/RESUME commands on stdin to temporarily stop/start listening -without reloading the model. When paused, the audio stream is closed so other -processes (arecord) can use the device. +Two input modes: -Usage: - python3 wake_word.py --model hey_jarvis --threshold 0.5 --device default --sample-rate 16000 +1. --input alsa (default, legacy) + Opens an ALSA capture device via PyAudio. PAUSE/RESUME/QUIT + commands are read from stdin. -Requirements: - pip install openwakeword pyaudio numpy +2. 
--input stdin + Reads raw S16 mono PCM audio from stdin (fd 0). This is used when + the Raspberry Pi is just an orchestrator and the microphone lives + on the ESP32 — the Node client forwards AUDIO_UP frames into this + script's stdin. Control commands are read from a separate file + descriptor specified by --control-fd (default: 3). + +Control commands (one per line, uppercase): + PAUSE — stop emitting DETECTED events (audio keeps flowing so + we don't overflow the pipe, but predictions are ignored). + RESUME — resume emitting and reset the model buffer. + RESET — reset the model buffer without touching the pause flag. + QUIT — exit cleanly. + +Usage (ALSA): + python3 wake_word.py --model hey_jarvis --device default + +Usage (stdin / ESP32 backend): + python3 wake_word.py --model hey_jarvis --input stdin --control-fd 3 """ import argparse -import sys import os import signal -import select +import sys import threading +import time + import numpy as np -def main(): - parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection') - parser.add_argument('--model', type=str, default='hey_jarvis', - help='Wake word model name (default: hey_jarvis as placeholder)') - parser.add_argument('--threshold', type=float, default=0.5, - help='Detection threshold (0.0-1.0)') - parser.add_argument('--device', type=str, default='default', - help='ALSA audio capture device') - parser.add_argument('--sample-rate', type=int, default=16000, - help='Audio sample rate in Hz') - args = parser.parse_args() +CHUNK_SAMPLES = 1280 # ≈ 80 ms @ 16 kHz (OpenWakeWord's preferred size) + +def load_model(model_name: str): try: from openwakeword.model import Model except ImportError: - print("ERROR: openwakeword not installed. Run: pip install openwakeword", file=sys.stderr) + print("ERROR: openwakeword not installed. Run: pip install openwakeword", + file=sys.stderr) sys.exit(1) - try: - import pyaudio - except ImportError: - print("ERROR: pyaudio not installed. 
Run: pip install pyaudio", file=sys.stderr) - sys.exit(1) - - # ── Load the wake word model (one time only) ── - - print(f"Loading wake word model: {args.model}...", file=sys.stderr) - import openwakeword - pretrained_paths = openwakeword.get_pretrained_model_paths() - model_path = None - for p in pretrained_paths: - basename = os.path.basename(p) - if basename.startswith(args.model): - model_path = p - break - + pretrained = openwakeword.get_pretrained_model_paths() + model_path = next( + (p for p in pretrained if os.path.basename(p).startswith(model_name)), + None, + ) if model_path is None: - if os.path.isfile(args.model): - model_path = args.model + if os.path.isfile(model_name): + model_path = model_name else: - print(f"ERROR: model '{args.model}' not found in pretrained models", file=sys.stderr) - print(f"Available models:", file=sys.stderr) - for p in pretrained_paths: + print(f"ERROR: model '{model_name}' not found", file=sys.stderr) + for p in pretrained: print(f" - {os.path.basename(p)}", file=sys.stderr) sys.exit(1) - print(f"Resolved model path: {model_path}", file=sys.stderr) - + print(f"Loading wake word model: {model_name}...", file=sys.stderr) try: - oww_model = Model(wakeword_model_paths=[model_path]) + return Model(wakeword_model_paths=[model_path]) except Exception as e: - print(f"ERROR loading model '{args.model}': {e}", file=sys.stderr) + print(f"ERROR loading model '{model_name}': {e}", file=sys.stderr) sys.exit(1) - print(f"Wake word model loaded: {args.model}", file=sys.stderr) - print(f"Threshold: {args.threshold}", file=sys.stderr) - print(f"Listening on device: {args.device}", file=sys.stderr) - # ── Initialize PyAudio ── +class State: + """Shared mutable state between the audio and control threads.""" + def __init__(self): + self.paused = False + self.running = True + self.reset_requested = False + self.lock = threading.Lock() + + +def start_control_reader(state: State, fd: int): + """Background thread that reads PAUSE/RESUME/RESET/QUIT 
commands.""" + try: + f = os.fdopen(fd, 'r', buffering=1) + except OSError as e: + print(f"ERROR opening control fd {fd}: {e}", file=sys.stderr) + return + + def reader(): + while state.running: + try: + line = f.readline() + except Exception: + break + if not line: + break + cmd = line.strip().upper() + with state.lock: + if cmd == 'PAUSE' and not state.paused: + state.paused = True + print("PAUSED", file=sys.stderr, flush=True) + elif cmd == 'RESUME' and state.paused: + state.paused = False + state.reset_requested = True + print("RESUMED", file=sys.stderr, flush=True) + elif cmd == 'RESET': + state.reset_requested = True + elif cmd == 'QUIT': + state.running = False + break + + t = threading.Thread(target=reader, daemon=True) + t.start() + + +def run_predict_loop(oww_model, read_chunk, state: State, threshold: float): + """ + Shared loop: pull a chunk from `read_chunk()`, feed the model, + optionally emit DETECTED. Exits when `read_chunk()` returns None + or state.running is False. + """ + print("READY", file=sys.stderr, flush=True) + try: + while state.running: + with state.lock: + if state.reset_requested: + oww_model.reset() + state.reset_requested = False + + audio_data = read_chunk() + if audio_data is None: + # EOF / error; exit cleanly + break + + audio_array = np.frombuffer(audio_data, dtype=np.int16) + oww_model.predict(audio_array) + + with state.lock: + if state.paused: + # Keep draining but don't emit detections. + continue + + for _, score in oww_model.prediction_buffer.items(): + if len(score) > 0 and score[-1] > threshold: + print("DETECTED", flush=True) + oww_model.reset() + break + except KeyboardInterrupt: + pass + + +# ───────────────────────────────────────────────────────────────── +# ALSA input (legacy backend) +# ───────────────────────────────────────────────────────────────── + +def run_alsa_mode(args, oww_model, state: State): + import re + try: + import pyaudio + except ImportError: + print("ERROR: pyaudio not installed. 
Run: pip install pyaudio", + file=sys.stderr) + sys.exit(1) pa = pyaudio.PyAudio() - # Find the device index - import re device_index = None if args.device != 'default': try: @@ -97,14 +178,14 @@ def main(): info = pa.get_device_info_by_index(idx) if info.get('maxInputChannels', 0) > 0: device_index = idx - print(f"Using device by index: [{idx}] {info['name']}", file=sys.stderr) + print(f"Using device by index: [{idx}] {info['name']}", + file=sys.stderr) except (ValueError, IOError): pass if device_index is None: hw_match = re.search(r'(\d+),(\d+)', args.device) hw_pattern = f"hw:{hw_match.group(1)},{hw_match.group(2)}" if hw_match else None - for i in range(pa.get_device_count()): info = pa.get_device_info_by_index(i) if info.get('maxInputChannels', 0) <= 0: @@ -115,133 +196,134 @@ def main(): print(f"Matched device: [{i}] {name}", file=sys.stderr) break - if device_index is None: - print(f"WARNING: Device '{args.device}' not found, listing available inputs:", file=sys.stderr) - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - if info.get('maxInputChannels', 0) > 0: - print(f" [{i}] {info['name']}", file=sys.stderr) - print("Falling back to default device", file=sys.stderr) - - # ── Audio stream helpers ── - - chunk_size = 1280 # ~80ms at 16kHz (OpenWakeWord expects this) - stream = None + stream = {'handle': None} def open_stream(): - nonlocal stream - stream = pa.open( + stream['handle'] = pa.open( format=pyaudio.paInt16, channels=1, rate=args.sample_rate, input=True, - frames_per_buffer=chunk_size, + frames_per_buffer=CHUNK_SAMPLES, input_device_index=device_index, ) def close_stream(): - nonlocal stream - if stream is not None: + h = stream['handle'] + if h is not None: try: - stream.stop_stream() - stream.close() + h.stop_stream() + h.close() except Exception: pass - stream = None + stream['handle'] = None - # ── Stdin command reader (PAUSE / RESUME) ── - - paused = False - running = True - lock = threading.Lock() - - def 
stdin_reader(): - nonlocal paused, running - while running: - try: - line = sys.stdin.readline() - if not line: # EOF - running = False - break - cmd = line.strip().upper() - with lock: - if cmd == 'PAUSE': - if not paused: - paused = True - print("PAUSED", file=sys.stderr, flush=True) - elif cmd == 'RESUME': - if paused: - paused = False - print("RESUMED", file=sys.stderr, flush=True) - elif cmd == 'QUIT': - running = False - break - except Exception: - break - - stdin_thread = threading.Thread(target=stdin_reader, daemon=True) - stdin_thread.start() - - # ── Signal handling ── - - def handle_signal(sig, frame): - nonlocal running - running = False - signal.signal(signal.SIGTERM, handle_signal) - signal.signal(signal.SIGINT, handle_signal) - - # ── Main loop ── + def read_chunk(): + with state.lock: + is_paused = state.paused + # In ALSA mode, pausing means physically releasing the device. + if is_paused: + if stream['handle'] is not None: + close_stream() + print("STREAM_CLOSED", file=sys.stderr, flush=True) + time.sleep(0.1) + return b'\x00' * (CHUNK_SAMPLES * 2) # dummy silence; won't be predicted + if stream['handle'] is None: + open_stream() + oww_model.reset() + print("STREAM_REOPENED", file=sys.stderr, flush=True) + try: + return stream['handle'].read(CHUNK_SAMPLES, exception_on_overflow=False) + except Exception as e: + print(f"Audio read error: {e}", file=sys.stderr) + close_stream() + time.sleep(0.5) + return b'\x00' * (CHUNK_SAMPLES * 2) open_stream() - print("READY", file=sys.stderr, flush=True) - try: - while running: - with lock: - is_paused = paused - - if is_paused: - # Close the audio stream so arecord can use the device - if stream is not None: - close_stream() - print("STREAM_CLOSED", file=sys.stderr, flush=True) - # Wait a bit before checking again - import time - time.sleep(0.1) - continue - - # Reopen stream if it was closed (after resume) - if stream is None: - open_stream() - oww_model.reset() - print("STREAM_REOPENED", file=sys.stderr, 
flush=True) - - try: - audio_data = stream.read(chunk_size, exception_on_overflow=False) - except Exception as e: - print(f"Audio read error: {e}", file=sys.stderr) - close_stream() - import time - time.sleep(0.5) - continue - - audio_array = np.frombuffer(audio_data, dtype=np.int16) - - oww_model.predict(audio_array) - - for model_name, score in oww_model.prediction_buffer.items(): - if len(score) > 0 and score[-1] > args.threshold: - print("DETECTED", flush=True) - oww_model.reset() - break - - except KeyboardInterrupt: - pass + run_predict_loop(oww_model, read_chunk, state, args.threshold) finally: close_stream() pa.terminate() print("Wake word detection stopped", file=sys.stderr) +# ───────────────────────────────────────────────────────────────── +# Stdin input (ESP32 backend) +# ───────────────────────────────────────────────────────────────── + +def run_stdin_mode(args, oww_model, state: State): + """ + Audio bytes arrive on stdin (fd 0), 16-bit signed LE mono at + `args.sample_rate`. We block until a full CHUNK_SAMPLES chunk is + available and hand it to the model. + """ + print("Listening on stdin for raw S16LE mono PCM", file=sys.stderr) + chunk_bytes = CHUNK_SAMPLES * 2 + stdin = sys.stdin.buffer + buf = bytearray() + + def read_chunk(): + # Keep reading until we have a full chunk or hit EOF. 
+ while len(buf) < chunk_bytes and state.running: + try: + data = stdin.read(chunk_bytes - len(buf)) + except Exception as e: + print(f"stdin read error: {e}", file=sys.stderr) + return None + if not data: + return None + buf.extend(data) + if len(buf) < chunk_bytes: + return None + chunk = bytes(buf[:chunk_bytes]) + del buf[:chunk_bytes] + return chunk + + try: + run_predict_loop(oww_model, read_chunk, state, args.threshold) + finally: + print("Wake word detection stopped", file=sys.stderr) + + +# ───────────────────────────────────────────────────────────────── +# Entrypoint +# ───────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection') + parser.add_argument('--model', type=str, default='hey_jarvis') + parser.add_argument('--threshold', type=float, default=0.5) + parser.add_argument('--input', type=str, choices=['alsa', 'stdin'], default='alsa', + help="Audio source. 'alsa' opens PyAudio, 'stdin' reads from fd 0.") + parser.add_argument('--device', type=str, default='default', + help='ALSA audio capture device (only used with --input alsa).') + parser.add_argument('--control-fd', type=int, default=0, + help='File descriptor to read control commands from. 
' + 'Default 0 (stdin) for ALSA, pass 3 for stdin mode.') + parser.add_argument('--sample-rate', type=int, default=16000) + args = parser.parse_args() + + state = State() + + def handle_signal(_sig, _frame): + state.running = False + signal.signal(signal.SIGTERM, handle_signal) + signal.signal(signal.SIGINT, handle_signal) + + oww_model = load_model(args.model) + print(f"Wake word model loaded: {args.model}", file=sys.stderr) + print(f"Threshold: {args.threshold}", file=sys.stderr) + + start_control_reader(state, args.control_fd) + + if args.input == 'stdin': + run_stdin_mode(args, oww_model, state) + else: + print(f"Listening on device: {args.device}", file=sys.stderr) + run_alsa_mode(args, oww_model, state) + + if __name__ == '__main__': main() diff --git a/apps/robot-client/src/config/hardware.config.ts b/apps/robot-client/src/config/hardware.config.ts index c071238..e0c330a 100644 --- a/apps/robot-client/src/config/hardware.config.ts +++ b/apps/robot-client/src/config/hardware.config.ts @@ -1,8 +1,11 @@ export interface AudioConfig { - /** ALSA device for capture (e.g., 'plughw:1,0' or 'default') */ + /** Which audio backend to use: 'esp32' (default) or 'alsa' (legacy). */ + backend: 'esp32' | 'alsa'; + + /** ALSA device for capture (only used when backend='alsa'). */ captureDevice: string; - /** ALSA device for playback (e.g., 'plughw:0,0' or 'default') */ + /** ALSA device for playback (only used when backend='alsa'). 
*/ playbackDevice: string; /** Sample rate in Hz */ @@ -53,8 +56,13 @@ export interface HardwareConfig { } export function loadHardwareConfig(): HardwareConfig { + const backend = (process.env.AUDIO_BACKEND || 'esp32').toLowerCase() as + | 'esp32' + | 'alsa'; + return { audio: { + backend, captureDevice: process.env.AUDIO_CAPTURE_DEVICE || 'default', playbackDevice: process.env.AUDIO_PLAYBACK_DEVICE || 'default', sampleRate: parseInt(process.env.AUDIO_SAMPLE_RATE || '16000', 10), @@ -69,8 +77,15 @@ export function loadHardwareConfig(): HardwareConfig { threshold: parseFloat(process.env.WAKEWORD_THRESHOLD || '0.5'), }, serial: { - enabled: (process.env.HARDWARE_SERIAL_ENABLED || 'false').toLowerCase() === 'true', - path: process.env.HARDWARE_SERIAL_PORT || '/dev/ttyUSB0', + // The ESP32 is now the mic/speaker front-end — serial link is + // enabled by default. Set HARDWARE_SERIAL_ENABLED=false only + // when intentionally falling back to the ALSA backend. + enabled: + (process.env.HARDWARE_SERIAL_ENABLED || (backend === 'esp32' ? 'true' : 'false')) + .toLowerCase() === 'true', + // Default to /dev/serial0 (the Pi's hardware UART once the + // console has been freed via raspi-config). + path: process.env.HARDWARE_SERIAL_PORT || '/dev/serial0', baudRate: parseInt(process.env.HARDWARE_SERIAL_BAUD || '921600', 10), heartbeatIntervalMs: parseInt(process.env.HARDWARE_HEARTBEAT_MS || '1000', 10), }, diff --git a/apps/robot-client/src/hardware/hardware.service.ts b/apps/robot-client/src/hardware/hardware.service.ts index f243dfc..fd8852a 100644 --- a/apps/robot-client/src/hardware/hardware.service.ts +++ b/apps/robot-client/src/hardware/hardware.service.ts @@ -27,8 +27,17 @@ export interface HardwareServiceEvents { log: (message: string) => void; frame: (frame: DecodedFrame) => void; ack: (payload: Buffer) => void; + /** Emitted for each AUDIO_UP frame received from the ESP32 (raw S16 mono PCM). 
*/ + audio_up: (chunk: Buffer) => void; } +/** + * Max bytes we put in a single AUDIO_DOWN frame. Must stay below + * MAX_PAYLOAD_SIZE (1024) and should map to a whole number of + * 20 ms @ 16 kHz chunks: 640 bytes = 20 ms, 320 samples. + */ +const AUDIO_DOWN_CHUNK_BYTES = 640; + /** * HardwareService — the robot-client's only direct link to the ESP32. * @@ -136,6 +145,42 @@ export class HardwareService extends EventEmitter { this.writeFrame(MsgType.DISPLAY_CLEAR); } + /** + * Send a PCM S16 mono 16 kHz buffer to the ESP32 speaker as one or + * more AUDIO_DOWN frames. The buffer is automatically split into + * chunks of `AUDIO_DOWN_CHUNK_BYTES` so each frame fits within the + * UART protocol's MAX_PAYLOAD_SIZE. + * + * Back-pressure note: `SerialPort.write` buffers in user-space, so + * this method is best-effort. For long TTS playbacks, call + * `drainAudioDown()` between chunks or space them with a `setTimeout` + * to avoid unbounded growth. + */ + sendAudioDown(chunk: Buffer): void { + if (!this.port?.isOpen) { + this.log.warn('Dropping AUDIO_DOWN — serial port not open'); + return; + } + for (let offset = 0; offset < chunk.length; offset += AUDIO_DOWN_CHUNK_BYTES) { + const slice = chunk.subarray(offset, offset + AUDIO_DOWN_CHUNK_BYTES); + this.writeFrame(MsgType.AUDIO_DOWN, slice); + } + } + + /** + * Wait for the kernel-side serial buffer to drain. Useful between + * large AUDIO_DOWN bursts to keep latency bounded. + */ + drainAudioDown(): Promise { + return new Promise((resolve, reject) => { + if (!this.port?.isOpen) { + resolve(); + return; + } + this.port.drain((err) => (err ? reject(err) : resolve())); + }); + } + /** * Round-trip PING → PONG used for bring-up and latency checks. * Resolves with the measured RTT in ms. 
@@ -187,6 +232,9 @@ export class HardwareService extends EventEmitter { case MsgType.ERROR: this.log.error({ payload: frame.payload.toString('utf8') }, 'firmware error'); return; + case MsgType.AUDIO_UP: + this.emit('audio_up', frame.payload); + return; default: return; } diff --git a/apps/robot-client/src/main.ts b/apps/robot-client/src/main.ts index e271a3b..034cb9a 100644 --- a/apps/robot-client/src/main.ts +++ b/apps/robot-client/src/main.ts @@ -1,7 +1,7 @@ import { loadRobotConfig, loadHardwareConfig } from './config/index.js'; import { CloudSocket } from './transport/index.js'; import { - AudioService, + createAudioService, WakeWordService, KeyboardTriggerService, HealthService, @@ -72,15 +72,16 @@ async function main(): Promise { const resolvedConfig = { ...robotConfig, deviceId, deviceToken }; const cloudSocket = new CloudSocket(resolvedConfig as Required); - const audioService = new AudioService(hardwareConfig.audio); const healthService = new HealthService(cloudSocket); - // ── Optional: hardware bridge (ESP32 firmware) ── - // The serial link is opt-in via HARDWARE_SERIAL_ENABLED=true. We - // treat failures here as non-fatal: even without a face, the - // robot can still converse with the cloud. + // ── Hardware bridge (ESP32 firmware) ── + // With AUDIO_BACKEND=esp32 the ESP32 owns the mic AND the speaker, + // so the serial link is mandatory. With AUDIO_BACKEND=alsa we can + // still run without it (face will be missing, but audio works). 
+ const audioBackend = hardwareConfig.audio.backend; let hardwareService: HardwareService | null = null; + if (hardwareConfig.serial.enabled) { hardwareService = new HardwareService({ path: hardwareConfig.serial.path, @@ -93,19 +94,40 @@ async function main(): Promise { hardwareService.sendEmotion(Emotion.HAPPY); logger.info('Hardware bridge connected'); } catch (err) { + if (audioBackend === 'esp32') { + logger.fatal( + { err, path: hardwareConfig.serial.path }, + 'Hardware bridge required for AUDIO_BACKEND=esp32 — check the UART wiring or set AUDIO_BACKEND=alsa', + ); + process.exit(1); + } logger.warn({ err }, 'Hardware bridge unavailable — continuing without face'); hardwareService = null; } + } else if (audioBackend === 'esp32') { + logger.fatal( + 'AUDIO_BACKEND=esp32 requires HARDWARE_SERIAL_ENABLED=true. Either enable the serial link or switch to AUDIO_BACKEND=alsa.', + ); + process.exit(1); } else { logger.info('Hardware bridge disabled (set HARDWARE_SERIAL_ENABLED=true to enable)'); } + // Audio service — pick a backend now that we know whether the + // hardware bridge is alive. + const audioService = createAudioService(hardwareConfig.audio, hardwareService); + logger.info({ backend: audioBackend }, 'Audio service initialised'); + // Choose trigger based on TRIGGER_MODE let trigger: ITriggerService; if (resolvedConfig.triggerMode === 'wakeword') { logger.info('Trigger: wake word (OpenWakeWord)'); - trigger = new WakeWordService(hardwareConfig.wakeWord, hardwareConfig.audio); + trigger = new WakeWordService( + hardwareConfig.wakeWord, + hardwareConfig.audio, + audioBackend === 'esp32' ? 
hardwareService : null, + ); } else { logger.info('Trigger: keyboard (press Enter to talk)'); trigger = new KeyboardTriggerService(); diff --git a/apps/robot-client/src/services/audio.service.ts b/apps/robot-client/src/services/audio.service.ts index c44bc73..e3c3dbc 100644 --- a/apps/robot-client/src/services/audio.service.ts +++ b/apps/robot-client/src/services/audio.service.ts @@ -1,30 +1,48 @@ import { ChildProcess, spawn } from 'node:child_process'; import { EventEmitter } from 'node:events'; import { type AudioConfig } from '../config/index.js'; +import { type HardwareService } from '../hardware/index.js'; import { createLogger, type Logger } from '../utils/index.js'; export interface AudioServiceEvents { - /** Emitted when a raw PCM audio chunk is captured from the microphone */ + /** Emitted when a raw PCM audio chunk is captured from the microphone. */ audio_chunk: (chunk: Buffer) => void; - /** Emitted when playback of a response finishes */ + /** Emitted when playback of a response finishes. */ playback_done: () => void; - /** Emitted on audio errors */ + /** Emitted on audio errors. */ error: (error: Error) => void; } /** - * Audio service for Raspberry Pi. + * Common audio interface used by the orchestrator, wake word service, + * and test scripts. Two backends implement it: * - * Uses ALSA tools (arecord/aplay) via child processes. - * Works with any ALSA-compatible audio device: - * - I2S (INMP441 mic, MAX98357 amp) connected directly to Pi GPIO - * - USB audio devices - * - Default system audio + * - `AlsaAudioService` — arecord/aplay child processes, for dev on a + * machine with a USB mic or when the Pi owns the I2S mic/speaker + * directly. Selected with `AUDIO_BACKEND=alsa`. * - * Audio format: PCM signed 16-bit little-endian, mono, 16kHz + * - `Esp32AudioService` — mic and speaker live on the ESP32; audio + * flows over UART via `HardwareService`. Selected with + * `AUDIO_BACKEND=esp32` (the default in production). 
*/ -export class AudioService extends EventEmitter { +export abstract class AudioService extends EventEmitter { + abstract get isCapturing(): boolean; + abstract get isPlaying(): boolean; + abstract startCapture(): void; + abstract stopCapture(): void; + abstract play(audioBuffer: Buffer): Promise; + abstract stopPlayback(): void; + abstract destroy(): Promise; +} + +// ───────────────────────────────────────────────────────────────── +// ALSA backend — kept for dev on laptops and for Pi setups where +// the mic/speaker hang off ALSA directly (USB sound card, HAT…). +// ───────────────────────────────────────────────────────────────── + +export class AlsaAudioService extends AudioService { private captureProcess: ChildProcess | null = null; + private playProcess: ChildProcess | null = null; private readonly logger: Logger; private _isCapturing = false; private _isPlaying = false; @@ -32,7 +50,7 @@ export class AudioService extends EventEmitter { constructor(private readonly config: AudioConfig) { super(); - this.logger = createLogger('audio', 'info'); + this.logger = createLogger('audio:alsa', 'info'); } get isCapturing(): boolean { @@ -43,10 +61,6 @@ export class AudioService extends EventEmitter { return this._isPlaying; } - /** - * Start capturing audio from the microphone. - * Emits 'audio_chunk' events with raw PCM buffers. 
- */ startCapture(): void { if (this._isCapturing) { this.logger.warn('Already capturing audio'); @@ -58,13 +72,6 @@ export class AudioService extends EventEmitter { 'Starting audio capture', ); - // arecord outputs raw PCM to stdout - // -D: ALSA device - // -f: format (S16_LE = signed 16-bit little-endian) - // -r: sample rate - // -c: channels - // -t: type (raw = no header) - // --buffer-size: in frames, controls latency const bufferFrames = Math.floor(this.config.sampleRate * (this.config.chunkDurationMs / 1000)); this.captureProcess = spawn('arecord', [ @@ -112,9 +119,6 @@ export class AudioService extends EventEmitter { }); } - /** - * Stop capturing audio from the microphone. - */ stopCapture(): void { if (!this.captureProcess) return; @@ -125,12 +129,6 @@ export class AudioService extends EventEmitter { this._isCapturing = false; } - /** - * Play audio through the speaker. - * Accepts either raw PCM or WAV (with RIFF header) data. - * - * @returns Promise that resolves when playback is complete - */ async play(audioBuffer: Buffer): Promise { if (this._isPlaying) { this.logger.warn('Already playing audio, queueing...'); @@ -152,24 +150,26 @@ export class AudioService extends EventEmitter { '-', ]; - const playProcess = spawn('aplay', args, { + this.playProcess = spawn('aplay', args, { stdio: ['pipe', 'ignore', 'pipe'], }); - playProcess.stderr?.on('data', (data: Buffer) => { + this.playProcess.stderr?.on('data', (data: Buffer) => { const msg = data.toString().trim(); if (msg && !msg.startsWith('Playing') && !msg.startsWith('Warning')) { this.logger.error({ msg }, 'aplay stderr'); } }); - playProcess.on('error', (err) => { + this.playProcess.on('error', (err) => { this._isPlaying = false; + this.playProcess = null; reject(new Error(`Audio playback failed: ${err.message}`)); }); - playProcess.on('exit', (code) => { + this.playProcess.on('exit', (code) => { this._isPlaying = false; + this.playProcess = null; if (code === 0 || code === null) { 
this.emit('playback_done'); resolve(); @@ -178,26 +178,194 @@ export class AudioService extends EventEmitter { } }); - // Write audio data to aplay's stdin and close it - playProcess.stdin?.write(audioBuffer); - playProcess.stdin?.end(); + this.playProcess.stdin?.write(audioBuffer); + this.playProcess.stdin?.end(); }); } - /** - * Stop any currently playing audio. - */ stopPlayback(): void { - // aplay is spawned per-play, so we can't easily stop it here - // For interrupt support, we'd track the play process + if (this.playProcess) { + this.playProcess.kill('SIGTERM'); + this.playProcess = null; + } this._isPlaying = false; } - /** - * Clean up resources. - */ async destroy(): Promise { this.stopCapture(); + this.stopPlayback(); this.removeAllListeners(); } } + +// ───────────────────────────────────────────────────────────────── +// ESP32 backend — the mic and speaker live on the firmware side and +// audio flows over the UART link owned by HardwareService. +// ───────────────────────────────────────────────────────────────── + +/** + * Bytes-per-chunk written to the ESP32 per AUDIO_DOWN frame. Must + * match `AUDIO_DOWN_CHUNK_BYTES` in HardwareService. 640 bytes = + * 20 ms of 16 kHz S16 mono audio. + */ +const ESP32_CHUNK_BYTES = 640; + +/** Milliseconds we wait between two AUDIO_DOWN frames during playback. */ +const ESP32_PACING_MS = 18; + +export class Esp32AudioService extends AudioService { + private readonly logger: Logger; + private _isCapturing = false; + private _isPlaying = false; + private _playbackAbort = false; + + /** Latched listener so we can detach on `stopCapture()`. 
*/ + private readonly forwardAudioUp = (chunk: Buffer): void => { + if (!this._isCapturing) return; + this.emit('audio_chunk', chunk); + }; + + constructor( + _config: AudioConfig, + private readonly hardware: HardwareService, + ) { + super(); + void _config; + this.logger = createLogger('audio:esp32', 'info'); + } + + get isCapturing(): boolean { + return this._isCapturing; + } + + get isPlaying(): boolean { + return this._isPlaying; + } + + startCapture(): void { + if (this._isCapturing) { + this.logger.warn('Already capturing audio'); + return; + } + this.logger.info('Subscribing to ESP32 AUDIO_UP stream'); + this._isCapturing = true; + // Attach exactly once per capture session — removed in stopCapture. + this.hardware.on('audio_up', this.forwardAudioUp); + } + + stopCapture(): void { + if (!this._isCapturing) return; + this.logger.info('Unsubscribing from ESP32 AUDIO_UP stream'); + this._isCapturing = false; + this.hardware.off('audio_up', this.forwardAudioUp); + } + + /** + * Play a PCM S16 mono 16 kHz buffer on the ESP32 speaker. If `buf` + * carries a WAV header, strip it first (the firmware expects raw PCM). + * + * We pace the writes manually so the Node serial buffer and the + * ESP32 speaker DMA stay roughly in sync. Without pacing, the whole + * buffer would be pushed into the kernel at once and the robot would + * still be "speaking" long after the orchestrator thinks it's done. 
+ */ + async play(audioBuffer: Buffer): Promise { + if (this._isPlaying) { + this.logger.warn('Already playing audio — ignoring new buffer'); + return; + } + + const pcm = stripWavHeader(audioBuffer); + if (pcm.length === 0) { + this.emit('playback_done'); + return; + } + + this._isPlaying = true; + this._playbackAbort = false; + + try { + for (let offset = 0; offset < pcm.length; offset += ESP32_CHUNK_BYTES) { + if (this._playbackAbort) break; + const slice = pcm.subarray(offset, offset + ESP32_CHUNK_BYTES); + this.hardware.sendAudioDown(slice); + if (ESP32_PACING_MS > 0) { + await delay(ESP32_PACING_MS); + } + } + // Let the kernel TX buffer drain so we don't race on destroy. + try { + await this.hardware.drainAudioDown(); + } catch (err) { + this.logger.warn({ err }, 'drain after playback failed'); + } + this.emit('playback_done'); + } finally { + this._isPlaying = false; + this._playbackAbort = false; + } + } + + stopPlayback(): void { + if (!this._isPlaying) return; + this.logger.info('Aborting playback'); + this._playbackAbort = true; + } + + async destroy(): Promise { + this.stopCapture(); + this.stopPlayback(); + this.removeAllListeners(); + } +} + +// ───────────────────────────────────────────────────────────────── +// Helpers +// ───────────────────────────────────────────────────────────────── + +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Strip the 44-byte RIFF/WAVE header if present. The ESP32 I2S driver + * wants raw S16 mono PCM, nothing else. 
+ */ +function stripWavHeader(buf: Buffer): Buffer { + if (buf.length > 44 && buf.toString('ascii', 0, 4) === 'RIFF' && buf.toString('ascii', 8, 12) === 'WAVE') { + return buf.subarray(44); + } + return buf; +} + +// ───────────────────────────────────────────────────────────────── +// Factory +// ───────────────────────────────────────────────────────────────── + +export type AudioBackend = 'alsa' | 'esp32'; + +/** + * Create the right AudioService for the current backend. The default + * is `esp32`; set `AUDIO_BACKEND=alsa` to fall back to the legacy + * arecord/aplay path (useful for laptop dev without an ESP32 wired in). + */ +export function createAudioService( + config: AudioConfig, + hardware: HardwareService | null, +): AudioService { + const backend = (config.backend ?? 'esp32') as AudioBackend; + if (backend === 'alsa') { + return new AlsaAudioService(config); + } + if (backend === 'esp32') { + if (!hardware) { + throw new Error( + 'AUDIO_BACKEND=esp32 requires a connected HardwareService — ' + + 'set HARDWARE_SERIAL_ENABLED=true and make sure the ESP32 is reachable, ' + + 'or switch to AUDIO_BACKEND=alsa for local development.', + ); + } + return new Esp32AudioService(config, hardware); + } + throw new Error(`Unknown AUDIO_BACKEND: ${backend}`); +} diff --git a/apps/robot-client/src/services/index.ts b/apps/robot-client/src/services/index.ts index 0a25fd7..b38ff48 100644 --- a/apps/robot-client/src/services/index.ts +++ b/apps/robot-client/src/services/index.ts @@ -1,4 +1,10 @@ -export { AudioService } from './audio.service.js'; +export { + AudioService, + AlsaAudioService, + Esp32AudioService, + createAudioService, + type AudioBackend, +} from './audio.service.js'; export { WakeWordService } from './wake-word.service.js'; export { KeyboardTriggerService } from './keyboard-trigger.service.js'; export { HealthService } from './health.service.js'; diff --git a/apps/robot-client/src/services/wake-word.service.ts 
b/apps/robot-client/src/services/wake-word.service.ts index 228fb42..7d2b30a 100644 --- a/apps/robot-client/src/services/wake-word.service.ts +++ b/apps/robot-client/src/services/wake-word.service.ts @@ -1,24 +1,35 @@ import { ChildProcess, spawn } from 'node:child_process'; import { EventEmitter } from 'node:events'; import { type WakeWordConfig, type AudioConfig } from '../config/index.js'; +import { type HardwareService } from '../hardware/index.js'; import { createLogger, type Logger } from '../utils/index.js'; export interface WakeWordServiceEvents { - /** Emitted when the wake word is detected */ detected: () => void; - /** Emitted when the engine is ready */ ready: () => void; - /** Emitted on errors */ error: (error: Error) => void; } /** * Wake word detection service. * - * Runs OpenWakeWord as a **long-lived** Python subprocess. - * The model is loaded once at startup; pause/resume is handled via - * PAUSE/RESUME commands on stdin, so the audio device is released - * while arecord is capturing, then reclaimed when listening resumes. + * Two operating modes, selected by whether a HardwareService is passed + * to the constructor: + * + * 1. **ALSA mode** (no HardwareService) + * The Python subprocess opens PyAudio on `audioConfig.captureDevice` + * and reads the mic directly. Pause releases the ALSA device so + * arecord (the AlsaAudioService) can use it during conversation. + * + * 2. **ESP32 mode** (HardwareService provided) + * The Python subprocess reads raw S16 mono PCM from stdin. We + * subscribe to `hardware.on('audio_up')` and pipe every mic chunk + * coming off the UART straight into the Python process. Control + * commands (PAUSE/RESUME/RESET/QUIT) go over a separate pipe at + * fd 3 because stdin is busy carrying audio. + * + * The model is loaded once at startup; pause/resume is cheap and + * does not reload it. 
*/ export class WakeWordService extends EventEmitter { private process: ChildProcess | null = null; @@ -26,51 +37,73 @@ export class WakeWordService extends EventEmitter { private _isListening = false; private _isPaused = false; private _streamClosed = false; + private readonly usesHardware: boolean; + + /** Latched forwarder so we can detach it on stop / error. */ + private readonly forwardMicChunk = (chunk: Buffer): void => { + if (!this.process || !this.process.stdin || this.process.stdin.destroyed) return; + // Node gracefully buffers writes if the pipe is full; we don't + // apply back-pressure here because dropping wake-word audio would + // just hurt detection accuracy for a few tens of ms. + this.process.stdin.write(chunk); + }; constructor( private readonly wakeWordConfig: WakeWordConfig, private readonly audioConfig: AudioConfig, + private readonly hardware: HardwareService | null = null, ) { super(); this.logger = createLogger('wake-word', 'info'); + this.usesHardware = hardware !== null; } get isListening(): boolean { return this._isListening && !this._isPaused; } - /** - * Start the wake word Python subprocess. - * The model is loaded once; subsequent pause/resume cycles are fast. - */ start(): void { if (this.process) { - // Process already running — just resume if paused - if (this._isPaused) { - this.resume(); - } + if (this._isPaused) this.resume(); return; } this.logger.info( - { model: this.wakeWordConfig.modelName, threshold: this.wakeWordConfig.threshold }, + { + mode: this.usesHardware ? 
'esp32' : 'alsa', + model: this.wakeWordConfig.modelName, + threshold: this.wakeWordConfig.threshold, + }, 'Starting wake word detection', ); - this.process = spawn(this.wakeWordConfig.pythonPath, [ + const args = [ this.wakeWordConfig.scriptPath, '--model', this.wakeWordConfig.modelName, '--threshold', String(this.wakeWordConfig.threshold), - '--device', this.audioConfig.captureDevice, '--sample-rate', String(this.audioConfig.sampleRate), - ], { - stdio: ['pipe', 'pipe', 'pipe'], - }); + ]; + + if (this.usesHardware) { + args.push('--input', 'stdin', '--control-fd', '3'); + } else { + args.push('--input', 'alsa', '--device', this.audioConfig.captureDevice); + } + + // stdio layout: + // 0: stdin — audio in (ESP32 mode) or control (ALSA mode) + // 1: stdout — DETECTED events + // 2: stderr — status & log lines + // 3: extra — control pipe (ESP32 mode only) + const stdio: ('pipe' | 'ignore')[] = this.usesHardware + ? ['pipe', 'pipe', 'pipe', 'pipe'] + : ['pipe', 'pipe', 'pipe']; + + this.process = spawn(this.wakeWordConfig.pythonPath, args, { stdio }); this._isListening = true; this._isPaused = false; - // ── stdout: DETECTED events ── this.process.stdout?.on('data', (data: Buffer) => { const lines = data.toString().trim().split('\n'); for (const line of lines) { @@ -83,7 +116,6 @@ export class WakeWordService extends EventEmitter { } }); - // ── stderr: status messages ── this.process.stderr?.on('data', (data: Buffer) => { const lines = data.toString().trim().split('\n'); for (const line of lines) { @@ -107,10 +139,9 @@ export class WakeWordService extends EventEmitter { this.logger.info('⏳ Loading wake word model...'); } else if (msg.startsWith('Wake word model loaded')) { this.logger.info('✅ Wake word model loaded'); - } else if (msg.startsWith('Matched device') || msg.startsWith('Using device')) { + } else if (msg.startsWith('Matched device') || msg.startsWith('Using device') || msg.startsWith('Listening')) { this.logger.info(`🔊 ${msg}`); } else { - // Log 
unknown stderr messages at warn level to catch errors this.logger.warn({ msg }, 'Wake word stderr'); } } @@ -119,29 +150,36 @@ export class WakeWordService extends EventEmitter { this.process.on('error', (err) => { this._isListening = false; this.logger.error({ err }, 'Wake word process error'); + this.detachHardware(); this.emit('error', new Error(`Wake word process failed: ${err.message}`)); }); this.process.on('exit', (code) => { this._isListening = false; this._isPaused = false; + this.detachHardware(); this.process = null; if (code !== 0 && code !== null) { this.logger.warn({ code }, 'Wake word process exited unexpectedly'); - // Auto-restart after a short delay setTimeout(() => { this.logger.info('Restarting wake word detection...'); this.start(); }, 2000); } }); + + // In ESP32 mode, start piping mic audio from the UART. + if (this.usesHardware && this.hardware) { + this.hardware.on('audio_up', this.forwardMicChunk); + } } /** * Pause wake word detection. - * Sends PAUSE command to Python subprocess which closes the audio stream, - * freeing the device for arecord. Returns a promise that resolves when - * the audio stream is confirmed closed. + * + * In ALSA mode we must wait for STREAM_CLOSED so arecord can reclaim + * the device. In ESP32 mode the audio flow never stops — we just + * tell the Python process to ignore detections. */ pause(): Promise { if (!this.process || this._isPaused) return Promise.resolve(); @@ -149,9 +187,13 @@ export class WakeWordService extends EventEmitter { this._isPaused = true; this._streamClosed = false; - this.process.stdin?.write('PAUSE\n'); + this.writeControl('PAUSE'); + + if (this.usesHardware) { + // No physical device to release — resolve immediately. 
+ return Promise.resolve(); + } - // Wait for the stream to be closed (so arecord can use the device) return new Promise((resolve) => { const checkInterval = setInterval(() => { if (this._streamClosed || !this.process) { @@ -160,7 +202,6 @@ export class WakeWordService extends EventEmitter { } }, 50); - // Safety timeout setTimeout(() => { clearInterval(checkInterval); resolve(); @@ -168,25 +209,18 @@ export class WakeWordService extends EventEmitter { }); } - /** - * Resume wake word detection after pause. - * The Python subprocess reopens the audio stream (fast, no model reload). - */ resume(): void { if (!this.process || !this._isPaused) return; this._isPaused = false; - this.process.stdin?.write('RESUME\n'); + this.writeControl('RESUME'); this.logger.info('🎤 Resuming wake word listening...'); } - /** - * Stop wake word detection permanently. - */ stop(): void { if (this.process) { - this.process.stdin?.write('QUIT\n'); - // Give it a moment to exit cleanly, then force kill + this.writeControl('QUIT'); + this.detachHardware(); setTimeout(() => { if (this.process) { this.process.kill('SIGTERM'); @@ -198,4 +232,35 @@ export class WakeWordService extends EventEmitter { this._isPaused = false; this.removeAllListeners(); } + + // ────────────────────────────────────────────────────────── + // Internals + // ────────────────────────────────────────────────────────── + + /** + * Write a text control command. In ALSA mode that goes to stdin; + * in ESP32 mode stdin carries audio so commands travel over the + * extra pipe at fd 3 (process.stdio[3]). + */ + private writeControl(cmd: string): void { + if (!this.process) return; + const line = `${cmd}\n`; + if (this.usesHardware) { + // stdio[3] is our control pipe — a Node Writable (net.Socket) stream. 
+ const control = this.process.stdio[3] as unknown as + | (NodeJS.WritableStream & { destroyed?: boolean }) + | null; + if (control && !control.destroyed) { + control.write(line); + } + } else { + this.process.stdin?.write(line); + } + } + + private detachHardware(): void { + if (this.usesHardware && this.hardware) { + this.hardware.off('audio_up', this.forwardMicChunk); + } + } } diff --git a/apps/robot-hardware/lib/Audio/library.json b/apps/robot-hardware/lib/Audio/library.json new file mode 100644 index 0000000..189e247 --- /dev/null +++ b/apps/robot-hardware/lib/Audio/library.json @@ -0,0 +1,7 @@ +{ + "name": "Audio", + "version": "0.1.0", + "description": "Ti-Pote audio I/O — INMP441 mic + MAX98357A speaker via two I2S peripherals.", + "frameworks": "arduino", + "platforms": "espressif32" +} diff --git a/apps/robot-hardware/lib/Audio/src/Audio.cpp b/apps/robot-hardware/lib/Audio/src/Audio.cpp new file mode 100644 index 0000000..2e4cf3b --- /dev/null +++ b/apps/robot-hardware/lib/Audio/src/Audio.cpp @@ -0,0 +1,151 @@ +#include "Audio.h" +#include + +namespace tipote { + +// ───────────────────────────────────────────────────────────────── +// Shared I2S bus pin assignment — see the header for rationale. +// ───────────────────────────────────────────────────────────────── +static constexpr int PIN_BCLK = 32; // shared: mic SCK + speaker BCLK +static constexpr int PIN_LRCLK = 33; // shared: mic WS + speaker LRC +static constexpr int PIN_MIC_DIN = 34; // INMP441 SD → ESP32 data-in +static constexpr int PIN_SPK_DOUT = 22; // MAX98357A DIN ← ESP32 data-out + +// DMA buffers — 4 × 256 × 8 bytes (stereo 32-bit) ≈ 8 KB each for +// RX and TX. That's ~64 ms of audio each way at 16 kHz, plenty of +// room to absorb UART jitter. 
+static constexpr int DMA_COUNT = 4; +static constexpr int DMA_LEN = 256; + +bool Audio::begin() { + // ───── Single I2S port, full duplex, 32-bit stereo slots ───── + // + // The INMP441 requires 32-bit slots; the MAX98357A happily reads + // the 32-bit frames we emit. With a shared bus we get one set of + // BCLK/WS for both sides — exactly like the Pi setup that worked. + i2s_config_t cfg = {}; + cfg.mode = static_cast(I2S_MODE_MASTER | + I2S_MODE_RX | + I2S_MODE_TX); + cfg.sample_rate = SAMPLE_RATE; + cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT; + cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT; // stereo frames + cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S; + cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1; + cfg.dma_buf_count = DMA_COUNT; + cfg.dma_buf_len = DMA_LEN; + cfg.use_apll = false; + cfg.tx_desc_auto_clear = true; + cfg.fixed_mclk = 0; + + if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) { + return false; + } + + i2s_pin_config_t pins = {}; + pins.bck_io_num = PIN_BCLK; + pins.ws_io_num = PIN_LRCLK; + pins.data_out_num = PIN_SPK_DOUT; + pins.data_in_num = PIN_MIC_DIN; + if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) { + i2s_driver_uninstall(I2S_NUM_0); + return false; + } + + i2s_zero_dma_buffer(I2S_NUM_0); + micStarted_ = true; + spkStarted_ = true; + return true; +} + +size_t Audio::readMicChunk(uint8_t* out, size_t outCapacity) { + if (!micStarted_ || outCapacity < 2) return 0; + + // Stereo read: each "sample pair" is L + R, each 32-bit = 8 bytes. + // Cap at 320 pairs = 20 ms @ 16 kHz mono per call. 
+ constexpr size_t MAX_PAIRS = 320; + int32_t raw[MAX_PAIRS * 2]; + + size_t wantPairs = outCapacity / 2; // 2 bytes out per mono sample + if (wantPairs > MAX_PAIRS) wantPairs = MAX_PAIRS; + + size_t bytesRead = 0; + const esp_err_t err = i2s_read( + I2S_NUM_0, + reinterpret_cast(raw), + wantPairs * 2 * sizeof(int32_t), + &bytesRead, + 0 // non-blocking + ); + if (err != ESP_OK || bytesRead == 0) return 0; + + const size_t pairs = bytesRead / (2 * sizeof(int32_t)); + int16_t* dst = reinterpret_cast(out); + + int32_t lMin = INT32_MAX, lMax = INT32_MIN; + int32_t rMin = INT32_MAX, rMax = INT32_MIN; + int16_t s16Min = INT16_MAX, s16Max = INT16_MIN; + + const bool pickRight = (micChannel_ == MicChannel::Right); + + for (size_t i = 0; i < pairs; ++i) { + const int32_t L = raw[2 * i]; + const int32_t R = raw[2 * i + 1]; + if (L < lMin) lMin = L; + if (L > lMax) lMax = L; + if (R < rMin) rMin = R; + if (R > rMax) rMax = R; + + // INMP441 is 24-bit left-justified in a 32-bit slot, so the + // useful range lives in bits 31..8. A >> 14 gives a comfortable + // speech level; bump to >> 11 if the result is too quiet. + const int32_t src = pickRight ? R : L; + int32_t s = src >> 14; + if (s > INT16_MAX) s = INT16_MAX; + if (s < INT16_MIN) s = INT16_MIN; + const int16_t s16 = static_cast(s); + if (s16 < s16Min) s16Min = s16; + if (s16 > s16Max) s16Max = s16; + dst[i] = s16; + } + + lastStats_ = {lMin, lMax, rMin, rMax, s16Min, s16Max, pairs}; + return pairs * 2; +} + +size_t Audio::writeSpeakerChunk(const uint8_t* data, size_t len) { + if (!spkStarted_ || len == 0) return 0; + + // The UART brings us S16 mono PCM. The I2S bus is running as + // 32-bit stereo, so we expand each 16-bit sample to a stereo + // pair of 32-bit words. 320 input samples → 2560 output bytes. + constexpr size_t MAX_IN_SAMPLES = 320; + const size_t inSamples = (len / 2 > MAX_IN_SAMPLES) ? 
MAX_IN_SAMPLES : len / 2; + + int32_t stereo[MAX_IN_SAMPLES * 2]; + const int16_t* src = reinterpret_cast(data); + for (size_t i = 0; i < inSamples; ++i) { + // Shift up to place the sample in the upper 16 bits of the + // 32-bit slot (matches what the MAX98357A expects). + const int32_t s32 = static_cast(src[i]) << 16; + stereo[2 * i] = s32; // left + stereo[2 * i + 1] = s32; // right duplicated + } + + size_t bytesWritten = 0; + i2s_write(I2S_NUM_0, stereo, inSamples * 2 * sizeof(int32_t), + &bytesWritten, pdMS_TO_TICKS(50)); + + // Report bytes accepted in *caller units* (S16 mono) so the + // outside world doesn't need to know about our internal format. + const size_t pairsWritten = bytesWritten / (2 * sizeof(int32_t)); + return pairsWritten * 2; +} + +void Audio::flushSpeaker() { + if (spkStarted_) { + i2s_zero_dma_buffer(I2S_NUM_0); + } +} + +} // namespace tipote diff --git a/apps/robot-hardware/lib/Audio/src/Audio.h b/apps/robot-hardware/lib/Audio/src/Audio.h new file mode 100644 index 0000000..50b1f9c --- /dev/null +++ b/apps/robot-hardware/lib/Audio/src/Audio.h @@ -0,0 +1,84 @@ +// Ti-Pote — Audio I/O via a single full-duplex I2S bus. +// +// I2S_NUM_0 is configured as MASTER in RX+TX mode. BCLK and WS are +// shared between the INMP441 microphone (RX) and the MAX98357A +// amplifier (TX), which is the standard I2S bus layout — exactly +// what was working on the Raspberry Pi side. +// +// Pin map (single shared I2S bus): +// BCLK = GPIO 32 shared mic SCK + speaker BCLK +// LRCLK / WS = GPIO 33 shared mic WS + speaker LRC +// Mic data in = GPIO 34 INMP441 SD (input-only pin, perfect) +// Speaker DOUT = GPIO 22 MAX98357A DIN +// +// Mic L/R stays tied to GND → talks on the LEFT slot of the I2S frame. +// +// Format exchanged with the Pi on the UART: +// PCM signed 16-bit little-endian, mono, 16 kHz. +// +// Internally the bus runs at 32-bit stereo slots (INMP441 requires it). +// readMicChunk() converts the 32-bit left slot down to S16 mono. 
+// writeSpeakerChunk() expands S16 mono to 32-bit stereo frames before +// handing them to i2s_write(). + +#pragma once + +#include +#include +#include + +namespace tipote { + +class Audio { +public: + static constexpr int SAMPLE_RATE = 16000; + static constexpr int CHANNELS = 1; + static constexpr int BYTES_PER_SAMPLE = 2; // S16 + + // Initialise the single full-duplex I2S port (I2S_NUM_0). Safe to call exactly once from setup(). + bool begin(); + + // Pull whatever the mic DMA has ready. Writes S16 mono little-endian + // bytes into `out`, up to `outCapacity` bytes, and returns the number + // of bytes actually written (always even, possibly zero). + // + // Non-blocking (timeout = 0). + size_t readMicChunk(uint8_t* out, size_t outCapacity); + + // Push S16 mono little-endian PCM to the speaker DMA. Blocks up to + // ~50 ms waiting for room. Returns bytes actually accepted. + size_t writeSpeakerChunk(const uint8_t* data, size_t len); + + // Drop anything pending in the speaker DMA. Used on shutdown / reset. + void flushSpeaker(); + + // ─── Debug / bring-up ──────────────────────────────────────── + // + // Stats updated on every readMicChunk() call, covering *this last + // batch only*. Handy to confirm the mic is actually clocking data + // into the ESP32 without blowing up the main audio path. + struct MicStats { + int32_t leftRawMin; // raw int32 sample on left I2S slot + int32_t leftRawMax; + int32_t rightRawMin; // raw int32 sample on right I2S slot + int32_t rightRawMax; + int16_t s16Min; // post-shift S16 sample (output channel) + int16_t s16Max; + size_t samples; // sample pairs in the batch + }; + const MicStats& lastMicStats() const { return lastStats_; } + + // Which I2S slot to route into the S16 output. Flip at runtime if + // the mic's L/R pin doesn't land where we expect. 
+ enum class MicChannel { Left, Right }; + void setMicChannel(MicChannel ch) { micChannel_ = ch; } + MicChannel micChannel() const { return micChannel_; } + +private: + bool micStarted_ = false; + bool spkStarted_ = false; + MicChannel micChannel_ = MicChannel::Left; + MicStats lastStats_ = {0, 0, 0, 0, 0, 0, 0}; +}; + +} // namespace tipote diff --git a/apps/robot-hardware/platformio.ini b/apps/robot-hardware/platformio.ini index 3af807a..9e62092 100644 --- a/apps/robot-hardware/platformio.ini +++ b/apps/robot-hardware/platformio.ini @@ -30,6 +30,11 @@ build_flags = -DHW_SERIAL_BAUD=921600 ; Idle timeout before the eyes fall back to the default animation (ms) -DHW_HEARTBEAT_TIMEOUT_MS=5000 + ; Hardware UART2 pins used to talk to the Raspberry Pi. + ; The OLED eyes already claim GPIO 16/17 (UART2 default pins), + ; so Serial2 is remapped to these two free pins instead. + -DHW_UART_RX_PIN=27 + -DHW_UART_TX_PIN=13 build_unflags = -std=gnu++11 diff --git a/apps/robot-hardware/scripts/esp-play.ts b/apps/robot-hardware/scripts/esp-play.ts new file mode 100644 index 0000000..f328d45 --- /dev/null +++ b/apps/robot-hardware/scripts/esp-play.ts @@ -0,0 +1,219 @@ +/** + * Ti-Pote — Play a PCM/WAV file on the ESP32 speaker over USB. 
/**
 * Return the raw PCM payload carried by `buf`.
 *
 * When `buf` is a RIFF/WAVE container the chunk list is walked and the
 * body of the `data` chunk is returned. A fixed 44-byte header cannot be
 * assumed: encoders are free to place extra chunks (LIST, fact, padding)
 * before `data`, and metadata chunks may follow it. Skipping a hard-coded
 * 44 bytes would send those bytes to the speaker as audio.
 *
 * Anything that is not a RIFF/WAVE container is treated as already-raw
 * PCM and returned unchanged.
 *
 * @param buf - File contents, either a WAV container or raw PCM.
 * @returns A view (no copy) onto the PCM bytes inside `buf`.
 */
function stripWav(buf: Buffer): Buffer {
  const RIFF_HEADER_BYTES = 12; // "RIFF" + u32 size + "WAVE"
  const CHUNK_HEADER_BYTES = 8; // 4-char id + u32 little-endian size
  const LEGACY_HEADER_BYTES = 44;

  const isWav =
    buf.length > RIFF_HEADER_BYTES &&
    buf.toString('ascii', 0, 4) === 'RIFF' &&
    buf.toString('ascii', 8, 12) === 'WAVE';
  if (!isWav) return buf;

  let offset = RIFF_HEADER_BYTES;
  while (offset + CHUNK_HEADER_BYTES <= buf.length) {
    const id = buf.toString('ascii', offset, offset + 4);
    const size = buf.readUInt32LE(offset + 4);
    const bodyStart = offset + CHUNK_HEADER_BYTES;

    if (id === 'data') {
      // A declared size of 0 or one that runs past the end of the buffer
      // means a placeholder length or a truncated file: take what exists.
      const declaredEnd = bodyStart + size;
      const bodyEnd = size === 0 || declaredEnd > buf.length ? buf.length : declaredEnd;
      return buf.subarray(bodyStart, bodyEnd);
    }

    // RIFF chunks are word-aligned: odd-sized bodies carry one pad byte.
    offset = bodyStart + size + (size & 1);
  }

  // RIFF/WAVE signature but no `data` chunk found (malformed file): keep
  // the historical behaviour of skipping a canonical 44-byte header.
  return buf.length > LEGACY_HEADER_BYTES ? buf.subarray(LEGACY_HEADER_BYTES) : buf;
}
/**
 * Transcode `inputPath` to an S16 LE mono 16 kHz WAV by shelling out to
 * the `afconvert` command-line tool, writing the result into a freshly
 * created temporary directory.
 *
 * @param inputPath - Audio file to convert; passed verbatim to afconvert.
 * @returns The path of the converted file plus a `cleanup` callback that
 *   removes the temporary directory. The caller owns that cleanup.
 * @throws Error prefixed with "afconvert failed:" when the tool cannot be
 *   run or exits with an error. The temporary directory is removed before
 *   the error propagates.
 */
function convertToEsp32Wav(inputPath: string): { wavPath: string; cleanup: () => void } {
  const workDir = mkdtempSync(join(tmpdir(), 'tipote-'));
  const removeWorkDir = (): void => {
    rmSync(workDir, { recursive: true, force: true });
  };
  const wavPath = join(workDir, 'converted.wav');

  console.log(`→ converting ${inputPath} → 16 kHz mono S16LE WAV`);
  try {
    // Arguments go through execFileSync's argv array, never a shell, so
    // paths containing spaces or metacharacters are safe.
    execFileSync(
      'afconvert',
      ['-f', 'WAVE', '-d', 'LEI16@16000', '-c', '1', inputPath, wavPath],
      { stdio: 'inherit' },
    );
  } catch (err: unknown) {
    removeWorkDir();
    // Narrow instead of asserting: a non-Error throw would otherwise be
    // reported as "afconvert failed: undefined".
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`afconvert failed: ${reason}`);
  }

  return { wavPath, cleanup: removeWorkDir };
}
If you passed a raw file, convert it first.', + ); + } + + const path = findDefaultPort(); + console.log(`→ opening ${path} @ 921600 baud`); + + const port = new SerialPort({ path, baudRate: 921600, autoOpen: false }); + await new Promise((resolve, reject) => { + port.open((err) => (err ? reject(err) : resolve())); + }); + + let ready = false; + const readyWaiters: Array<() => void> = []; + + const finished = new Promise((resolve, reject) => { + const timeout = setTimeout( + () => reject(new Error(`timeout waiting for OK after ${durationMs + 8000} ms`)), + durationMs + 8000, + ); + let lineBuf = ''; + port.on('data', (data: Buffer) => { + lineBuf += data.toString('utf8'); + let idx: number; + while ((idx = lineBuf.indexOf('\n')) >= 0) { + const line = lineBuf.slice(0, idx).replace(/\r$/, '').trim(); + lineBuf = lineBuf.slice(idx + 1); + if (!line) continue; + if (line === 'OK') { + clearTimeout(timeout); + resolve(); + return; + } + if (line === 'READY') { + ready = true; + while (readyWaiters.length) readyWaiters.shift()!(); + continue; + } + if (line.startsWith('ERR ')) { + clearTimeout(timeout); + reject(new Error(`firmware error: ${line.slice(4)}`)); + return; + } + if (line.startsWith('LOG ')) console.log(`[esp] ${line.slice(4)}`); + else console.log(`[esp] ${line}`); + } + }); + port.on('error', reject); + }); + + // Wait for READY so we don't send PLAY into the bootloader. + await new Promise((resolve, reject) => { + if (ready) return resolve(); + const timer = setTimeout( + () => reject(new Error('timeout waiting for READY from firmware')), + 5000, + ); + readyWaiters.push(() => { + clearTimeout(timer); + resolve(); + }); + }); + await new Promise((r) => setTimeout(r, 50)); + + console.log(`→ PLAY ${pcm.length} bytes`); + port.write(`PLAY ${pcm.length}\n`); + + // Stream the payload paced EXACTLY at the I2S consumption rate so + // the ESP32 RX buffer stays roughly constant in size regardless of + // file length. 
I2S consumes 16 kHz × 2 bytes/sample = 32 KB/s of + // S16 mono. A 1024-byte burst is 32 ms of audio → sleeping 32 ms + // between bursts matches playback exactly. + // + // We still pad lightly above 32 KB/s (30 ms instead of 32) so the + // DMA never runs dry. The excess fills the ~16 KB RX buffer on the + // firmware slowly; even for a 10 s file we stay well under it. + const CHUNK = 1024; + const PAUSE_MS = 30; + for (let off = 0; off < pcm.length; off += CHUNK) { + const slice = pcm.subarray(off, off + CHUNK); + await new Promise((resolve, reject) => { + port.write(slice, (err) => (err ? reject(err) : resolve())); + }); + await new Promise((resolve) => port.drain(() => resolve())); + if (off + CHUNK < pcm.length) { + await new Promise((r) => setTimeout(r, PAUSE_MS)); + } + } + + await finished; + await new Promise((resolve) => port.close(() => resolve())); + cleanup(); + console.log('✅ playback done'); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/apps/robot-hardware/scripts/esp-record.ts b/apps/robot-hardware/scripts/esp-record.ts new file mode 100644 index 0000000..4a54218 --- /dev/null +++ b/apps/robot-hardware/scripts/esp-record.ts @@ -0,0 +1,190 @@ +/** + * Ti-Pote — Record audio from the ESP32 over USB. 
/**
 * Build the canonical 44-byte RIFF/WAVE header for an uncompressed
 * (format tag 1) PCM stream.
 *
 * @param pcmBytes - Size in bytes of the audio payload that follows.
 * @param sampleRate - Sample rate in Hz.
 * @param channels - Interleaved channel count. Defaults to 1 (mono).
 * @param bitsPerSample - Bits per sample, a multiple of 8. Defaults to 16.
 * @returns A new 44-byte buffer; append the PCM payload after it.
 * @throws RangeError when a value cannot be represented in the header.
 */
function wavHeader(
  pcmBytes: number,
  sampleRate: number,
  channels = 1,
  bitsPerSample = 16,
): Buffer {
  const HEADER_BYTES = 44;
  const RIFF_PREAMBLE_BYTES = 8; // "RIFF" + size field are not counted in the RIFF size
  const MAX_U32 = 0xffffffff;

  if (
    !Number.isInteger(pcmBytes) ||
    pcmBytes < 0 ||
    pcmBytes > MAX_U32 - (HEADER_BYTES - RIFF_PREAMBLE_BYTES)
  ) {
    throw new RangeError(`pcmBytes out of range for a WAV header: ${pcmBytes}`);
  }
  if (!Number.isInteger(channels) || channels < 1 || channels > 0xffff) {
    throw new RangeError(`channels out of range for a WAV header: ${channels}`);
  }
  if (!Number.isInteger(bitsPerSample) || bitsPerSample < 8 || bitsPerSample % 8 !== 0) {
    throw new RangeError(`bitsPerSample must be a positive multiple of 8: ${bitsPerSample}`);
  }

  const blockAlign = channels * (bitsPerSample / 8); // bytes per sample frame
  const header = Buffer.alloc(HEADER_BYTES);

  header.write('RIFF', 0);
  header.writeUInt32LE(HEADER_BYTES - RIFF_PREAMBLE_BYTES + pcmBytes, 4);
  header.write('WAVE', 8);

  header.write('fmt ', 12);
  header.writeUInt32LE(16, 16); // fmt chunk body size
  header.writeUInt16LE(1, 20); // format tag: PCM
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitsPerSample, 34);

  header.write('data', 36);
  header.writeUInt32LE(pcmBytes, 40);
  return header;
}
'3000', 10); + + if (!outPath) { + console.error('Usage: esp-record.ts [duration_ms]'); + process.exit(1); + } + + const path = findDefaultPort(); + console.log(`→ opening ${path} @ 921600 baud`); + + const port = new SerialPort({ path, baudRate: 921600, autoOpen: false }); + + await new Promise((resolve, reject) => { + port.open((err) => (err ? reject(err) : resolve())); + }); + + // ── simple line-based state machine for stdout text ─────────── + let phase: 'idle' | 'streaming' = 'idle'; + let remaining = 0; + const chunks: Buffer[] = []; + let lineBuf = ''; + let ready = false; + const readyWaiters: Array<() => void> = []; + + const finished = new Promise((resolve, reject) => { + const timeout = setTimeout( + () => reject(new Error(`timeout waiting for audio after ${durationMs + 5000} ms`)), + durationMs + 5000, + ); + + port.on('data', (data: Buffer) => { + let offset = 0; + while (offset < data.length) { + if (phase === 'streaming') { + const take = Math.min(remaining, data.length - offset); + chunks.push(data.subarray(offset, offset + take)); + offset += take; + remaining -= take; + if (remaining === 0) { + phase = 'idle'; + lineBuf = ''; + } + continue; + } + + // text mode: accumulate until newline + const nl = data.indexOf(0x0a, offset); + if (nl === -1) { + lineBuf += data.subarray(offset).toString('utf8'); + break; + } + lineBuf += data.subarray(offset, nl).toString('utf8'); + offset = nl + 1; + const line = lineBuf.replace(/\r$/, '').trim(); + lineBuf = ''; + if (!line) continue; + + if (line.startsWith('BEGIN ')) { + remaining = parseInt(line.slice(6), 10); + phase = 'streaming'; + console.log(`→ BEGIN ${remaining} bytes`); + } else if (line === 'END') { + clearTimeout(timeout); + const pcm = Buffer.concat(chunks); + resolve(pcm); + } else if (line === 'READY') { + ready = true; + while (readyWaiters.length) readyWaiters.shift()!(); + } else if (line.startsWith('LOG ')) { + console.log(`[esp] ${line.slice(4)}`); + } else if (line.startsWith('ERR ')) { 
+ clearTimeout(timeout); + reject(new Error(`firmware error: ${line.slice(4)}`)); + } else { + console.log(`[esp] ${line}`); + } + } + }); + + port.on('error', reject); + }); + + // The ESP32 resets on port open (DTR/RTS). Wait until it prints + // READY so we don't send commands into the bootloader. + await new Promise((resolve, reject) => { + if (ready) return resolve(); + const timer = setTimeout( + () => reject(new Error('timeout waiting for READY from firmware')), + 5000, + ); + readyWaiters.push(() => { + clearTimeout(timer); + resolve(); + }); + }); + await new Promise((r) => setTimeout(r, 50)); + + console.log(`→ REC ${durationMs} ms — speak now!`); + port.write(`REC ${durationMs}\n`); + + const pcm = await finished; + + await new Promise((resolve) => port.close(() => resolve())); + + // Basic RMS sanity check. + let sumSq = 0; + const samples = pcm.length / BYTES_PER_SAMPLE; + for (let i = 0; i < pcm.length - 1; i += 2) { + const s = pcm.readInt16LE(i); + sumSq += s * s; + } + const rms = Math.sqrt(sumSq / samples); + console.log( + `✅ captured ${pcm.length} bytes (${samples} samples, ${( + (samples / SAMPLE_RATE) * + 1000 + ).toFixed(0)} ms) RMS=${rms.toFixed(0)}`, + ); + + if (outPath.toLowerCase().endsWith('.wav')) { + writeFileSync(outPath, Buffer.concat([wavHeader(pcm.length, SAMPLE_RATE), pcm])); + } else { + writeFileSync(outPath, pcm); + } + console.log(`→ wrote ${outPath}`); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/apps/robot-hardware/src/main.cpp b/apps/robot-hardware/src/main.cpp index c347343..c7cdb9e 100644 --- a/apps/robot-hardware/src/main.cpp +++ b/apps/robot-hardware/src/main.cpp @@ -1,147 +1,281 @@ -// Ti-Pote — Robot Hardware firmware (ESP32) +// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32) // -// Responsibilities for v0: -// - Listen on UART0 (the USB-connected serial port while the ESP32 -// is plugged into Arthur's laptop; on the real robot this will -// eventually be 
Serial2 wired to the Raspberry Pi). -// - Decode incoming binary frames (see include/protocol_types.h). -// - Dispatch commands to the Eyes renderer. -// - Reply to PING with PONG. -// - Fall back to a sleepy animation if no heartbeat is received -// for HW_HEARTBEAT_TIMEOUT_MS (set in platformio.ini). +// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end +// with nothing else in the loop — no Pi, no OLED, no protocol frames. +// The ESP32 is plugged into a computer via USB and the host runs +// two tiny scripts: // -// Intentionally NOT yet implemented (Phase 2): -// - I2S audio up/down streaming -// - Servo / LED commands +// scripts/esp-record.mjs +// scripts/esp-play.mjs // -// The hook points for those are marked with TODO(phase2). +// Protocol over USB Serial (921600 baud, line-based for commands, +// raw bytes for audio): +// +// host → esp32 +// "PING\n" ping +// "REC \n" start recording for milliseconds +// "PLAY \n" next bytes on the wire are raw +// S16 LE mono 16 kHz PCM, play them +// +// esp32 → host +// "READY\n" once at boot +// "PONG\n" reply to PING +// "LOG \n" human-readable log line +// "ERR \n" error message +// "BEGIN \n" start of a REC response +// "" raw PCM (S16 LE mono 16 kHz) +// "END\n" end of a REC response +// "OK\n" command completed +// +// Wiring (shared I2S bus on I2S_NUM_0): +// BCLK = GPIO 32 (mic SCK + speaker BCLK) +// LRCLK = GPIO 33 (mic WS + speaker LRC) +// MIC = GPIO 34 (INMP441 SD → ESP32 data-in, input-only pin) +// SPK = GPIO 22 (ESP32 data-out → MAX98357A DIN) #include -#include "Protocol.h" -#include "Eyes.h" +#include +#include -#ifndef HW_SERIAL_BAUD -#define HW_SERIAL_BAUD 921600 -#endif +// ────────────────────────────────────────────────────────── +// Audio config +// ────────────────────────────────────────────────────────── -#ifndef HW_HEARTBEAT_TIMEOUT_MS -#define HW_HEARTBEAT_TIMEOUT_MS 5000 -#endif +static constexpr int SAMPLE_RATE = 16000; +static constexpr int PIN_BCLK = 32; +static 
constexpr int PIN_LRCLK = 33; +static constexpr int PIN_MIC_DIN = 34; +static constexpr int PIN_SPK_DOUT = 22; -// The communication stream. When the ESP32 is plugged into a -// computer, UART0 (Serial) is the USB-CDC port, which is exactly -// what the robot-client will talk to during development. Later, -// for the Pi wiring, change this to Serial2 and call -// `Serial2.begin(HW_SERIAL_BAUD, SERIAL_8N1, RX_PIN, TX_PIN)`. -#define HW_COMM Serial +static constexpr int DMA_COUNT = 4; +static constexpr int DMA_LEN = 256; -using namespace tipote; +// Staging buffers — keep them outside of functions so we don't eat +// stack on every tick. +static constexpr size_t OUT_S16_SAMPLES = 320; // 20 ms of S16 mono +static int32_t g_rawStereo[OUT_S16_SAMPLES * 2]; +static int16_t g_micMono [OUT_S16_SAMPLES]; +static int32_t g_spkStereo[OUT_S16_SAMPLES * 2]; +static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2]; // 640 bytes of S16 mono -static Eyes eyes; -static FrameDecoder decoder; +// ────────────────────────────────────────────────────────── +// Line buffer for incoming text commands. +// ────────────────────────────────────────────────────────── -static uint32_t lastHeartbeatMs = 0; -static bool idleMode = false; +static char g_line[64]; +static size_t g_lineLen = 0; -// Forward decl -static void handleFrame(const Frame& frame, void* userData); -static void logLine(const char* line); +static void sendLog(const char* msg) { + Serial.print("LOG "); + Serial.println(msg); +} + +static void sendErr(const char* msg) { + Serial.print("ERR "); + Serial.println(msg); +} + +// ────────────────────────────────────────────────────────── +// I2S init — single port, full duplex, shared BCLK/WS. 
// Install and wire up the single I2S port used for both directions.
//
// I2S_NUM_0 runs as bus master with RX (microphone) and TX (speaker)
// enabled together, so both devices share the same BCLK / LRCLK pins.
// Returns true when the driver is installed and the pins are routed;
// false otherwise, in which case no driver is left installed.
static bool audioBegin() {
    // Zero-initialise so every field not set below starts from 0.
    i2s_config_t cfg = {};
    cfg.mode = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
                                       I2S_MODE_RX |
                                       I2S_MODE_TX);
    cfg.sample_rate = SAMPLE_RATE;
    // 32-bit slots, both channels: the mic and speaker helpers in this
    // file read and write interleaved int32 left/right pairs.
    cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT;
    cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT;
    cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
    cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
    cfg.dma_buf_count = DMA_COUNT;
    cfg.dma_buf_len = DMA_LEN;
    cfg.use_apll = false;
    // NOTE(review): presumably makes a TX underrun play silence instead
    // of replaying stale samples — confirm against the ESP-IDF docs.
    cfg.tx_desc_auto_clear = true;
    cfg.fixed_mclk = 0;

    // No event queue is requested (queue size 0, null handle).
    if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false;

    i2s_pin_config_t pins = {};
    pins.bck_io_num = PIN_BCLK;
    pins.ws_io_num = PIN_LRCLK;
    pins.data_out_num = PIN_SPK_DOUT;
    pins.data_in_num = PIN_MIC_DIN;
    if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
        // Roll back the install so a failed init leaves nothing behind.
        i2s_driver_uninstall(I2S_NUM_0);
        return false;
    }
    // Start from cleared DMA buffers.
    i2s_zero_dma_buffer(I2S_NUM_0);
    return true;
}
+static size_t micReadMono(int16_t* out, size_t maxSamples) { + size_t wantPairs = maxSamples; + if (wantPairs > OUT_S16_SAMPLES) wantPairs = OUT_S16_SAMPLES; + + size_t bytesRead = 0; + const esp_err_t err = i2s_read( + I2S_NUM_0, + g_rawStereo, + wantPairs * 2 * sizeof(int32_t), + &bytesRead, + portMAX_DELAY // block — we're in a dedicated REC loop + ); + if (err != ESP_OK || bytesRead == 0) return 0; + + const size_t pairs = bytesRead / (2 * sizeof(int32_t)); + for (size_t i = 0; i < pairs; ++i) { + int32_t L = g_rawStereo[2 * i]; + int32_t s = L >> 14; + if (s > INT16_MAX) s = INT16_MAX; + if (s < INT16_MIN) s = INT16_MIN; + out[i] = static_cast(s); + } + return pairs; +} + +// Write one batch of S16 mono PCM to the speaker by duplicating each +// sample into both stereo slots and shifting into the high half of +// the 32-bit word (what the MAX98357A expects on a shared bus). +static void spkWriteMono(const int16_t* samples, size_t count) { + if (count == 0) return; + if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES; + for (size_t i = 0; i < count; ++i) { + const int32_t s32 = static_cast(samples[i]) << 16; + g_spkStereo[2 * i] = s32; + g_spkStereo[2 * i + 1] = s32; + } + size_t bytesWritten = 0; + i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t), + &bytesWritten, portMAX_DELAY); +} + +// ────────────────────────────────────────────────────────── +// Command handlers +// ────────────────────────────────────────────────────────── + +static void handleRec(uint32_t durationMs) { + const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000; + const uint32_t totalBytes = totalSamples * sizeof(int16_t); + + Serial.print("BEGIN "); + Serial.println(totalBytes); + + // Flush whatever old noise is in the mic DMA first. 
+ i2s_zero_dma_buffer(I2S_NUM_0); + + uint32_t sent = 0; + while (sent < totalSamples) { + size_t want = totalSamples - sent; + if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES; + const size_t got = micReadMono(g_micMono, want); + if (got == 0) continue; + Serial.write(reinterpret_cast(g_micMono), + got * sizeof(int16_t)); + sent += got; + } + + Serial.println(); + Serial.println("END"); +} + +static void handlePlay(uint32_t totalBytes) { + // Drain any pending crap from the speaker DMA so we don't start + // with a pop. + i2s_zero_dma_buffer(I2S_NUM_0); + + // Give Serial.readBytes a generous timeout so a jittery host + // doesn't abort us mid-playback. + Serial.setTimeout(2000); + + uint32_t remaining = totalBytes; + while (remaining > 0) { + size_t want = remaining; + if (want > sizeof(g_spkInBuf)) want = sizeof(g_spkInBuf); + // Force an even count so we always have complete S16 samples. + if (want & 1) want -= 1; + if (want == 0) want = 2; + + const size_t got = Serial.readBytes(g_spkInBuf, want); + if (got == 0) { + sendErr("PLAY read timeout"); + return; + } + const size_t samples = got / sizeof(int16_t); + spkWriteMono(reinterpret_cast(g_spkInBuf), samples); + remaining -= got; + } + + // Let the last frames actually reach the speaker, then clear. 
+ delay(50); + i2s_zero_dma_buffer(I2S_NUM_0); + Serial.println("OK"); +} + +static void handleLine(const char* line) { + if (strcmp(line, "PING") == 0) { + Serial.println("PONG"); + return; + } + if (strncmp(line, "REC ", 4) == 0) { + const long ms = atol(line + 4); + if (ms <= 0 || ms > 60000) { sendErr("REC bad duration"); return; } + handleRec(static_cast(ms)); + return; + } + if (strncmp(line, "PLAY ", 5) == 0) { + const long bytes = atol(line + 5); + if (bytes <= 0 || bytes > 16 * 1024 * 1024) { + sendErr("PLAY bad size"); + return; + } + handlePlay(static_cast(bytes)); + return; + } + sendErr("unknown command"); +} + +// ────────────────────────────────────────────────────────── +// Arduino entry points +// ────────────────────────────────────────────────────────── void setup() { - HW_COMM.begin(HW_SERIAL_BAUD); - // Give the host a beat to open the port after auto-reset. + // Bump the UART RX buffer WAY above the 256-byte default so we + // can absorb a full PLAY payload (up to a few tens of KB) without + // losing bytes if the host floods us. + Serial.setRxBufferSize(16 * 1024); + Serial.begin(921600); delay(50); - eyes.begin(); + if (!audioBegin()) { + sendErr("I2S init failed"); + } else { + sendLog("I2S ready"); + } - decoder.onFrame(handleFrame); - - lastHeartbeatMs = millis(); - logLine("robot-hardware ready"); + Serial.println("READY"); } void loop() { - // Drain whatever the host has sent since the last tick. - while (HW_COMM.available() > 0) { - int b = HW_COMM.read(); - if (b < 0) break; - decoder.feed(static_cast(b)); - } - - // Heartbeat watchdog: if we haven't heard from the host in a - // while, slip into a sleepy animation so the robot doesn't - // look frozen. Any incoming frame resets this. 
- const uint32_t now = millis(); - if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) { - idleMode = true; - eyes.show(Emotion::SLEEPY); + while (Serial.available() > 0) { + const int c = Serial.read(); + if (c < 0) break; + if (c == '\r') continue; + if (c == '\n') { + g_line[g_lineLen] = 0; + if (g_lineLen > 0) handleLine(g_line); + g_lineLen = 0; + continue; + } + if (g_lineLen < sizeof(g_line) - 1) { + g_line[g_lineLen++] = static_cast(c); + } else { + g_lineLen = 0; + sendErr("line overflow"); + } } } - -// --------------------------------------------------------------- -// Frame dispatcher -// --------------------------------------------------------------- - -static void handleFrame(const Frame& frame, void* /*userData*/) { - lastHeartbeatMs = millis(); - if (idleMode) { - idleMode = false; - } - - switch (frame.type) { - case MsgType::DISPLAY_EMOTION: { - if (frame.length < 1) { - logLine("DISPLAY_EMOTION: empty payload"); - return; - } - const uint8_t code = frame.payload[0]; - if (code >= static_cast(Emotion::COUNT)) { - logLine("DISPLAY_EMOTION: out-of-range code"); - return; - } - eyes.show(static_cast(code)); - - // ACK back so the host knows it was applied. - uint8_t ackPayload[1] = {code}; - FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1); - return; - } - - case MsgType::DISPLAY_CLEAR: { - eyes.clear(); - FrameEncoder::writeTo(HW_COMM, MsgType::ACK); - return; - } - - case MsgType::PING: { - // Echo the payload back as PONG. Useful for latency - // measurements and proving the link is symmetric. - FrameEncoder::writeTo(HW_COMM, MsgType::PONG, - frame.payload, frame.length); - return; - } - - case MsgType::STATUS: { - // Heartbeat from host — lastHeartbeatMs was already - // bumped above. Nothing else to do for v0. 
- return; - } - - // TODO(phase2): AUDIO_UP / AUDIO_DOWN / SERVO_CMD / LED_CMD - default: - logLine("unknown frame type"); - return; - } -} - -// --------------------------------------------------------------- -// Diagnostic logging — wraps text in a LOG frame so the host -// can parse it without getting confused by free text on the wire. -// --------------------------------------------------------------- - -static void logLine(const char* line) { - const size_t len = strnlen(line, MAX_PAYLOAD_SIZE); - FrameEncoder::writeTo(HW_COMM, MsgType::LOG, - reinterpret_cast(line), - static_cast(len)); -}