ok script esp

This commit is contained in:
ordinarthur 2026-04-09 02:47:53 +02:00
parent b29653e3aa
commit c19d9a7cf4
17 changed files with 1860 additions and 390 deletions

View File

@ -12,7 +12,11 @@
"format": "prettier --write \"src/**/*.ts\"",
"test": "vitest run",
"test:watch": "vitest",
"hw:demo": "tsx scripts/hardware-demo.ts"
"hw:demo": "pnpm exec tsx scripts/hardware-demo.ts",
"audio:loopback": "pnpm exec tsx scripts/audio-loopback.ts",
"audio:beep": "pnpm exec tsx scripts/audio-beep.ts",
"esp:record": "pnpm exec tsx ../robot-hardware/scripts/esp-record.ts",
"esp:play": "pnpm exec tsx ../robot-hardware/scripts/esp-play.ts"
},
"dependencies": {
"socket.io-client": "^4.8.3",

View File

@ -0,0 +1,99 @@
/**
* Ti-Pote — Pure tone speaker test.
*
* Generates a 440 Hz sine wave at ~70% of full scale and streams it
* to the ESP32 speaker via AUDIO_DOWN frames, then a second beep at
* 880 Hz. Completely independent of the microphone: if this does
* not produce audible sound, the problem is downstream of the ESP32
* on the speaker path (MAX98357A wiring, SD pin, VIN, speaker leads).
*
* Run with:
* HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:beep
*
* Optional env:
* BEEP_MS length of each beep in ms (default 1500)
* BEEP_FREQ primary frequency in Hz (default 440)
* BEEP_AMP amplitude 0.0..1.0 (default 0.7)
*/
import { HardwareService, Emotion } from '../src/hardware/index.js';
import { Esp32AudioService } from '../src/services/audio.service.js';
// ── Runtime configuration, read once from the environment ──────────
// Serial device the ESP32 is attached to.
const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0';
// UART speed in baud.
const baudRate = Number.parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10);
// Length of each beep in milliseconds.
const beepMs = Number.parseInt(process.env.BEEP_MS ?? '1500', 10);
// Frequency of the first beep in Hz; the second beep uses twice this value.
const beepFreq = Number.parseInt(process.env.BEEP_FREQ ?? '440', 10);
// Linear amplitude of the tone (generateSine clamps it to 0..1).
const beepAmp = Number.parseFloat(process.env.BEEP_AMP ?? '0.7');
/** PCM sample rate, in Hz, of every tone generated by this script. */
const SAMPLE_RATE = 16000;

/**
 * Synthesises a sine tone as signed 16-bit little-endian mono PCM at
 * `SAMPLE_RATE`.
 *
 * A 5 ms linear attack and release are applied so the speaker does not
 * click. The envelope is the minimum of the attack ramp, the release
 * ramp and 1, so tones shorter than the two ramps combined still fade
 * out instead of being cut off near full scale.
 *
 * @param freqHz     Tone frequency in Hz.
 * @param durationMs Tone length in milliseconds.
 * @param amplitude  Linear amplitude; clamped to the range 0..1.
 * @returns Buffer holding `floor(SAMPLE_RATE * durationMs / 1000)` samples.
 */
function generateSine(freqHz: number, durationMs: number, amplitude: number): Buffer {
  const sampleCount = Math.floor((SAMPLE_RATE * durationMs) / 1000);
  const buf = Buffer.alloc(sampleCount * 2);
  const amp = Math.max(0, Math.min(1, amplitude)) * 32767;
  const twoPiF = (2 * Math.PI * freqHz) / SAMPLE_RATE;
  // 5 ms linear attack/release so the speaker doesn't click.
  const rampSamples = Math.floor((SAMPLE_RATE * 5) / 1000);
  for (let i = 0; i < sampleCount; i++) {
    // Both ramps are evaluated for every sample and the smaller one wins;
    // for tones of at least two ramp lengths this matches a plain
    // attack / sustain / release envelope.
    const attack = i / rampSamples;
    const release = (sampleCount - i) / rampSamples;
    const env = Math.min(1, attack, release);
    const s = Math.round(Math.sin(i * twoPiF) * amp * env);
    // Clamp defensively before writing the 16-bit sample.
    buf.writeInt16LE(Math.max(-32768, Math.min(32767, s)), i * 2);
  }
  return buf;
}
/**
 * Pauses the calling async flow.
 *
 * @param ms Delay handed to `setTimeout`, in milliseconds.
 * @returns Promise that settles once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Runs the speaker test: opens the serial link, pings the firmware,
 * then streams two sine tones (the configured frequency, then twice
 * that frequency) through `Esp32AudioService`.
 *
 * Once the link is connected, the NEUTRAL emotion is sent and the link
 * is closed on every exit path.
 */
async function main(): Promise<void> {
  const bridge = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 });

  // Mirror firmware output on the console.
  bridge.on('log', (line) => {
    console.log(`[firmware] ${line}`);
  });
  bridge.on('error', (err) => {
    console.error(`[firmware error] ${err.message}`);
  });

  console.log(`→ opening ${path} @ ${baudRate} baud`);
  await bridge.connect();

  try {
    // Check the link before pushing audio through it.
    const roundTripMs = await bridge.ping(Buffer.from('beep'));
    console.log(`→ ping round-trip: ${roundTripMs.toFixed(1)} ms`);

    const speaker = new Esp32AudioService(
      {
        backend: 'esp32',
        captureDevice: 'default',
        playbackDevice: 'default',
        sampleRate: SAMPLE_RATE,
        bitDepth: 16,
        channels: 1,
        chunkDurationMs: 20,
      },
      bridge,
    );

    bridge.sendEmotion(Emotion.HAPPY);

    // First beep at the configured frequency.
    console.log(`🔊 Beep 1: ${beepFreq} Hz · ${beepMs} ms · amp=${beepAmp}`);
    await speaker.play(generateSine(beepFreq, beepMs, beepAmp));

    // Short gap between the two beeps.
    await sleep(400);

    // Second beep at twice the configured frequency.
    const secondFreq = beepFreq * 2;
    console.log(`🔊 Beep 2: ${beepFreq * 2} Hz · ${beepMs} ms · amp=${beepAmp}`);
    await speaker.play(generateSine(secondFreq, beepMs, beepAmp));

    console.log('✅ done — did you hear two beeps?');
  } finally {
    bridge.sendEmotion(Emotion.NEUTRAL);
    // Brief pause after the last frame before closing the port.
    await sleep(200);
    await bridge.disconnect();
  }
}
// Script entry point: any rejection from main() is reported and turned
// into a non-zero exit code.
main().catch((failure) => {
  console.error('beep failed:', failure);
  process.exit(1);
});

View File

@ -0,0 +1,171 @@
/**
* Ti-Pote — End-to-end audio loopback test.
*
* What it proves: the whole Pi ↔ ESP32 mic/speaker chain works,
* without bringing the cloud/wake-word/orchestrator into the picture.
*
* What it does:
* 1. Opens the serial link to the ESP32.
* 2. Captures `CAPTURE_MS` (default 5000) of mic audio via
* AUDIO_UP frames into a single in-memory buffer.
* 3. Pauses briefly.
* 4. Streams that buffer back to the ESP32 as AUDIO_DOWN frames
* and waits for the speaker to finish playing.
*
* Expected result: you say "allô allô" during step 2 and hear your
* own voice played back on the robot's speaker a moment later.
*
* Run with:
* HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:loopback
*
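* Optional env:
* CAPTURE_MS capture duration in ms (default 5000)
* DEBUG when set, log the first AUDIO_UP frames and dump the raw capture to DUMP_PATH
* DUMP_PATH file the DEBUG raw capture is written to (default /tmp/tipote-capture.raw)
* SKIP_PLAYBACK when set, stop after the capture and do not send AUDIO_DOWN
* HARDWARE_SERIAL_PORT / HARDWARE_SERIAL_BAUD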
*/
import { writeFileSync } from 'node:fs';
import { HardwareService, Emotion } from '../src/hardware/index.js';
import { Esp32AudioService } from '../src/services/audio.service.js';
// ── Runtime configuration, read once from the environment ──────────
// Serial device the ESP32 is attached to.
const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0';
// UART speed in baud.
const baudRate = Number.parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10);
// How long to record, in milliseconds.
const captureMs = Number.parseInt(process.env.CAPTURE_MS ?? '5000', 10);
// Any non-empty DEBUG value turns on frame logging and the raw dump.
const debug = Boolean(process.env.DEBUG);
// Destination of the raw capture dump (written only when debug is on).
const dumpPath = process.env.DUMP_PATH ?? '/tmp/tipote-capture.raw';
// Any non-empty SKIP_PLAYBACK value stops the script after the capture.
const skipPlayback = Boolean(process.env.SKIP_PLAYBACK);

// PCM format used for the duration maths and the playback service config.
const SAMPLE_RATE = 16000;
const BYTES_PER_SAMPLE = 2;

// Number of AUDIO_UP frames already printed by the debug listener.
let debugFramesSeen = 0;
/**
 * Pauses the calling async flow.
 *
 * @param ms Delay handed to `setTimeout`, in milliseconds.
 * @returns Promise that settles once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Runs the loopback scenario: records `captureMs` of AUDIO_UP audio
 * from the ESP32, prints basic statistics about it and, unless
 * SKIP_PLAYBACK is set, plays the recording back through
 * `Esp32AudioService`.
 *
 * Once the link is connected, the NEUTRAL emotion is sent and the link
 * is closed on every exit path, including the early returns.
 */
async function main(): Promise<void> {
  const bridge = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 });

  // Mirror firmware output on the console.
  bridge.on('log', (line) => {
    console.log(`[firmware] ${line}`);
  });
  bridge.on('error', (err) => {
    console.error(`[firmware error] ${err.message}`);
  });

  if (debug) {
    bridge.on('audio_up', (chunk) => {
      // Only the first three frames are printed: up to 8 int16 samples
      // each, enough to tell zeros from real data on the wire.
      if (debugFramesSeen >= 3) return;
      const headBytes = Math.min(chunk.length, 16);
      const head: number[] = [];
      for (let offset = 0; offset < headBytes; offset += 2) {
        head.push(chunk.readInt16LE(offset));
      }
      console.log(`[debug] frame ${debugFramesSeen} len=${chunk.length} head=${head.join(',')}`);
      debugFramesSeen++;
    });
  }

  console.log(`→ opening ${path} @ ${baudRate} baud`);
  await bridge.connect();

  try {
    const roundTripMs = await bridge.ping(Buffer.from('loopback'));
    console.log(`→ ping round-trip: ${roundTripMs.toFixed(1)} ms`);
    bridge.sendEmotion(Emotion.SURPRISED);

    // ── 1. Capture ────────────────────────────────────────────────
    const frames: Buffer[] = [];
    let bytesCaptured = 0;
    const onAudioUp = (chunk: Buffer): void => {
      frames.push(chunk);
      bytesCaptured += chunk.length;
    };

    bridge.on('audio_up', onAudioUp);
    console.log(`🎙️ Recording ${captureMs} ms — say something!`);
    await sleep(captureMs);
    bridge.off('audio_up', onAudioUp);

    const capture = Buffer.concat(frames);
    const samples = capture.length / BYTES_PER_SAMPLE;
    const durationMs = (samples / SAMPLE_RATE) * 1000;
    console.log(
      `✅ captured ${capture.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)` +
        ` across ${frames.length} frames`,
    );

    // Nothing arrived at all: report it and stop before playback.
    if (capture.length === 0) {
      console.error(
        '❌ no audio received from the ESP32. Check the I2S wiring ' +
          '(BCLK=32, LRCLK=33, DIN=34) and that the firmware got past `audio: I2S ready`.',
      );
      return;
    }

    // Quick level check so a muted or disconnected mic is spotted early.
    const rms = computeRms(capture);
    console.log(` RMS level: ${rms.toFixed(0)} (silence ≈ 10, speech ≳ 500)`);

    if (debug) {
      // Keep the raw capture on disk so it can be replayed offline:
      //   aplay -r 16000 -f S16_LE -c 1 /tmp/tipote-capture.raw
      writeFileSync(dumpPath, capture);
      console.log(`[debug] raw capture written to ${dumpPath} (${capture.length} bytes)`);

      const allZero = !capture.some((b) => b !== 0);
      console.log(`[debug] capture.allZero=${allZero}`);

      // Collect up to ten distinct sample values to help spot patterns.
      const seen = new Set<number>();
      let offset = 0;
      while (offset < capture.length - 1 && seen.size < 10) {
        seen.add(capture.readInt16LE(offset));
        offset += 2;
      }
      console.log(`[debug] first distinct samples: ${Array.from(seen).join(',')}`);
    }

    if (skipPlayback) {
      console.log('SKIP_PLAYBACK set — not sending AUDIO_DOWN');
      return;
    }

    // ── 2. Playback ───────────────────────────────────────────────
    await sleep(500);
    const speaker = new Esp32AudioService(
      {
        backend: 'esp32',
        captureDevice: 'default',
        playbackDevice: 'default',
        sampleRate: SAMPLE_RATE,
        bitDepth: 16,
        channels: 1,
        chunkDurationMs: 20,
      },
      bridge,
    );
    bridge.sendEmotion(Emotion.HAPPY);
    console.log('🔊 Playing back on the ESP32 speaker...');
    await speaker.play(capture);
    console.log('✅ playback done');
  } finally {
    bridge.sendEmotion(Emotion.NEUTRAL);
    // Brief pause after the last frame before closing the port.
    await sleep(200);
    await bridge.disconnect();
  }
}
/**
 * Computes the root-mean-square level of a signed 16-bit little-endian
 * PCM buffer.
 *
 * Only whole samples are counted: a dangling byte at the end of an
 * odd-length buffer is ignored both when summing and when averaging.
 *
 * @param buf Raw PCM bytes.
 * @returns RMS in int16 units, or 0 when `buf` holds no complete sample.
 */
function computeRms(buf: Buffer): number {
  const sampleCount = Math.floor(buf.length / 2);
  if (sampleCount === 0) return 0;
  let sumSquares = 0;
  for (let i = 0; i < sampleCount; i++) {
    const s = buf.readInt16LE(i * 2);
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / sampleCount);
}
// Script entry point: any rejection from main() is reported and turned
// into a non-zero exit code.
main().catch((failure) => {
  console.error('loopback failed:', failure);
  process.exit(1);
});

View File

@ -2,94 +2,175 @@
"""
Ti-Pote Wake Word Detection Script.
Runs OpenWakeWord model continuously, listening on the specified ALSA device.
Prints "DETECTED" to stdout when the wake word is heard.
Runs OpenWakeWord continuously and prints "DETECTED" to stdout when
the wake word is heard.
Supports PAUSE/RESUME commands on stdin to temporarily stop/start listening
without reloading the model. When paused, the audio stream is closed so other
processes (arecord) can use the device.
Two input modes:
Usage:
python3 wake_word.py --model hey_jarvis --threshold 0.5 --device default --sample-rate 16000
1. --input alsa (default, legacy)
Opens an ALSA capture device via PyAudio. PAUSE/RESUME/QUIT
commands are read from stdin.
Requirements:
pip install openwakeword pyaudio numpy
2. --input stdin
Reads raw S16 mono PCM audio from stdin (fd 0). This is used when
the Raspberry Pi is just an orchestrator and the microphone lives
on the ESP32: the Node client forwards AUDIO_UP frames into this
script's stdin. Control commands are read from a separate file
descriptor specified by --control-fd (default: 3).
Control commands (one per line, uppercase):
PAUSE stop emitting DETECTED events (audio keeps flowing so
we don't overflow the pipe, but predictions are ignored).
RESUME resume emitting and reset the model buffer.
RESET reset the model buffer without touching the pause flag.
QUIT exit cleanly.
Usage (ALSA):
python3 wake_word.py --model hey_jarvis --device default
Usage (stdin / ESP32 backend):
python3 wake_word.py --model hey_jarvis --input stdin --control-fd 3
"""
import argparse
import sys
import os
import signal
import select
import sys
import threading
import time
import numpy as np
def main():
parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection')
parser.add_argument('--model', type=str, default='hey_jarvis',
help='Wake word model name (default: hey_jarvis as placeholder)')
parser.add_argument('--threshold', type=float, default=0.5,
help='Detection threshold (0.0-1.0)')
parser.add_argument('--device', type=str, default='default',
help='ALSA audio capture device')
parser.add_argument('--sample-rate', type=int, default=16000,
help='Audio sample rate in Hz')
args = parser.parse_args()
CHUNK_SAMPLES = 1280 # ≈ 80 ms @ 16 kHz (OpenWakeWord's preferred size)
def load_model(model_name: str):
try:
from openwakeword.model import Model
except ImportError:
print("ERROR: openwakeword not installed. Run: pip install openwakeword", file=sys.stderr)
print("ERROR: openwakeword not installed. Run: pip install openwakeword",
file=sys.stderr)
sys.exit(1)
try:
import pyaudio
except ImportError:
print("ERROR: pyaudio not installed. Run: pip install pyaudio", file=sys.stderr)
sys.exit(1)
# ── Load the wake word model (one time only) ──
print(f"Loading wake word model: {args.model}...", file=sys.stderr)
import openwakeword
pretrained_paths = openwakeword.get_pretrained_model_paths()
model_path = None
for p in pretrained_paths:
basename = os.path.basename(p)
if basename.startswith(args.model):
model_path = p
break
pretrained = openwakeword.get_pretrained_model_paths()
model_path = next(
(p for p in pretrained if os.path.basename(p).startswith(model_name)),
None,
)
if model_path is None:
if os.path.isfile(args.model):
model_path = args.model
if os.path.isfile(model_name):
model_path = model_name
else:
print(f"ERROR: model '{args.model}' not found in pretrained models", file=sys.stderr)
print(f"Available models:", file=sys.stderr)
for p in pretrained_paths:
print(f"ERROR: model '{model_name}' not found", file=sys.stderr)
for p in pretrained:
print(f" - {os.path.basename(p)}", file=sys.stderr)
sys.exit(1)
print(f"Resolved model path: {model_path}", file=sys.stderr)
print(f"Loading wake word model: {model_name}...", file=sys.stderr)
try:
oww_model = Model(wakeword_model_paths=[model_path])
return Model(wakeword_model_paths=[model_path])
except Exception as e:
print(f"ERROR loading model '{args.model}': {e}", file=sys.stderr)
print(f"ERROR loading model '{model_name}': {e}", file=sys.stderr)
sys.exit(1)
print(f"Wake word model loaded: {args.model}", file=sys.stderr)
print(f"Threshold: {args.threshold}", file=sys.stderr)
print(f"Listening on device: {args.device}", file=sys.stderr)
# ── Initialize PyAudio ──
class State:
    """Flags shared by the prediction loop and the control-command thread.

    `paused` and `reset_requested` are read and written while holding `lock`;
    `running` is cleared to ask the loops to stop.
    """

    def __init__(self):
        # Held while the command flags below are inspected or changed.
        self.lock = threading.Lock()
        # Cleared to make the reader and prediction loops exit.
        self.running = True
        # True between a PAUSE and the following RESUME command.
        self.paused = False
        # Set when the model buffer should be reset before the next chunk.
        self.reset_requested = False
def start_control_reader(state: State, fd: int):
    """Spawn a daemon thread that applies control commands read from `fd`.

    Commands are read one per line and matched case-insensitively:
    PAUSE, RESUME, RESET and QUIT. If `fd` cannot be opened, an error is
    printed to stderr and no thread is started.
    """
    try:
        control = os.fdopen(fd, 'r', buffering=1)
    except OSError as e:
        print(f"ERROR opening control fd {fd}: {e}", file=sys.stderr)
        return

    def pump_commands():
        while state.running:
            try:
                raw = control.readline()
            except Exception:
                break
            if not raw:
                # Empty read: the writer closed the descriptor.
                break
            command = raw.strip().upper()
            with state.lock:
                if command == 'PAUSE':
                    if not state.paused:
                        state.paused = True
                        print("PAUSED", file=sys.stderr, flush=True)
                elif command == 'RESUME':
                    if state.paused:
                        state.paused = False
                        # Ask for a model reset on resume.
                        state.reset_requested = True
                        print("RESUMED", file=sys.stderr, flush=True)
                elif command == 'RESET':
                    state.reset_requested = True
                elif command == 'QUIT':
                    state.running = False
                    break

    threading.Thread(target=pump_commands, daemon=True).start()
def run_predict_loop(oww_model, read_chunk, state: State, threshold: float):
    """Feed audio chunks to the model and print DETECTED on a hit.

    Args:
        oww_model: object exposing `predict(array)`, `reset()` and a
            `prediction_buffer` mapping of score sequences.
        read_chunk: zero-argument callable returning raw S16 PCM bytes,
            or None on EOF / error.
        state: shared flags (`running`, `paused`, `reset_requested`, `lock`).
        threshold: the latest score must exceed this to count as a hit.

    The loop ends when `read_chunk()` returns None, when `state.running`
    becomes False, or on KeyboardInterrupt.

    While paused, chunks are still pulled from `read_chunk()` so the
    source keeps draining, but they are not run through the model.
    """
    print("READY", file=sys.stderr, flush=True)
    try:
        while state.running:
            with state.lock:
                if state.reset_requested:
                    oww_model.reset()
                    state.reset_requested = False

            audio_data = read_chunk()
            if audio_data is None:
                # EOF / error; exit cleanly
                break

            with state.lock:
                if state.paused:
                    # Drain only: skip inference on audio (or placeholder
                    # silence) whose prediction would be thrown away.
                    continue

            audio_array = np.frombuffer(audio_data, dtype=np.int16)
            oww_model.predict(audio_array)

            with state.lock:
                if state.paused:
                    # PAUSE arrived while predicting: suppress this result.
                    continue

            for _, score in oww_model.prediction_buffer.items():
                if len(score) > 0 and score[-1] > threshold:
                    print("DETECTED", flush=True)
                    oww_model.reset()
                    break
    except KeyboardInterrupt:
        pass
# ─────────────────────────────────────────────────────────────────
# ALSA input (legacy backend)
# ─────────────────────────────────────────────────────────────────
def run_alsa_mode(args, oww_model, state: State):
import re
try:
import pyaudio
except ImportError:
print("ERROR: pyaudio not installed. Run: pip install pyaudio",
file=sys.stderr)
sys.exit(1)
pa = pyaudio.PyAudio()
# Find the device index
import re
device_index = None
if args.device != 'default':
try:
@ -97,14 +178,14 @@ def main():
info = pa.get_device_info_by_index(idx)
if info.get('maxInputChannels', 0) > 0:
device_index = idx
print(f"Using device by index: [{idx}] {info['name']}", file=sys.stderr)
print(f"Using device by index: [{idx}] {info['name']}",
file=sys.stderr)
except (ValueError, IOError):
pass
if device_index is None:
hw_match = re.search(r'(\d+),(\d+)', args.device)
hw_pattern = f"hw:{hw_match.group(1)},{hw_match.group(2)}" if hw_match else None
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
if info.get('maxInputChannels', 0) <= 0:
@ -115,133 +196,134 @@ def main():
print(f"Matched device: [{i}] {name}", file=sys.stderr)
break
if device_index is None:
print(f"WARNING: Device '{args.device}' not found, listing available inputs:", file=sys.stderr)
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
if info.get('maxInputChannels', 0) > 0:
print(f" [{i}] {info['name']}", file=sys.stderr)
print("Falling back to default device", file=sys.stderr)
# ── Audio stream helpers ──
chunk_size = 1280 # ~80ms at 16kHz (OpenWakeWord expects this)
stream = None
stream = {'handle': None}
def open_stream():
nonlocal stream
stream = pa.open(
stream['handle'] = pa.open(
format=pyaudio.paInt16,
channels=1,
rate=args.sample_rate,
input=True,
frames_per_buffer=chunk_size,
frames_per_buffer=CHUNK_SAMPLES,
input_device_index=device_index,
)
def close_stream():
nonlocal stream
if stream is not None:
h = stream['handle']
if h is not None:
try:
stream.stop_stream()
stream.close()
h.stop_stream()
h.close()
except Exception:
pass
stream = None
# ── Stdin command reader (PAUSE / RESUME) ──
paused = False
running = True
lock = threading.Lock()
def stdin_reader():
nonlocal paused, running
while running:
try:
line = sys.stdin.readline()
if not line: # EOF
running = False
break
cmd = line.strip().upper()
with lock:
if cmd == 'PAUSE':
if not paused:
paused = True
print("PAUSED", file=sys.stderr, flush=True)
elif cmd == 'RESUME':
if paused:
paused = False
print("RESUMED", file=sys.stderr, flush=True)
elif cmd == 'QUIT':
running = False
break
except Exception:
break
stdin_thread = threading.Thread(target=stdin_reader, daemon=True)
stdin_thread.start()
# ── Signal handling ──
def handle_signal(sig, frame):
nonlocal running
running = False
signal.signal(signal.SIGTERM, handle_signal)
signal.signal(signal.SIGINT, handle_signal)
# ── Main loop ──
open_stream()
print("READY", file=sys.stderr, flush=True)
try:
while running:
with lock:
is_paused = paused
stream['handle'] = None
def read_chunk():
with state.lock:
is_paused = state.paused
# In ALSA mode, pausing means physically releasing the device.
if is_paused:
# Close the audio stream so arecord can use the device
if stream is not None:
if stream['handle'] is not None:
close_stream()
print("STREAM_CLOSED", file=sys.stderr, flush=True)
# Wait a bit before checking again
import time
time.sleep(0.1)
continue
# Reopen stream if it was closed (after resume)
if stream is None:
return b'\x00' * (CHUNK_SAMPLES * 2) # dummy silence; won't be predicted
if stream['handle'] is None:
open_stream()
oww_model.reset()
print("STREAM_REOPENED", file=sys.stderr, flush=True)
try:
audio_data = stream.read(chunk_size, exception_on_overflow=False)
return stream['handle'].read(CHUNK_SAMPLES, exception_on_overflow=False)
except Exception as e:
print(f"Audio read error: {e}", file=sys.stderr)
close_stream()
import time
time.sleep(0.5)
continue
return b'\x00' * (CHUNK_SAMPLES * 2)
audio_array = np.frombuffer(audio_data, dtype=np.int16)
oww_model.predict(audio_array)
for model_name, score in oww_model.prediction_buffer.items():
if len(score) > 0 and score[-1] > args.threshold:
print("DETECTED", flush=True)
oww_model.reset()
break
except KeyboardInterrupt:
pass
open_stream()
try:
run_predict_loop(oww_model, read_chunk, state, args.threshold)
finally:
close_stream()
pa.terminate()
print("Wake word detection stopped", file=sys.stderr)
# ─────────────────────────────────────────────────────────────────
# Stdin input (ESP32 backend)
# ─────────────────────────────────────────────────────────────────
def run_stdin_mode(args, oww_model, state: State):
    """Run the prediction loop on raw PCM read from stdin (fd 0).

    Input is consumed in fixed chunks of CHUNK_SAMPLES 16-bit samples.
    Only `args.threshold` is read from `args`. The loop ends on EOF, on a
    read error, or when `state.running` is cleared.
    """
    print("Listening on stdin for raw S16LE mono PCM", file=sys.stderr)
    frame_bytes = CHUNK_SAMPLES * 2
    source = sys.stdin.buffer
    pending = bytearray()

    def next_chunk():
        # Accumulate until one full chunk is buffered, or give up.
        while state.running and len(pending) < frame_bytes:
            try:
                piece = source.read(frame_bytes - len(pending))
            except Exception as e:
                print(f"stdin read error: {e}", file=sys.stderr)
                return None
            if not piece:
                # Empty read means EOF on stdin.
                return None
            pending.extend(piece)
        if len(pending) < frame_bytes:
            # Stopped via state.running before a full chunk arrived.
            return None
        frame = bytes(pending[:frame_bytes])
        del pending[:frame_bytes]
        return frame

    try:
        run_predict_loop(oww_model, next_chunk, state, args.threshold)
    finally:
        print("Wake word detection stopped", file=sys.stderr)
# ─────────────────────────────────────────────────────────────────
# Entrypoint
# ─────────────────────────────────────────────────────────────────
def main():
    """Parse the CLI, load the model and run the selected input mode.

    When `--control-fd` is omitted it is chosen from the input mode:
    fd 0 with `--input alsa` (stdin carries commands) and fd 3 with
    `--input stdin` (stdin carries audio, so commands need their own
    descriptor). If fd 0 is explicitly requested together with
    `--input stdin`, a warning is printed and no control reader is
    started, because both readers would consume the same stream.
    """
    parser = argparse.ArgumentParser(description='Ti-Pote Wake Word Detection')
    parser.add_argument('--model', type=str, default='hey_jarvis')
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--input', type=str, choices=['alsa', 'stdin'], default='alsa',
                        help="Audio source. 'alsa' opens PyAudio, 'stdin' reads from fd 0.")
    parser.add_argument('--device', type=str, default='default',
                        help='ALSA audio capture device (only used with --input alsa).')
    parser.add_argument('--control-fd', type=int, default=None,
                        help='File descriptor to read control commands from. '
                             'Defaults to 0 (stdin) with --input alsa and to 3 '
                             'with --input stdin.')
    parser.add_argument('--sample-rate', type=int, default=16000)
    args = parser.parse_args()

    # Resolve the mode-dependent default for the control descriptor.
    control_fd = args.control_fd
    if control_fd is None:
        control_fd = 3 if args.input == 'stdin' else 0

    state = State()

    def handle_signal(_sig, _frame):
        state.running = False

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    oww_model = load_model(args.model)
    print(f"Wake word model loaded: {args.model}", file=sys.stderr)
    print(f"Threshold: {args.threshold}", file=sys.stderr)

    if args.input == 'stdin' and control_fd == 0:
        # fd 0 is the audio stream in this mode; a command reader on the
        # same descriptor would steal PCM bytes from the model.
        print("WARNING: --control-fd 0 conflicts with --input stdin; "
              "control commands are disabled", file=sys.stderr)
    else:
        start_control_reader(state, control_fd)

    if args.input == 'stdin':
        run_stdin_mode(args, oww_model, state)
    else:
        print(f"Listening on device: {args.device}", file=sys.stderr)
        run_alsa_mode(args, oww_model, state)


if __name__ == '__main__':
    main()

View File

@ -1,8 +1,11 @@
export interface AudioConfig {
/** ALSA device for capture (e.g., 'plughw:1,0' or 'default') */
/** Which audio backend to use: 'esp32' (default) or 'alsa' (legacy). */
backend: 'esp32' | 'alsa';
/** ALSA device for capture (only used when backend='alsa'). */
captureDevice: string;
/** ALSA device for playback (e.g., 'plughw:0,0' or 'default') */
/** ALSA device for playback (only used when backend='alsa'). */
playbackDevice: string;
/** Sample rate in Hz */
@ -53,8 +56,13 @@ export interface HardwareConfig {
}
export function loadHardwareConfig(): HardwareConfig {
const backend = (process.env.AUDIO_BACKEND || 'esp32').toLowerCase() as
| 'esp32'
| 'alsa';
return {
audio: {
backend,
captureDevice: process.env.AUDIO_CAPTURE_DEVICE || 'default',
playbackDevice: process.env.AUDIO_PLAYBACK_DEVICE || 'default',
sampleRate: parseInt(process.env.AUDIO_SAMPLE_RATE || '16000', 10),
@ -69,8 +77,15 @@ export function loadHardwareConfig(): HardwareConfig {
threshold: parseFloat(process.env.WAKEWORD_THRESHOLD || '0.5'),
},
serial: {
enabled: (process.env.HARDWARE_SERIAL_ENABLED || 'false').toLowerCase() === 'true',
path: process.env.HARDWARE_SERIAL_PORT || '/dev/ttyUSB0',
// The ESP32 is now the mic/speaker front-end — serial link is
// enabled by default. Set HARDWARE_SERIAL_ENABLED=false only
// when intentionally falling back to the ALSA backend.
enabled:
(process.env.HARDWARE_SERIAL_ENABLED || (backend === 'esp32' ? 'true' : 'false'))
.toLowerCase() === 'true',
// Default to /dev/serial0 (the Pi's hardware UART once the
// console has been freed via raspi-config).
path: process.env.HARDWARE_SERIAL_PORT || '/dev/serial0',
baudRate: parseInt(process.env.HARDWARE_SERIAL_BAUD || '921600', 10),
heartbeatIntervalMs: parseInt(process.env.HARDWARE_HEARTBEAT_MS || '1000', 10),
},

View File

@ -27,8 +27,17 @@ export interface HardwareServiceEvents {
log: (message: string) => void;
frame: (frame: DecodedFrame) => void;
ack: (payload: Buffer) => void;
/** Emitted for each AUDIO_UP frame received from the ESP32 (raw S16 mono PCM). */
audio_up: (chunk: Buffer) => void;
}
/**
* Max bytes we put in a single AUDIO_DOWN frame. Must stay below
* MAX_PAYLOAD_SIZE (1024) and should map to a whole number of
* 20 ms @ 16 kHz chunks: 640 bytes = 20 ms, 320 samples.
*/
const AUDIO_DOWN_CHUNK_BYTES = 640;
/**
* HardwareService — the robot-client's only direct link to the ESP32.
*
@ -136,6 +145,42 @@ export class HardwareService extends EventEmitter {
this.writeFrame(MsgType.DISPLAY_CLEAR);
}
/**
 * Streams a PCM buffer to the ESP32 speaker as AUDIO_DOWN frames.
 *
 * The input is cut into consecutive slices of at most
 * `AUDIO_DOWN_CHUNK_BYTES`, one frame per slice, so every frame stays
 * within the UART protocol's payload limit. The buffer is expected to
 * hold S16 mono 16 kHz samples.
 *
 * If the serial port is not open the audio is dropped with a warning.
 *
 * Back-pressure note: `SerialPort.write` buffers in user-space, so this
 * method is best-effort. For long TTS playbacks, call `drainAudioDown()`
 * between chunks or space them with a `setTimeout` to avoid unbounded
 * growth.
 */
sendAudioDown(chunk: Buffer): void {
  if (!this.port?.isOpen) {
    this.log.warn('Dropping AUDIO_DOWN — serial port not open');
    return;
  }

  let offset = 0;
  while (offset < chunk.length) {
    const end = offset + AUDIO_DOWN_CHUNK_BYTES;
    // subarray clamps `end` to the buffer length, so the last slice may be shorter.
    this.writeFrame(MsgType.AUDIO_DOWN, chunk.subarray(offset, end));
    offset = end;
  }
}
/**
 * Waits for the serial port's `drain` operation to complete. Useful
 * between large AUDIO_DOWN bursts to keep latency bounded.
 *
 * @returns Promise that resolves when the drain callback reports
 *   success (or immediately when the port is not open) and rejects with
 *   the error passed to the callback otherwise.
 */
drainAudioDown(): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const port = this.port;
    if (!port?.isOpen) {
      // Nothing can be pending on a closed port.
      resolve();
      return;
    }
    port.drain((err) => {
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    });
  });
}
/**
* Round-trip PING PONG used for bring-up and latency checks.
* Resolves with the measured RTT in ms.
@ -187,6 +232,9 @@ export class HardwareService extends EventEmitter {
case MsgType.ERROR:
this.log.error({ payload: frame.payload.toString('utf8') }, 'firmware error');
return;
case MsgType.AUDIO_UP:
this.emit('audio_up', frame.payload);
return;
default:
return;
}

View File

@ -1,7 +1,7 @@
import { loadRobotConfig, loadHardwareConfig } from './config/index.js';
import { CloudSocket } from './transport/index.js';
import {
AudioService,
createAudioService,
WakeWordService,
KeyboardTriggerService,
HealthService,
@ -72,15 +72,16 @@ async function main(): Promise<void> {
const resolvedConfig = { ...robotConfig, deviceId, deviceToken };
const cloudSocket = new CloudSocket(resolvedConfig as Required<typeof resolvedConfig>);
const audioService = new AudioService(hardwareConfig.audio);
const healthService = new HealthService(cloudSocket);
// ── Optional: hardware bridge (ESP32 firmware) ──
// The serial link is opt-in via HARDWARE_SERIAL_ENABLED=true. We
// treat failures here as non-fatal: even without a face, the
// robot can still converse with the cloud.
// ── Hardware bridge (ESP32 firmware) ──
// With AUDIO_BACKEND=esp32 the ESP32 owns the mic AND the speaker,
// so the serial link is mandatory. With AUDIO_BACKEND=alsa we can
// still run without it (face will be missing, but audio works).
const audioBackend = hardwareConfig.audio.backend;
let hardwareService: HardwareService | null = null;
if (hardwareConfig.serial.enabled) {
hardwareService = new HardwareService({
path: hardwareConfig.serial.path,
@ -93,19 +94,40 @@ async function main(): Promise<void> {
hardwareService.sendEmotion(Emotion.HAPPY);
logger.info('Hardware bridge connected');
} catch (err) {
if (audioBackend === 'esp32') {
logger.fatal(
{ err, path: hardwareConfig.serial.path },
'Hardware bridge required for AUDIO_BACKEND=esp32 — check the UART wiring or set AUDIO_BACKEND=alsa',
);
process.exit(1);
}
logger.warn({ err }, 'Hardware bridge unavailable — continuing without face');
hardwareService = null;
}
} else if (audioBackend === 'esp32') {
logger.fatal(
'AUDIO_BACKEND=esp32 requires HARDWARE_SERIAL_ENABLED=true. Either enable the serial link or switch to AUDIO_BACKEND=alsa.',
);
process.exit(1);
} else {
logger.info('Hardware bridge disabled (set HARDWARE_SERIAL_ENABLED=true to enable)');
}
// Audio service — pick a backend now that we know whether the
// hardware bridge is alive.
const audioService = createAudioService(hardwareConfig.audio, hardwareService);
logger.info({ backend: audioBackend }, 'Audio service initialised');
// Choose trigger based on TRIGGER_MODE
let trigger: ITriggerService;
if (resolvedConfig.triggerMode === 'wakeword') {
logger.info('Trigger: wake word (OpenWakeWord)');
trigger = new WakeWordService(hardwareConfig.wakeWord, hardwareConfig.audio);
trigger = new WakeWordService(
hardwareConfig.wakeWord,
hardwareConfig.audio,
audioBackend === 'esp32' ? hardwareService : null,
);
} else {
logger.info('Trigger: keyboard (press Enter to talk)');
trigger = new KeyboardTriggerService();

View File

@ -1,30 +1,48 @@
import { ChildProcess, spawn } from 'node:child_process';
import { EventEmitter } from 'node:events';
import { type AudioConfig } from '../config/index.js';
import { type HardwareService } from '../hardware/index.js';
import { createLogger, type Logger } from '../utils/index.js';
export interface AudioServiceEvents {
/** Emitted when a raw PCM audio chunk is captured from the microphone */
/** Emitted when a raw PCM audio chunk is captured from the microphone. */
audio_chunk: (chunk: Buffer) => void;
/** Emitted when playback of a response finishes */
/** Emitted when playback of a response finishes. */
playback_done: () => void;
/** Emitted on audio errors */
/** Emitted on audio errors. */
error: (error: Error) => void;
}
/**
* Audio service for Raspberry Pi.
* Common audio interface used by the orchestrator, wake word service,
* and test scripts. Two backends implement it:
*
* Uses ALSA tools (arecord/aplay) via child processes.
* Works with any ALSA-compatible audio device:
* - I2S (INMP441 mic, MAX98357 amp) connected directly to Pi GPIO
* - USB audio devices
* - Default system audio
* - `AlsaAudioService` arecord/aplay child processes, for dev on a
* machine with a USB mic or when the Pi owns the I2S mic/speaker
* directly. Selected with `AUDIO_BACKEND=alsa`.
*
* Audio format: PCM signed 16-bit little-endian, mono, 16kHz
* - `Esp32AudioService` mic and speaker live on the ESP32; audio
* flows over UART via `HardwareService`. Selected with
* `AUDIO_BACKEND=esp32` (the default in production).
*/
export class AudioService extends EventEmitter {
export abstract class AudioService extends EventEmitter {
  // ── State ──────────────────────────────────────────────────────

  /** Whether microphone capture is currently active. */
  abstract get isCapturing(): boolean;

  /** Whether a playback is currently in progress. */
  abstract get isPlaying(): boolean;

  // ── Capture ────────────────────────────────────────────────────

  /** Begins microphone capture. */
  abstract startCapture(): void;

  /** Ends microphone capture. */
  abstract stopCapture(): void;

  // ── Playback ───────────────────────────────────────────────────

  /**
   * Plays the given audio.
   *
   * @param audioBuffer Audio bytes to play.
   * @returns Promise settled by the backend; presumably when playback
   *   has finished (confirm against each implementation).
   */
  abstract play(audioBuffer: Buffer): Promise<void>;

  /** Interrupts the current playback, if any. */
  abstract stopPlayback(): void;

  // ── Lifecycle ──────────────────────────────────────────────────

  /** Shuts the backend down. */
  abstract destroy(): Promise<void>;
}
// ─────────────────────────────────────────────────────────────────
// ALSA backend — kept for dev on laptops and for Pi setups where
// the mic/speaker hang off ALSA directly (USB sound card, HAT…).
// ─────────────────────────────────────────────────────────────────
export class AlsaAudioService extends AudioService {
private captureProcess: ChildProcess | null = null;
private playProcess: ChildProcess | null = null;
private readonly logger: Logger;
private _isCapturing = false;
private _isPlaying = false;
@ -32,7 +50,7 @@ export class AudioService extends EventEmitter {
constructor(private readonly config: AudioConfig) {
super();
this.logger = createLogger('audio', 'info');
this.logger = createLogger('audio:alsa', 'info');
}
get isCapturing(): boolean {
@ -43,10 +61,6 @@ export class AudioService extends EventEmitter {
return this._isPlaying;
}
/**
* Start capturing audio from the microphone.
* Emits 'audio_chunk' events with raw PCM buffers.
*/
startCapture(): void {
if (this._isCapturing) {
this.logger.warn('Already capturing audio');
@ -58,13 +72,6 @@ export class AudioService extends EventEmitter {
'Starting audio capture',
);
// arecord outputs raw PCM to stdout
// -D: ALSA device
// -f: format (S16_LE = signed 16-bit little-endian)
// -r: sample rate
// -c: channels
// -t: type (raw = no header)
// --buffer-size: in frames, controls latency
const bufferFrames = Math.floor(this.config.sampleRate * (this.config.chunkDurationMs / 1000));
this.captureProcess = spawn('arecord', [
@ -112,9 +119,6 @@ export class AudioService extends EventEmitter {
});
}
/**
* Stop capturing audio from the microphone.
*/
stopCapture(): void {
if (!this.captureProcess) return;
@ -125,12 +129,6 @@ export class AudioService extends EventEmitter {
this._isCapturing = false;
}
/**
* Play audio through the speaker.
* Accepts either raw PCM or WAV (with RIFF header) data.
*
* @returns Promise that resolves when playback is complete
*/
async play(audioBuffer: Buffer): Promise<void> {
if (this._isPlaying) {
this.logger.warn('Already playing audio, queueing...');
@ -152,24 +150,26 @@ export class AudioService extends EventEmitter {
'-',
];
const playProcess = spawn('aplay', args, {
this.playProcess = spawn('aplay', args, {
stdio: ['pipe', 'ignore', 'pipe'],
});
playProcess.stderr?.on('data', (data: Buffer) => {
this.playProcess.stderr?.on('data', (data: Buffer) => {
const msg = data.toString().trim();
if (msg && !msg.startsWith('Playing') && !msg.startsWith('Warning')) {
this.logger.error({ msg }, 'aplay stderr');
}
});
playProcess.on('error', (err) => {
this.playProcess.on('error', (err) => {
this._isPlaying = false;
this.playProcess = null;
reject(new Error(`Audio playback failed: ${err.message}`));
});
playProcess.on('exit', (code) => {
this.playProcess.on('exit', (code) => {
this._isPlaying = false;
this.playProcess = null;
if (code === 0 || code === null) {
this.emit('playback_done');
resolve();
@ -178,26 +178,194 @@ export class AudioService extends EventEmitter {
}
});
// Write audio data to aplay's stdin and close it
playProcess.stdin?.write(audioBuffer);
playProcess.stdin?.end();
this.playProcess.stdin?.write(audioBuffer);
this.playProcess.stdin?.end();
});
}
/**
* Stop any currently playing audio.
*/
stopPlayback(): void {
// aplay is spawned per-play, so we can't easily stop it here
// For interrupt support, we'd track the play process
if (this.playProcess) {
this.playProcess.kill('SIGTERM');
this.playProcess = null;
}
this._isPlaying = false;
}
/**
* Clean up resources.
*/
async destroy(): Promise<void> {
this.stopCapture();
this.stopPlayback();
this.removeAllListeners();
}
}
// ─────────────────────────────────────────────────────────────────
// ESP32 backend — the mic and speaker live on the firmware side and
// audio flows over the UART link owned by HardwareService.
// ─────────────────────────────────────────────────────────────────
/**
 * Bytes-per-chunk written to the ESP32 per AUDIO_DOWN frame. Must
 * match `AUDIO_DOWN_CHUNK_BYTES` in HardwareService. 640 bytes =
 * 20 ms of 16 kHz S16 mono audio (640 / (16000 samples/s × 2 bytes)).
 */
const ESP32_CHUNK_BYTES = 640;
/**
 * Milliseconds we wait between two AUDIO_DOWN frames during playback.
 *
 * This is 2 ms shorter than the 20 ms of audio carried by one chunk, so
 * the sender runs slightly ahead of real time.
 * NOTE(review): presumably the margin absorbs timer jitter so the speaker
 * never starves — confirm it was chosen deliberately.
 */
const ESP32_PACING_MS = 18;
export class Esp32AudioService extends AudioService {
  private readonly logger: Logger;
  private _isCapturing = false;
  private _isPlaying = false;
  private _playbackAbort = false;

  /**
   * Stable handler reference for the hardware `audio_up` event, so the
   * exact same function can be passed to `off()` in `stopCapture()`.
   * Chunks are re-emitted as `audio_chunk` only while capturing.
   */
  private readonly forwardAudioUp = (chunk: Buffer): void => {
    if (this._isCapturing) {
      this.emit('audio_chunk', chunk);
    }
  };

  /**
   * @param _config   accepted for signature parity with the other backend; unused here
   * @param hardware  the service that carries mic/speaker audio to and from the ESP32
   */
  constructor(
    _config: AudioConfig,
    private readonly hardware: HardwareService,
  ) {
    super();
    void _config;
    this.logger = createLogger('audio:esp32', 'info');
  }

  get isCapturing(): boolean {
    return this._isCapturing;
  }

  get isPlaying(): boolean {
    return this._isPlaying;
  }

  /** Begin forwarding hardware `audio_up` chunks as `audio_chunk` events. */
  startCapture(): void {
    if (this._isCapturing) {
      this.logger.warn('Already capturing audio');
      return;
    }

    this.logger.info('Subscribing to ESP32 AUDIO_UP stream');
    this._isCapturing = true;
    // One subscription per capture session; stopCapture() removes it.
    this.hardware.on('audio_up', this.forwardAudioUp);
  }

  /** Stop forwarding mic chunks. A no-op when not capturing. */
  stopCapture(): void {
    if (!this._isCapturing) {
      return;
    }

    this.logger.info('Unsubscribing from ESP32 AUDIO_UP stream');
    this._isCapturing = false;
    this.hardware.off('audio_up', this.forwardAudioUp);
  }

  /**
   * Stream a PCM buffer to the ESP32 speaker, `ESP32_CHUNK_BYTES` at a
   * time, sleeping `ESP32_PACING_MS` after each frame so the buffer is
   * not handed to the serial layer all at once.
   *
   * A WAV header, if present, is removed first. A call made while
   * another playback is running is dropped with a warning. An empty
   * buffer emits `playback_done` immediately.
   *
   * `playback_done` is also emitted when the loop ends early because
   * `stopPlayback()` was called.
   */
  async play(audioBuffer: Buffer): Promise<void> {
    if (this._isPlaying) {
      this.logger.warn('Already playing audio — ignoring new buffer');
      return;
    }

    const pcm = stripWavHeader(audioBuffer);
    if (pcm.length === 0) {
      this.emit('playback_done');
      return;
    }

    this._isPlaying = true;
    this._playbackAbort = false;

    try {
      let cursor = 0;
      while (cursor < pcm.length && !this._playbackAbort) {
        const end = cursor + ESP32_CHUNK_BYTES;
        this.hardware.sendAudioDown(pcm.subarray(cursor, end));
        cursor = end;

        if (ESP32_PACING_MS > 0) {
          await delay(ESP32_PACING_MS);
        }
      }

      await this.drainQuietly();
      this.emit('playback_done');
    } finally {
      this._isPlaying = false;
      this._playbackAbort = false;
    }
  }

  /** Ask the running `play()` loop to stop before its next frame. */
  stopPlayback(): void {
    if (this._isPlaying) {
      this.logger.info('Aborting playback');
      this._playbackAbort = true;
    }
  }

  /** Stop capture and playback, then drop every listener on this emitter. */
  async destroy(): Promise<void> {
    this.stopCapture();
    this.stopPlayback();
    this.removeAllListeners();
  }

  /**
   * Wait for the hardware layer to drain pending AUDIO_DOWN data.
   * A failure is logged and swallowed so playback still completes.
   */
  private async drainQuietly(): Promise<void> {
    try {
      await this.hardware.drainAudioDown();
    } catch (err) {
      this.logger.warn({ err }, 'drain after playback failed');
    }
  }
}
// ─────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────
/** Promise-based sleep: resolves once a `setTimeout` of `ms` milliseconds fires. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Return the raw PCM payload of `buf`.
 *
 * If `buf` is a RIFF/WAVE container, the chunk list is walked to find
 * the `data` chunk and only that chunk's body is returned. Walking the
 * chunks (rather than assuming a fixed 44-byte header) keeps extra
 * chunks such as `LIST`, `FLLR` or an extended `fmt ` out of the audio
 * stream, and drops any chunks that follow the audio.
 *
 * Anything that is not RIFF/WAVE is assumed to already be raw PCM and
 * is returned unchanged (same Buffer instance).
 *
 * @param buf  raw PCM bytes or a complete WAV file
 * @returns    a view on the PCM bytes (no copy); may be empty
 */
function stripWavHeader(buf: Buffer): Buffer {
  const RIFF_PREAMBLE_BYTES = 12; // 'RIFF' + u32 size + 'WAVE'
  const CHUNK_HEADER_BYTES = 8; // 4-char id + u32 little-endian size
  const LEGACY_HEADER_BYTES = 44; // canonical PCM WAV header

  const isWav =
    buf.length >= RIFF_PREAMBLE_BYTES &&
    buf.toString('ascii', 0, 4) === 'RIFF' &&
    buf.toString('ascii', 8, 12) === 'WAVE';
  if (!isWav) return buf;

  let offset = RIFF_PREAMBLE_BYTES;
  while (offset + CHUNK_HEADER_BYTES <= buf.length) {
    const chunkId = buf.toString('ascii', offset, offset + 4);
    const chunkSize = buf.readUInt32LE(offset + 4);
    const bodyStart = offset + CHUNK_HEADER_BYTES;

    if (chunkId === 'data') {
      const declaredEnd = bodyStart + chunkSize;
      // Streaming encoders write a placeholder size (0 or 0xFFFFFFFF)
      // because the length is unknown up front: take everything left.
      const sizeUnknown = chunkSize === 0 || declaredEnd > buf.length;
      return buf.subarray(bodyStart, sizeUnknown ? buf.length : declaredEnd);
    }

    // RIFF chunk bodies are padded to an even number of bytes.
    offset = bodyStart + chunkSize + (chunkSize % 2);
  }

  // Malformed container without a reachable `data` chunk: fall back to
  // the canonical header length, or to "no audio" for a bare header.
  return buf.subarray(Math.min(LEGACY_HEADER_BYTES, buf.length));
}
// ─────────────────────────────────────────────────────────────────
// Factory
// ─────────────────────────────────────────────────────────────────
export type AudioBackend = 'alsa' | 'esp32';
/**
 * Build the AudioService implementation selected by `config.backend`.
 *
 * - `alsa`  → `AlsaAudioService` (arecord/aplay; handy for laptop
 *   development without an ESP32 attached).
 * - `esp32` → `Esp32AudioService`; this is the default when
 *   `config.backend` is unset and it needs a non-null `hardware`.
 *
 * @throws Error when the esp32 backend is selected without a
 *         HardwareService, or when the backend name is not recognised.
 */
export function createAudioService(
  config: AudioConfig,
  hardware: HardwareService | null,
): AudioService {
  const backend = (config.backend ?? 'esp32') as AudioBackend;

  switch (backend) {
    case 'alsa':
      return new AlsaAudioService(config);

    case 'esp32': {
      if (hardware) {
        return new Esp32AudioService(config, hardware);
      }
      throw new Error(
        'AUDIO_BACKEND=esp32 requires a connected HardwareService — ' +
          'set HARDWARE_SERIAL_ENABLED=true and make sure the ESP32 is reachable, ' +
          'or switch to AUDIO_BACKEND=alsa for local development.',
      );
    }

    default:
      throw new Error(`Unknown AUDIO_BACKEND: ${backend}`);
  }
}

View File

@ -1,4 +1,10 @@
export { AudioService } from './audio.service.js';
export {
AudioService,
AlsaAudioService,
Esp32AudioService,
createAudioService,
type AudioBackend,
} from './audio.service.js';
export { WakeWordService } from './wake-word.service.js';
export { KeyboardTriggerService } from './keyboard-trigger.service.js';
export { HealthService } from './health.service.js';

View File

@ -1,24 +1,35 @@
import { ChildProcess, spawn } from 'node:child_process';
import { EventEmitter } from 'node:events';
import { type WakeWordConfig, type AudioConfig } from '../config/index.js';
import { type HardwareService } from '../hardware/index.js';
import { createLogger, type Logger } from '../utils/index.js';
export interface WakeWordServiceEvents {
/** Emitted when the wake word is detected */
detected: () => void;
/** Emitted when the engine is ready */
ready: () => void;
/** Emitted on errors */
error: (error: Error) => void;
}
/**
* Wake word detection service.
*
* Runs OpenWakeWord as a **long-lived** Python subprocess.
* The model is loaded once at startup; pause/resume is handled via
* PAUSE/RESUME commands on stdin, so the audio device is released
* while arecord is capturing, then reclaimed when listening resumes.
* Two operating modes, selected by whether a HardwareService is passed
* to the constructor:
*
* 1. **ALSA mode** (no HardwareService)
* The Python subprocess opens PyAudio on `audioConfig.captureDevice`
* and reads the mic directly. Pause releases the ALSA device so
* arecord (the AlsaAudioService) can use it during conversation.
*
* 2. **ESP32 mode** (HardwareService provided)
* The Python subprocess reads raw S16 mono PCM from stdin. We
* subscribe to `hardware.on('audio_up')` and pipe every mic chunk
* coming off the UART straight into the Python process. Control
* commands (PAUSE/RESUME/RESET/QUIT) go over a separate pipe at
* fd 3 because stdin is busy carrying audio.
*
* The model is loaded once at startup; pause/resume is cheap and
* does not reload it.
*/
export class WakeWordService extends EventEmitter {
private process: ChildProcess | null = null;
@ -26,51 +37,73 @@ export class WakeWordService extends EventEmitter {
private _isListening = false;
private _isPaused = false;
private _streamClosed = false;
private readonly usesHardware: boolean;
/** Latched forwarder so we can detach it on stop / error. */
private readonly forwardMicChunk = (chunk: Buffer): void => {
if (!this.process || !this.process.stdin || this.process.stdin.destroyed) return;
// Node gracefully buffers writes if the pipe is full; we don't
// apply back-pressure here because dropping wake-word audio would
// just hurt detection accuracy for a few tens of ms.
this.process.stdin.write(chunk);
};
constructor(
private readonly wakeWordConfig: WakeWordConfig,
private readonly audioConfig: AudioConfig,
private readonly hardware: HardwareService | null = null,
) {
super();
this.logger = createLogger('wake-word', 'info');
this.usesHardware = hardware !== null;
}
get isListening(): boolean {
return this._isListening && !this._isPaused;
}
/**
* Start the wake word Python subprocess.
* The model is loaded once; subsequent pause/resume cycles are fast.
*/
start(): void {
if (this.process) {
// Process already running — just resume if paused
if (this._isPaused) {
this.resume();
}
if (this._isPaused) this.resume();
return;
}
this.logger.info(
{ model: this.wakeWordConfig.modelName, threshold: this.wakeWordConfig.threshold },
{
mode: this.usesHardware ? 'esp32' : 'alsa',
model: this.wakeWordConfig.modelName,
threshold: this.wakeWordConfig.threshold,
},
'Starting wake word detection',
);
this.process = spawn(this.wakeWordConfig.pythonPath, [
const args = [
this.wakeWordConfig.scriptPath,
'--model', this.wakeWordConfig.modelName,
'--threshold', String(this.wakeWordConfig.threshold),
'--device', this.audioConfig.captureDevice,
'--sample-rate', String(this.audioConfig.sampleRate),
], {
stdio: ['pipe', 'pipe', 'pipe'],
});
];
if (this.usesHardware) {
args.push('--input', 'stdin', '--control-fd', '3');
} else {
args.push('--input', 'alsa', '--device', this.audioConfig.captureDevice);
}
// stdio layout:
// 0: stdin — audio in (ESP32 mode) or control (ALSA mode)
// 1: stdout — DETECTED events
// 2: stderr — status & log lines
// 3: extra — control pipe (ESP32 mode only)
const stdio: ('pipe' | 'ignore')[] = this.usesHardware
? ['pipe', 'pipe', 'pipe', 'pipe']
: ['pipe', 'pipe', 'pipe'];
this.process = spawn(this.wakeWordConfig.pythonPath, args, { stdio });
this._isListening = true;
this._isPaused = false;
// ── stdout: DETECTED events ──
this.process.stdout?.on('data', (data: Buffer) => {
const lines = data.toString().trim().split('\n');
for (const line of lines) {
@ -83,7 +116,6 @@ export class WakeWordService extends EventEmitter {
}
});
// ── stderr: status messages ──
this.process.stderr?.on('data', (data: Buffer) => {
const lines = data.toString().trim().split('\n');
for (const line of lines) {
@ -107,10 +139,9 @@ export class WakeWordService extends EventEmitter {
this.logger.info('⏳ Loading wake word model...');
} else if (msg.startsWith('Wake word model loaded')) {
this.logger.info('✅ Wake word model loaded');
} else if (msg.startsWith('Matched device') || msg.startsWith('Using device')) {
} else if (msg.startsWith('Matched device') || msg.startsWith('Using device') || msg.startsWith('Listening')) {
this.logger.info(`🔊 ${msg}`);
} else {
// Log unknown stderr messages at warn level to catch errors
this.logger.warn({ msg }, 'Wake word stderr');
}
}
@ -119,29 +150,36 @@ export class WakeWordService extends EventEmitter {
this.process.on('error', (err) => {
this._isListening = false;
this.logger.error({ err }, 'Wake word process error');
this.detachHardware();
this.emit('error', new Error(`Wake word process failed: ${err.message}`));
});
this.process.on('exit', (code) => {
this._isListening = false;
this._isPaused = false;
this.detachHardware();
this.process = null;
if (code !== 0 && code !== null) {
this.logger.warn({ code }, 'Wake word process exited unexpectedly');
// Auto-restart after a short delay
setTimeout(() => {
this.logger.info('Restarting wake word detection...');
this.start();
}, 2000);
}
});
// In ESP32 mode, start piping mic audio from the UART.
if (this.usesHardware && this.hardware) {
this.hardware.on('audio_up', this.forwardMicChunk);
}
}
/**
* Pause wake word detection.
* Sends PAUSE command to Python subprocess which closes the audio stream,
* freeing the device for arecord. Returns a promise that resolves when
* the audio stream is confirmed closed.
*
* In ALSA mode we must wait for STREAM_CLOSED so arecord can reclaim
* the device. In ESP32 mode the audio flow never stops we just
* tell the Python process to ignore detections.
*/
pause(): Promise<void> {
if (!this.process || this._isPaused) return Promise.resolve();
@ -149,9 +187,13 @@ export class WakeWordService extends EventEmitter {
this._isPaused = true;
this._streamClosed = false;
this.process.stdin?.write('PAUSE\n');
this.writeControl('PAUSE');
if (this.usesHardware) {
// No physical device to release — resolve immediately.
return Promise.resolve();
}
// Wait for the stream to be closed (so arecord can use the device)
return new Promise((resolve) => {
const checkInterval = setInterval(() => {
if (this._streamClosed || !this.process) {
@ -160,7 +202,6 @@ export class WakeWordService extends EventEmitter {
}
}, 50);
// Safety timeout
setTimeout(() => {
clearInterval(checkInterval);
resolve();
@ -168,25 +209,18 @@ export class WakeWordService extends EventEmitter {
});
}
/**
* Resume wake word detection after pause.
* The Python subprocess reopens the audio stream (fast, no model reload).
*/
resume(): void {
if (!this.process || !this._isPaused) return;
this._isPaused = false;
this.process.stdin?.write('RESUME\n');
this.writeControl('RESUME');
this.logger.info('🎤 Resuming wake word listening...');
}
/**
* Stop wake word detection permanently.
*/
stop(): void {
if (this.process) {
this.process.stdin?.write('QUIT\n');
// Give it a moment to exit cleanly, then force kill
this.writeControl('QUIT');
this.detachHardware();
setTimeout(() => {
if (this.process) {
this.process.kill('SIGTERM');
@ -198,4 +232,35 @@ export class WakeWordService extends EventEmitter {
this._isPaused = false;
this.removeAllListeners();
}
// ──────────────────────────────────────────────────────────
// Internals
// ──────────────────────────────────────────────────────────
/**
 * Send one newline-terminated control command to the Python process.
 *
 * In ALSA mode the command goes to the child's stdin. In ESP32 mode
 * stdin carries audio, so the command is written to the extra pipe at
 * fd 3 (`process.stdio[3]`) instead. Nothing is written when there is
 * no child process, or when the control pipe is missing or destroyed.
 *
 * @param cmd  command word without the trailing newline (e.g. `PAUSE`)
 */
private writeControl(cmd: string): void {
  const child = this.process;
  if (!child) return;

  const line = `${cmd}\n`;

  if (!this.usesHardware) {
    child.stdin?.write(line);
    return;
  }

  // stdio[3] is typed as Readable | Writable | null | undefined. Narrow
  // it with a runtime check rather than a double cast, so a pipe that is
  // not writable is skipped instead of being trusted blindly.
  const control = child.stdio[3];
  if (control && 'write' in control && !control.destroyed) {
    control.write(line);
  }
}
/** Remove the `audio_up` forwarder from the hardware service (ESP32 mode only). */
private detachHardware(): void {
  if (!this.usesHardware || !this.hardware) {
    return;
  }
  this.hardware.off('audio_up', this.forwardMicChunk);
}
}

View File

@ -0,0 +1,7 @@
{
"name": "Audio",
"version": "0.1.0",
"description": "Ti-Pote audio I/O — INMP441 mic + MAX98357A speaker via two I2S peripherals.",
"frameworks": "arduino",
"platforms": "espressif32"
}

View File

@ -0,0 +1,151 @@
#include "Audio.h"
#include <driver/i2s.h>
namespace tipote {
// ─────────────────────────────────────────────────────────────────
// Shared I2S bus pin assignment — see the header for rationale.
// ─────────────────────────────────────────────────────────────────
static constexpr int PIN_BCLK = 32; // shared: mic SCK + speaker BCLK
static constexpr int PIN_LRCLK = 33; // shared: mic WS + speaker LRC
static constexpr int PIN_MIC_DIN = 34; // INMP441 SD → ESP32 data-in
static constexpr int PIN_SPK_DOUT = 22; // MAX98357A DIN ← ESP32 data-out
// DMA buffers — 4 × 256 × 8 bytes (stereo 32-bit) ≈ 8 KB each for
// RX and TX. That's ~64 ms of audio each way at 16 kHz, plenty of
// room to absorb UART jitter.
static constexpr int DMA_COUNT = 4;
static constexpr int DMA_LEN = 256;
bool Audio::begin() {
// ───── Single I2S port, full duplex, 32-bit stereo slots ─────
//
// The INMP441 requires 32-bit slots; the MAX98357A happily reads
// the 32-bit frames we emit. With a shared bus we get one set of
// BCLK/WS for both sides — exactly like the Pi setup that worked.
i2s_config_t cfg = {};
cfg.mode = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
I2S_MODE_RX |
I2S_MODE_TX);
cfg.sample_rate = SAMPLE_RATE;
cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT;
cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT; // stereo frames
cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
cfg.dma_buf_count = DMA_COUNT;
cfg.dma_buf_len = DMA_LEN;
cfg.use_apll = false;
cfg.tx_desc_auto_clear = true;
cfg.fixed_mclk = 0;
if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) {
return false;
}
i2s_pin_config_t pins = {};
pins.bck_io_num = PIN_BCLK;
pins.ws_io_num = PIN_LRCLK;
pins.data_out_num = PIN_SPK_DOUT;
pins.data_in_num = PIN_MIC_DIN;
if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
i2s_driver_uninstall(I2S_NUM_0);
return false;
}
i2s_zero_dma_buffer(I2S_NUM_0);
micStarted_ = true;
spkStarted_ = true;
return true;
}
// Pull whatever the mic DMA currently holds and convert it to S16 mono
// little-endian PCM.
//
//   out          destination byte buffer (any alignment)
//   outCapacity  size of `out` in bytes; at most 320 samples (640 bytes)
//                are produced per call
//   returns      number of bytes written to `out` (always even); 0 when
//                the mic is not started, `out` is null, the buffer is
//                too small, the read fails or no full frame is ready
//
// Non-blocking: i2s_read() is called with a zero timeout. lastStats_ is
// refreshed only when at least one sample pair was converted.
size_t Audio::readMicChunk(uint8_t* out, size_t outCapacity) {
    if (!micStarted_ || out == nullptr || outCapacity < 2) return 0;

    // Stereo read: each "sample pair" is L + R, each 32-bit = 8 bytes.
    // Cap at 320 pairs = 20 ms @ 16 kHz mono per call.
    constexpr size_t MAX_PAIRS = 320;
    int32_t raw[MAX_PAIRS * 2];

    size_t wantPairs = outCapacity / 2;  // 2 bytes out per mono sample
    if (wantPairs > MAX_PAIRS) wantPairs = MAX_PAIRS;

    size_t bytesRead = 0;
    const esp_err_t err = i2s_read(
        I2S_NUM_0,
        reinterpret_cast<void*>(raw),
        wantPairs * 2 * sizeof(int32_t),
        &bytesRead,
        0  // non-blocking
    );
    if (err != ESP_OK || bytesRead == 0) return 0;

    const size_t pairs = bytesRead / (2 * sizeof(int32_t));
    // A partial frame carries no usable sample; leave the previous
    // stats alone rather than publishing the min/max sentinels below.
    if (pairs == 0) return 0;

    int32_t lMin = INT32_MAX, lMax = INT32_MIN;
    int32_t rMin = INT32_MAX, rMax = INT32_MIN;
    int16_t s16Min = INT16_MAX, s16Max = INT16_MIN;

    const bool pickRight = (micChannel_ == MicChannel::Right);

    for (size_t i = 0; i < pairs; ++i) {
        const int32_t L = raw[2 * i];
        const int32_t R = raw[2 * i + 1];

        if (L < lMin) lMin = L;
        if (L > lMax) lMax = L;
        if (R < rMin) rMin = R;
        if (R > rMax) rMax = R;

        // INMP441 is 24-bit left-justified in a 32-bit slot, so the
        // useful range lives in bits 31..8. A >> 14 gives a comfortable
        // speech level; bump to >> 11 if the result is too quiet.
        const int32_t src = pickRight ? R : L;
        int32_t s = src >> 14;
        if (s > INT16_MAX) s = INT16_MAX;
        if (s < INT16_MIN) s = INT16_MIN;
        const int16_t s16 = static_cast<int16_t>(s);

        if (s16 < s16Min) s16Min = s16;
        if (s16 > s16Max) s16Max = s16;

        // Store byte by byte: `out` is a plain byte buffer with no
        // alignment guarantee, and this pins the documented
        // little-endian wire format regardless of host byte order.
        const uint16_t bits = static_cast<uint16_t>(s16);
        out[2 * i]     = static_cast<uint8_t>(bits & 0xFFu);
        out[2 * i + 1] = static_cast<uint8_t>(bits >> 8);
    }

    lastStats_ = {lMin, lMax, rMin, rMax, s16Min, s16Max, pairs};
    return pairs * 2;
}
size_t Audio::writeSpeakerChunk(const uint8_t* data, size_t len) {
if (!spkStarted_ || len == 0) return 0;
// The UART brings us S16 mono PCM. The I2S bus is running as
// 32-bit stereo, so we expand each 16-bit sample to a stereo
// pair of 32-bit words. 320 input samples → 2560 output bytes.
constexpr size_t MAX_IN_SAMPLES = 320;
const size_t inSamples = (len / 2 > MAX_IN_SAMPLES) ? MAX_IN_SAMPLES : len / 2;
int32_t stereo[MAX_IN_SAMPLES * 2];
const int16_t* src = reinterpret_cast<const int16_t*>(data);
for (size_t i = 0; i < inSamples; ++i) {
// Shift up to place the sample in the upper 16 bits of the
// 32-bit slot (matches what the MAX98357A expects).
const int32_t s32 = static_cast<int32_t>(src[i]) << 16;
stereo[2 * i] = s32; // left
stereo[2 * i + 1] = s32; // right duplicated
}
size_t bytesWritten = 0;
i2s_write(I2S_NUM_0, stereo, inSamples * 2 * sizeof(int32_t),
&bytesWritten, pdMS_TO_TICKS(50));
// Report bytes accepted in *caller units* (S16 mono) so the
// outside world doesn't need to know about our internal format.
const size_t pairsWritten = bytesWritten / (2 * sizeof(int32_t));
return pairsWritten * 2;
}
void Audio::flushSpeaker() {
    // Nothing to clear until begin() has brought the speaker side up.
    if (!spkStarted_) {
        return;
    }
    // Zero the DMA buffers of the shared I2S port.
    i2s_zero_dma_buffer(I2S_NUM_0);
}
} // namespace tipote

View File

@ -0,0 +1,84 @@
// Ti-Pote — Audio I/O via a single full-duplex I2S bus.
//
// I2S_NUM_0 is configured as MASTER in RX+TX mode. BCLK and WS are
// shared between the INMP441 microphone (RX) and the MAX98357A
// amplifier (TX), which is the standard I2S bus layout — exactly
// what was working on the Raspberry Pi side.
//
// Pin map (single shared I2S bus):
// BCLK = GPIO 32 shared mic SCK + speaker BCLK
// LRCLK / WS = GPIO 33 shared mic WS + speaker LRC
// Mic data in = GPIO 34 INMP441 SD (input-only pin, perfect)
// Speaker DOUT = GPIO 22 MAX98357A DIN
//
// Mic L/R stays tied to GND → talks on the LEFT slot of the I2S frame.
//
// Format exchanged with the Pi on the UART:
// PCM signed 16-bit little-endian, mono, 16 kHz.
//
// Internally the bus runs at 32-bit stereo slots (INMP441 requires it).
// readMicChunk() converts the 32-bit left slot down to S16 mono.
// writeSpeakerChunk() expands S16 mono to 32-bit stereo frames before
// handing them to i2s_write().
#pragma once
#include <Arduino.h>
#include <stdint.h>
#include <stddef.h>
namespace tipote {
// Audio I/O facade over the single full-duplex I2S bus (I2S_NUM_0):
// microphone capture and speaker playback, both exchanged with callers
// as S16 mono little-endian PCM at SAMPLE_RATE.
class Audio {
public:
static constexpr int SAMPLE_RATE = 16000;
static constexpr int CHANNELS = 1;
static constexpr int BYTES_PER_SAMPLE = 2; // S16
// Install and configure the shared I2S port (mic RX + speaker TX on
// I2S_NUM_0). Returns false if the driver cannot be installed or the
// pins cannot be assigned. Call exactly once from setup().
bool begin();
// Pull whatever the mic DMA has ready. Writes S16 mono little-endian
// bytes into `out`, up to `outCapacity` bytes, and returns the number
// of bytes actually written (always even, possibly zero). At most
// 320 samples (640 bytes, 20 ms) are produced per call.
//
// Non-blocking (timeout = 0).
size_t readMicChunk(uint8_t* out, size_t outCapacity);
// Push S16 mono little-endian PCM to the speaker DMA. Blocks up to
// ~50 ms waiting for room. Returns bytes actually accepted; at most
// 320 samples (640 bytes) are consumed per call, so callers with
// larger buffers must loop on the return value.
size_t writeSpeakerChunk(const uint8_t* data, size_t len);
// Zero the DMA buffers of the shared I2S port, dropping anything still
// pending for the speaker. Used on shutdown / reset.
void flushSpeaker();
// ─── Debug / bring-up ────────────────────────────────────────
//
// Stats updated on every readMicChunk() call, covering *this last
// batch only*. Handy to confirm the mic is actually clocking data
// into the ESP32 without blowing up the main audio path.
struct MicStats {
int32_t leftRawMin; // raw int32 sample on left I2S slot
int32_t leftRawMax;
int32_t rightRawMin; // raw int32 sample on right I2S slot
int32_t rightRawMax;
int16_t s16Min; // post-shift S16 sample (output channel)
int16_t s16Max;
size_t samples; // sample pairs in the batch
};
const MicStats& lastMicStats() const { return lastStats_; }
// Which I2S slot to route into the S16 output. Flip at runtime if
// the mic's L/R pin doesn't land where we expect.
enum class MicChannel { Left, Right };
void setMicChannel(MicChannel ch) { micChannel_ = ch; }
MicChannel micChannel() const { return micChannel_; }
private:
// Both flags are set together by a successful begin(); they gate the
// mic and speaker entry points respectively.
bool micStarted_ = false;
bool spkStarted_ = false;
MicChannel micChannel_ = MicChannel::Left;
MicStats lastStats_ = {0, 0, 0, 0, 0, 0, 0};
};
} // namespace tipote

View File

@ -30,6 +30,11 @@ build_flags =
-DHW_SERIAL_BAUD=921600
; Idle timeout before the eyes fall back to the default animation (ms)
-DHW_HEARTBEAT_TIMEOUT_MS=5000
; Hardware UART2 pins used to talk to the Raspberry Pi.
; The OLED eyes already claim GPIO 16/17 (UART2 default pins),
; so Serial2 is remapped to these two free pins instead.
-DHW_UART_RX_PIN=27
-DHW_UART_TX_PIN=13
build_unflags =
-std=gnu++11

View File

@ -0,0 +1,219 @@
/**
* Ti-Pote Play a PCM/WAV file on the ESP32 speaker over USB.
*
* Usage:
* pnpm esp:play <file.wav|file.raw>
*
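* Accepts:
* - raw S16 LE mono 16 kHz PCM (.raw / .pcm)
* - a .wav file (the RIFF header is stripped before sending)
* - any other extension (m4a, mp3, …): converted first to 16 kHz mono
*   S16 LE WAV with the macOS `afconvert` tool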
*
* Default port: auto-detected, override with ESP_PORT=/dev/cu.usbserial-XXX
*/
import { execFileSync } from 'node:child_process';
import { existsSync, mkdtempSync, readFileSync, readdirSync, rmSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join, extname } from 'node:path';
import { SerialPort } from 'serialport';
const SAMPLE_RATE = 16000;
/**
 * Pick the serial device used to reach the ESP32.
 *
 * `ESP_PORT` wins when set. Otherwise the first entry of `/dev` whose
 * name starts with one of the known USB-serial prefixes is used.
 *
 * @throws Error when no candidate device exists
 */
function findDefaultPort(): string {
  const override = process.env.ESP_PORT;
  if (override) return override;

  const knownPrefixes = ['cu.usbserial', 'cu.SLAB_', 'cu.wchusbserial'];
  const firstMatch = readdirSync('/dev').find((entry) =>
    knownPrefixes.some((prefix) => entry.startsWith(prefix)),
  );

  if (firstMatch === undefined) {
    throw new Error(
      'No ESP32 serial port detected. Plug the board in, or set ESP_PORT=/dev/cu.usbserial-XXX',
    );
  }
  return `/dev/${firstMatch}`;
}
/**
 * Return the raw PCM payload of `buf`.
 *
 * For a RIFF/WAVE file the chunk list is walked to the `data` chunk and
 * only that chunk's body is returned. Converted files are not
 * guaranteed to have the canonical 44-byte header: extra chunks (for
 * example `LIST` or a filler chunk) may precede the audio, and skipping
 * a fixed 44 bytes would send them to the speaker as noise.
 *
 * Buffers that are not RIFF/WAVE are treated as raw PCM and returned
 * unchanged (same Buffer instance).
 *
 * @param buf  raw PCM bytes or a complete WAV file
 * @returns    a view on the PCM bytes (no copy); may be empty
 */
function stripWav(buf: Buffer): Buffer {
  const RIFF_PREAMBLE_BYTES = 12; // 'RIFF' + u32 size + 'WAVE'
  const CHUNK_HEADER_BYTES = 8; // 4-char id + u32 little-endian size
  const LEGACY_HEADER_BYTES = 44; // canonical PCM WAV header

  const isWav =
    buf.length >= RIFF_PREAMBLE_BYTES &&
    buf.toString('ascii', 0, 4) === 'RIFF' &&
    buf.toString('ascii', 8, 12) === 'WAVE';
  if (!isWav) return buf;

  let offset = RIFF_PREAMBLE_BYTES;
  while (offset + CHUNK_HEADER_BYTES <= buf.length) {
    const chunkId = buf.toString('ascii', offset, offset + 4);
    const chunkSize = buf.readUInt32LE(offset + 4);
    const bodyStart = offset + CHUNK_HEADER_BYTES;

    if (chunkId === 'data') {
      const declaredEnd = bodyStart + chunkSize;
      // A placeholder size (0 or larger than the file) means the
      // encoder did not know the length: take everything that is left.
      const sizeUnknown = chunkSize === 0 || declaredEnd > buf.length;
      return buf.subarray(bodyStart, sizeUnknown ? buf.length : declaredEnd);
    }

    // RIFF chunk bodies are padded to an even number of bytes.
    offset = bodyStart + chunkSize + (chunkSize % 2);
  }

  // Malformed container without a reachable `data` chunk: fall back to
  // the canonical header length, or to "no audio" for a bare header.
  return buf.subarray(Math.min(LEGACY_HEADER_BYTES, buf.length));
}
/**
 * Transcode any audio file macOS can decode (m4a, mp3, ogg, aiff, …)
 * into an S16 LE mono 16 kHz WAV by shelling out to `afconvert`.
 *
 * The result is written to `converted.wav` inside a fresh temporary
 * directory. On success the caller owns that directory and must call
 * the returned `cleanup` to delete it. If `afconvert` fails, the
 * directory is removed here and an Error is thrown.
 *
 * @param inputPath  path of the file to transcode
 * @returns          the WAV path and a function that deletes the temp dir
 * @throws Error     with an `afconvert failed:` prefix when the tool fails
 */
function convertToEsp32Wav(inputPath: string): { wavPath: string; cleanup: () => void } {
  const workDir = mkdtempSync(join(tmpdir(), 'tipote-'));
  const removeWorkDir = (): void => {
    rmSync(workDir, { recursive: true, force: true });
  };

  const wavPath = join(workDir, 'converted.wav');
  const afconvertArgs = ['-f', 'WAVE', '-d', 'LEI16@16000', '-c', '1', inputPath, wavPath];

  console.log(`→ converting ${inputPath} → 16 kHz mono S16LE WAV`);
  try {
    execFileSync('afconvert', afconvertArgs, { stdio: 'inherit' });
  } catch (err) {
    removeWorkDir();
    throw new Error(`afconvert failed: ${(err as Error).message}`);
  }

  return { wavPath, cleanup: removeWorkDir };
}
/**
 * Entry point: load (and if needed convert) the audio file named on the
 * command line, then stream it to the ESP32 over the serial port.
 *
 * Protocol as used here: wait for a `READY` line, send `PLAY <bytes>`,
 * stream the PCM in paced bursts, then wait for `OK`. An `ERR …` line,
 * a port error or the watchdog timeout fails the run.
 *
 * The serial port, the watchdog timer and the temporary conversion
 * directory are released on every exit path, including failures.
 *
 * @throws Error on a missing file, empty or odd-sized PCM, port
 *         failure, firmware error or timeout
 */
async function main(): Promise<void> {
  const inPath = process.argv[2];
  if (!inPath) {
    console.error('Usage: esp-play.ts <file> (wav, raw, m4a, mp3, …)');
    process.exit(1);
  }
  if (!existsSync(inPath)) {
    throw new Error(`file not found: ${inPath}`);
  }

  // Convert anything that isn't already a .wav or raw PCM blob. This
  // covers m4a / mp3 / ogg / aiff / opus / flac via the built-in
  // macOS `afconvert` tool.
  const ext = extname(inPath).toLowerCase();
  const needsConversion = ext !== '.wav' && ext !== '.raw' && ext !== '.pcm';
  let cleanup: () => void = () => {};
  let loadPath = inPath;
  if (needsConversion) {
    const converted = convertToEsp32Wav(inPath);
    loadPath = converted.wavPath;
    cleanup = converted.cleanup;
  }

  // Resources that must be released whatever happens below.
  let openPort: SerialPort | undefined;
  let watchdog: ReturnType<typeof setTimeout> | undefined;

  try {
    const raw = readFileSync(loadPath);
    const pcm = stripWav(raw);
    const samples = pcm.length / 2;
    const durationMs = (samples / SAMPLE_RATE) * 1000;
    console.log(
      `→ loaded ${loadPath}: ${pcm.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)`,
    );
    if (pcm.length === 0) {
      throw new Error('empty PCM buffer');
    }
    if (pcm.length % 2 !== 0) {
      throw new Error(
        'PCM size must be a multiple of 2 (S16 mono). The source file is probably not 16-bit or not mono. If you passed a raw file, convert it first.',
      );
    }

    const path = findDefaultPort();
    console.log(`→ opening ${path} @ 921600 baud`);
    const port = new SerialPort({ path, baudRate: 921600, autoOpen: false });
    await new Promise<void>((resolve, reject) => {
      port.open((err) => (err ? reject(err) : resolve()));
    });
    openPort = port;

    let ready = false;
    const readyWaiters: Array<() => void> = [];

    const finished = new Promise<void>((resolve, reject) => {
      watchdog = setTimeout(
        () => reject(new Error(`timeout waiting for OK after ${durationMs + 8000} ms`)),
        durationMs + 8000,
      );
      let lineBuf = '';
      port.on('data', (data: Buffer) => {
        lineBuf += data.toString('utf8');
        let idx: number;
        while ((idx = lineBuf.indexOf('\n')) >= 0) {
          const line = lineBuf.slice(0, idx).replace(/\r$/, '').trim();
          lineBuf = lineBuf.slice(idx + 1);
          if (!line) continue;
          if (line === 'OK') {
            clearTimeout(watchdog);
            resolve();
            return;
          }
          if (line === 'READY') {
            ready = true;
            while (readyWaiters.length) readyWaiters.shift()!();
            continue;
          }
          if (line.startsWith('ERR ')) {
            clearTimeout(watchdog);
            reject(new Error(`firmware error: ${line.slice(4)}`));
            return;
          }
          if (line.startsWith('LOG ')) console.log(`[esp] ${line.slice(4)}`);
          else console.log(`[esp] ${line}`);
        }
      });
      port.on('error', reject);
    });

    // `finished` can settle long before it is awaited (firmware error,
    // port error, watchdog). Observing it here keeps an early rejection
    // from surfacing as an unhandled rejection, and lets the send loop
    // stop instead of pushing data at a firmware that already gave up.
    let settled = false;
    const markSettled = (): void => {
      settled = true;
    };
    finished.then(markSettled, markSettled);

    // Wait for READY so we don't send PLAY into the bootloader.
    await new Promise<void>((resolve, reject) => {
      if (ready) return resolve();
      const timer = setTimeout(
        () => reject(new Error('timeout waiting for READY from firmware')),
        5000,
      );
      readyWaiters.push(() => {
        clearTimeout(timer);
        resolve();
      });
    });
    await new Promise((r) => setTimeout(r, 50));

    console.log(`→ PLAY ${pcm.length} bytes`);
    port.write(`PLAY ${pcm.length}\n`);

    // Stream the payload paced close to the I2S consumption rate so the
    // ESP32 RX buffer stays roughly constant in size regardless of file
    // length. I2S consumes 16 kHz × 2 bytes/sample = 32 KB/s of S16
    // mono, so a 1024-byte burst is 32 ms of audio. Sleeping 30 ms
    // (slightly less) keeps the sender a little ahead so the DMA never
    // runs dry.
    const CHUNK = 1024;
    const PAUSE_MS = 30;
    for (let off = 0; off < pcm.length; off += CHUNK) {
      if (settled) break;
      const slice = pcm.subarray(off, off + CHUNK);
      await new Promise<void>((resolve, reject) => {
        port.write(slice, (err) => (err ? reject(err) : resolve()));
      });
      await new Promise<void>((resolve) => port.drain(() => resolve()));
      if (off + CHUNK < pcm.length) {
        await new Promise((r) => setTimeout(r, PAUSE_MS));
      }
    }

    // Resolves on OK; rethrows a firmware error, port error or timeout.
    await finished;
  } finally {
    if (watchdog !== undefined) clearTimeout(watchdog);
    if (openPort) {
      const portToClose = openPort;
      await new Promise<void>((resolve) => portToClose.close(() => resolve()));
    }
    cleanup();
  }

  console.log('✅ playback done');
}
main().catch((err) => {
console.error(err);
process.exit(1);
});

View File

@ -0,0 +1,190 @@
/**
* Ti-Pote Record audio from the ESP32 over USB.
*
* Usage:
* pnpm --filter @ti-pote/robot-client exec tsx \
* ../robot-hardware/scripts/esp-record.ts <file.wav> [duration_ms]
*
* Or with the shortcut from robot-hardware:
* pnpm esp:record out.wav 3000
*
* Defaults:
* duration_ms = 3000
* port = auto-detected (first /dev/cu.usbserial-* or /dev/cu.SLAB_*)
* can be overridden with ESP_PORT=/dev/cu.usbserial-XXX
*/
import { readdirSync, writeFileSync } from 'node:fs';
import { SerialPort } from 'serialport';
const SAMPLE_RATE = 16000;
const BYTES_PER_SAMPLE = 2;
/**
 * Resolve the serial device for the ESP32: the `ESP_PORT` environment
 * variable if set, otherwise the first `/dev` entry that starts with a
 * known USB-serial prefix.
 *
 * @throws Error when no matching device is present
 */
function findDefaultPort(): string {
  const fromEnv = process.env.ESP_PORT;
  if (fromEnv) {
    return fromEnv;
  }

  const isUsbSerial = (name: string): boolean =>
    name.startsWith('cu.usbserial') ||
    name.startsWith('cu.SLAB_') ||
    name.startsWith('cu.wchusbserial');

  const device = readdirSync('/dev').find(isUsbSerial);
  if (device === undefined) {
    throw new Error(
      'No ESP32 serial port detected. Plug the board in, or set ESP_PORT=/dev/cu.usbserial-XXX',
    );
  }
  return `/dev/${device}`;
}
/**
 * Build the 44-byte RIFF/WAVE header for a mono, 16-bit PCM stream.
 *
 * @param pcmBytes    size of the audio payload that will follow, in bytes
 * @param sampleRate  sample rate in Hz
 * @returns           a new 44-byte Buffer holding the header
 */
function wavHeader(pcmBytes: number, sampleRate: number): Buffer {
  const out = Buffer.alloc(44);
  let cursor = 0;

  // Small sequential writers so the field order reads top to bottom.
  const tag = (text: string): void => {
    out.write(text, cursor);
    cursor += 4;
  };
  const u32 = (value: number): void => {
    out.writeUInt32LE(value, cursor);
    cursor += 4;
  };
  const u16 = (value: number): void => {
    out.writeUInt16LE(value, cursor);
    cursor += 2;
  };

  tag('RIFF');
  u32(36 + pcmBytes); // RIFF size = rest of header (36) + payload
  tag('WAVE');

  tag('fmt ');
  u32(16); // fmt chunk size
  u16(1); // audio format: PCM
  u16(1); // channels: mono
  u32(sampleRate);
  u32(sampleRate * 2); // byte rate (2 bytes per mono sample)
  u16(2); // block align
  u16(16); // bits per sample

  tag('data');
  u32(pcmBytes);

  return out;
}
/**
 * Record audio from the ESP32 and write it to the path given on the command
 * line (`<file> [duration_ms]`).
 *
 * Flow: validate arguments → open the serial port → wait for the firmware's
 * `READY` line → send `REC <ms>` → collect the `BEGIN <bytes>` / raw PCM /
 * `END` response → print a basic RMS figure → write the file (with a WAV
 * header when the name ends in `.wav`, raw PCM otherwise).
 *
 * Exits the process with status 1 on bad arguments; rejects on port errors,
 * firmware `ERR` lines, malformed `BEGIN` lines and timeouts.
 */
async function main(): Promise<void> {
  const outPath = process.argv[2];
  if (!outPath) {
    console.error('Usage: esp-record.ts <file.wav> [duration_ms]');
    process.exit(1);
  }

  // The firmware rejects durations outside 1..60000 ms ("REC bad duration"),
  // and a NaN here would make the timeout below fire almost immediately, so
  // fail fast with a clear message instead.
  const MAX_DURATION_MS = 60000;
  const durationArg = process.argv[3] ?? '3000';
  const durationMs = parseInt(durationArg, 10);
  if (!Number.isInteger(durationMs) || durationMs <= 0 || durationMs > MAX_DURATION_MS) {
    console.error(
      `Invalid duration_ms "${durationArg}" (expected an integer between 1 and ${MAX_DURATION_MS})`,
    );
    process.exit(1);
  }

  const path = findDefaultPort();
  console.log(`→ opening ${path} @ 921600 baud`);
  const port = new SerialPort({ path, baudRate: 921600, autoOpen: false });
  await new Promise<void>((resolve, reject) => {
    port.open((err) => (err ? reject(err) : resolve()));
  });

  // ── simple line-based state machine for stdout text ───────────
  let phase: 'idle' | 'streaming' = 'idle';
  let remaining = 0;
  const chunks: Buffer[] = [];
  let lineBuf = '';
  let ready = false;
  const readyWaiters: Array<() => void> = [];

  // Armed only once REC is actually sent, so the READY handshake does not
  // eat into the recording budget.
  let audioTimer: ReturnType<typeof setTimeout> | undefined;
  // Single failure path for the capture: stops the timer and rejects `finished`.
  let failCapture: (err: Error) => void = () => undefined;

  const finished = new Promise<Buffer>((resolve, reject) => {
    failCapture = (err) => {
      clearTimeout(audioTimer);
      reject(err);
    };

    port.on('data', (data: Buffer) => {
      let offset = 0;
      while (offset < data.length) {
        if (phase === 'streaming') {
          // Binary mode: consume up to `remaining` payload bytes verbatim.
          const take = Math.min(remaining, data.length - offset);
          chunks.push(data.subarray(offset, offset + take));
          offset += take;
          remaining -= take;
          if (remaining === 0) {
            phase = 'idle';
            lineBuf = '';
          }
          continue;
        }

        // text mode: accumulate until newline
        const nl = data.indexOf(0x0a, offset);
        if (nl === -1) {
          lineBuf += data.subarray(offset).toString('utf8');
          break;
        }
        lineBuf += data.subarray(offset, nl).toString('utf8');
        offset = nl + 1;
        const line = lineBuf.replace(/\r$/, '').trim();
        lineBuf = '';
        if (!line) continue;

        if (line.startsWith('BEGIN ')) {
          const announced = parseInt(line.slice(6), 10);
          // A NaN count would wedge the parser in streaming mode forever.
          if (!Number.isInteger(announced) || announced < 0) {
            failCapture(new Error(`malformed BEGIN line from firmware: "${line}"`));
            return;
          }
          remaining = announced;
          phase = 'streaming';
          console.log(`→ BEGIN ${remaining} bytes`);
        } else if (line === 'END') {
          clearTimeout(audioTimer);
          resolve(Buffer.concat(chunks));
        } else if (line === 'READY') {
          ready = true;
          for (const wake of readyWaiters.splice(0)) wake();
        } else if (line.startsWith('LOG ')) {
          console.log(`[esp] ${line.slice(4)}`);
        } else if (line.startsWith('ERR ')) {
          failCapture(new Error(`firmware error: ${line.slice(4)}`));
        } else {
          console.log(`[esp] ${line}`);
        }
      }
    });
    port.on('error', failCapture);
  });

  // The ESP32 resets on port open (DTR/RTS). Wait until it prints
  // READY so we don't send commands into the bootloader.
  const readySignal = new Promise<void>((resolve, reject) => {
    if (ready) return resolve();
    const timer = setTimeout(
      () => reject(new Error('timeout waiting for READY from firmware')),
      5000,
    );
    readyWaiters.push(() => {
      clearTimeout(timer);
      resolve();
    });
  });
  // Racing against `finished` surfaces a port error or a boot-time `ERR` line
  // right away (instead of as an unhandled rejection followed by a misleading
  // READY timeout).
  await Promise.race([readySignal, finished]);
  await new Promise((r) => setTimeout(r, 50));

  console.log(`→ REC ${durationMs} ms — speak now!`);
  const budgetMs = durationMs + 5000;
  audioTimer = setTimeout(
    () => failCapture(new Error(`timeout waiting for audio after ${budgetMs} ms`)),
    budgetMs,
  );
  port.write(`REC ${durationMs}\n`, (err) => {
    if (err) failCapture(err);
  });

  const pcm = await finished;
  await new Promise<void>((resolve) => port.close(() => resolve()));

  // Basic RMS sanity check. Only whole samples are counted, and an empty
  // capture reports 0 instead of NaN.
  const samples = Math.floor(pcm.length / BYTES_PER_SAMPLE);
  let sumSq = 0;
  for (let i = 0; i < samples; i++) {
    const s = pcm.readInt16LE(i * BYTES_PER_SAMPLE);
    sumSq += s * s;
  }
  const rms = samples > 0 ? Math.sqrt(sumSq / samples) : 0;
  console.log(
    `✅ captured ${pcm.length} bytes (${samples} samples, ${(
      (samples / SAMPLE_RATE) *
      1000
    ).toFixed(0)} ms) RMS=${rms.toFixed(0)}`,
  );

  if (outPath.toLowerCase().endsWith('.wav')) {
    writeFileSync(outPath, Buffer.concat([wavHeader(pcm.length, SAMPLE_RATE), pcm]));
  } else {
    writeFileSync(outPath, pcm);
  }
  console.log(`→ wrote ${outPath}`);
}
// Script entry point: print whatever made the run fail and exit non-zero.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});

View File

@ -1,147 +1,281 @@
// Ti-Pote — Robot Hardware firmware (ESP32)
// Ti-Pote — Minimal audio bring-up firmware (ESP32-WROOM-32)
//
// Responsibilities for v0:
// - Listen on UART0 (the USB-connected serial port while the ESP32
// is plugged into Arthur's laptop; on the real robot this will
// eventually be Serial2 wired to the Raspberry Pi).
// - Decode incoming binary frames (see include/protocol_types.h).
// - Dispatch commands to the Eyes renderer.
// - Reply to PING with PONG.
// - Fall back to a sleepy animation if no heartbeat is received
// for HW_HEARTBEAT_TIMEOUT_MS (set in platformio.ini).
// GOAL: prove the I2S audio chain (INMP441 + MAX98357A) end to end
// with nothing else in the loop — no Pi, no OLED, no protocol frames.
// The ESP32 is plugged into a computer via USB and the host runs
// two tiny scripts:
//
// Intentionally NOT yet implemented (Phase 2):
// - I2S audio up/down streaming
// - Servo / LED commands
// scripts/esp-record.ts <file.raw> <duration_ms>
// scripts/esp-play.ts <file.raw>
//
// The hook points for those are marked with TODO(phase2).
// Protocol over USB Serial (921600 baud, line-based for commands,
// raw bytes for audio):
//
// host → esp32
// "PING\n" ping
// "REC <ms>\n" start recording for <ms> milliseconds
// "PLAY <bytes>\n" next <bytes> bytes on the wire are raw
// S16 LE mono 16 kHz PCM, play them
//
// esp32 → host
// "READY\n" once at boot
// "PONG\n" reply to PING
// "LOG <text>\n" human-readable log line
// "ERR <text>\n" error message
// "BEGIN <bytes>\n" start of a REC response
// "<raw bytes>" raw PCM (S16 LE mono 16 kHz)
// "END\n" end of a REC response
// "OK\n" command completed
//
// Wiring (shared I2S bus on I2S_NUM_0):
// BCLK = GPIO 32 (mic SCK + speaker BCLK)
// LRCLK = GPIO 33 (mic WS + speaker LRC)
// MIC = GPIO 34 (INMP441 SD → ESP32 data-in, input-only pin)
// SPK = GPIO 22 (ESP32 data-out → MAX98357A DIN)
#include <Arduino.h>
#include "Protocol.h"
#include "Eyes.h"
#include <driver/i2s.h>
#include <string.h>
#ifndef HW_SERIAL_BAUD
#define HW_SERIAL_BAUD 921600
#endif
// ──────────────────────────────────────────────────────────
// Audio config
// ──────────────────────────────────────────────────────────
#ifndef HW_HEARTBEAT_TIMEOUT_MS
#define HW_HEARTBEAT_TIMEOUT_MS 5000
#endif
static constexpr int SAMPLE_RATE = 16000;
static constexpr int PIN_BCLK = 32;
static constexpr int PIN_LRCLK = 33;
static constexpr int PIN_MIC_DIN = 34;
static constexpr int PIN_SPK_DOUT = 22;
// The communication stream. When the ESP32 is plugged into a
// computer, UART0 (Serial) is the USB-CDC port, which is exactly
// what the robot-client will talk to during development. Later,
// for the Pi wiring, change this to Serial2 and call
// `Serial2.begin(HW_SERIAL_BAUD, SERIAL_8N1, RX_PIN, TX_PIN)`.
#define HW_COMM Serial
static constexpr int DMA_COUNT = 4;
static constexpr int DMA_LEN = 256;
using namespace tipote;
// Staging buffers — keep them outside of functions so we don't eat
// stack on every tick.
static constexpr size_t OUT_S16_SAMPLES = 320; // 20 ms of S16 mono
static int32_t g_rawStereo[OUT_S16_SAMPLES * 2];
static int16_t g_micMono [OUT_S16_SAMPLES];
static int32_t g_spkStereo[OUT_S16_SAMPLES * 2];
static uint8_t g_spkInBuf [OUT_S16_SAMPLES * 2]; // 640 bytes of S16 mono
static Eyes eyes;
static FrameDecoder decoder;
// ──────────────────────────────────────────────────────────
// Line buffer for incoming text commands.
// ──────────────────────────────────────────────────────────
static uint32_t lastHeartbeatMs = 0;
static bool idleMode = false;
static char g_line[64];
static size_t g_lineLen = 0;
// Forward decl
static void handleFrame(const Frame& frame, void* userData);
static void logLine(const char* line);
// Emit one text line made of a protocol tag followed by the message.
static void sendTagged(const char* tag, const char* msg) {
  Serial.print(tag);
  Serial.println(msg);
}

// "LOG <text>" — human-readable log line for the host.
static void sendLog(const char* msg) {
  sendTagged("LOG ", msg);
}

// "ERR <text>" — error report for the host.
static void sendErr(const char* msg) {
  sendTagged("ERR ", msg);
}
// ──────────────────────────────────────────────────────────
// I2S init — single port, full duplex, shared BCLK/WS.
// ──────────────────────────────────────────────────────────
static bool audioBegin() {
i2s_config_t cfg = {};
cfg.mode = static_cast<i2s_mode_t>(I2S_MODE_MASTER |
I2S_MODE_RX |
I2S_MODE_TX);
cfg.sample_rate = SAMPLE_RATE;
cfg.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT;
cfg.channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT;
cfg.communication_format = I2S_COMM_FORMAT_STAND_I2S;
cfg.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
cfg.dma_buf_count = DMA_COUNT;
cfg.dma_buf_len = DMA_LEN;
cfg.use_apll = false;
cfg.tx_desc_auto_clear = true;
cfg.fixed_mclk = 0;
if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) return false;
i2s_pin_config_t pins = {};
pins.bck_io_num = PIN_BCLK;
pins.ws_io_num = PIN_LRCLK;
pins.data_out_num = PIN_SPK_DOUT;
pins.data_in_num = PIN_MIC_DIN;
if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) {
i2s_driver_uninstall(I2S_NUM_0);
return false;
}
i2s_zero_dma_buffer(I2S_NUM_0);
return true;
}
// Convert one batch of stereo 32-bit mic samples to S16 mono by
// taking the left slot and shifting the 24-bit-aligned data down.
// Returns the number of S16 samples written into `out`.
// Read up to `maxSamples` (capped at OUT_S16_SAMPLES) stereo 32-bit frames
// from I2S into g_rawStereo and convert them to S16 mono in `out`: the first
// word of each frame is shifted right by 14 bits and clamped to the int16
// range. Blocks until the driver returns. Returns the number of S16 samples
// written, or 0 when the read failed or produced no data.
static size_t micReadMono(int16_t* out, size_t maxSamples) {
  const size_t frames =
      (maxSamples < OUT_S16_SAMPLES) ? maxSamples : OUT_S16_SAMPLES;

  size_t gotBytes = 0;
  const esp_err_t rc = i2s_read(I2S_NUM_0,
                                g_rawStereo,
                                frames * 2 * sizeof(int32_t),
                                &gotBytes,
                                portMAX_DELAY);  // block — dedicated REC loop
  if (rc != ESP_OK || gotBytes == 0) {
    return 0;
  }

  const size_t framesRead = gotBytes / (2 * sizeof(int32_t));
  const int32_t* slot = g_rawStereo;
  for (int16_t* dst = out; dst != out + framesRead; ++dst, slot += 2) {
    int32_t v = *slot >> 14;
    if (v > INT16_MAX) {
      v = INT16_MAX;
    } else if (v < INT16_MIN) {
      v = INT16_MIN;
    }
    *dst = static_cast<int16_t>(v);
  }
  return framesRead;
}
// Write one batch of S16 mono PCM to the speaker by duplicating each
// sample into both stereo slots and shifting into the high half of
// the 32-bit word (what the MAX98357A expects on a shared bus).
static void spkWriteMono(const int16_t* samples, size_t count) {
if (count == 0) return;
if (count > OUT_S16_SAMPLES) count = OUT_S16_SAMPLES;
for (size_t i = 0; i < count; ++i) {
const int32_t s32 = static_cast<int32_t>(samples[i]) << 16;
g_spkStereo[2 * i] = s32;
g_spkStereo[2 * i + 1] = s32;
}
size_t bytesWritten = 0;
i2s_write(I2S_NUM_0, g_spkStereo, count * 2 * sizeof(int32_t),
&bytesWritten, portMAX_DELAY);
}
// ──────────────────────────────────────────────────────────
// Command handlers
// ──────────────────────────────────────────────────────────
// Handle "REC <ms>": announce the payload size with "BEGIN <bytes>", stream
// that many bytes of S16 mono PCM from the microphone, then finish with an
// empty line and "END".
//
// The host switches to raw-byte mode after BEGIN and counts bytes, so the
// announced size must always be honoured. If the I2S read keeps failing, the
// rest of the payload is padded with silence and an "ERR" line is sent before
// "END" — instead of spinning forever and leaving the host to time out.
static void handleRec(uint32_t durationMs) {
  const uint32_t totalSamples = (SAMPLE_RATE * durationMs) / 1000;
  const uint32_t totalBytes   = totalSamples * sizeof(int16_t);

  Serial.print("BEGIN ");
  Serial.println(totalBytes);

  // Flush whatever old noise is in the mic DMA first.
  i2s_zero_dma_buffer(I2S_NUM_0);

  // micReadMono blocks until data arrives, so a 0 return means the driver
  // itself failed; give up after this many failures in a row.
  static constexpr uint32_t MAX_EMPTY_READS = 100;
  uint32_t emptyReads = 0;
  bool micFailed = false;

  uint32_t sent = 0;
  while (sent < totalSamples) {
    size_t want = totalSamples - sent;
    if (want > OUT_S16_SAMPLES) want = OUT_S16_SAMPLES;

    size_t got = 0;
    if (!micFailed) {
      got = micReadMono(g_micMono, want);
      if (got == 0) {
        if (++emptyReads < MAX_EMPTY_READS) continue;
        micFailed = true;
        memset(g_micMono, 0, sizeof(g_micMono));
      } else {
        emptyReads = 0;
      }
    }
    if (micFailed) got = want;  // pad the remainder with silence

    Serial.write(reinterpret_cast<const uint8_t*>(g_micMono),
                 got * sizeof(int16_t));
    sent += got;
  }

  Serial.println();
  if (micFailed) sendErr("REC I2S read failed");
  Serial.println("END");
}
// Handle "PLAY <bytes>": read exactly `totalBytes` of raw S16 LE mono PCM from
// Serial and push it to the speaker, then reply "OK". Replies
// "ERR PLAY read timeout" and aborts if the host stops sending for 2 s.
//
// Never requests more than `remaining` bytes, so the counter cannot underflow
// and bytes belonging to the next command are never consumed. A read that
// ends on half a sample keeps the dangling byte and completes it with the
// next read, so the stream stays sample-aligned. With an odd `totalBytes` the
// final lone byte is dropped.
static void handlePlay(uint32_t totalBytes) {
  // Drain any pending crap from the speaker DMA so we don't start
  // with a pop.
  i2s_zero_dma_buffer(I2S_NUM_0);

  // Give Serial.readBytes a generous timeout so a jittery host
  // doesn't abort us mid-playback.
  Serial.setTimeout(2000);

  uint32_t remaining = totalBytes;
  size_t carry = 0;  // 0 or 1: first half of a sample kept at g_spkInBuf[0]
  while (remaining > 0) {
    size_t want = sizeof(g_spkInBuf) - carry;
    if (want > remaining) want = remaining;

    const size_t got = Serial.readBytes(g_spkInBuf + carry, want);
    if (got == 0) {
      sendErr("PLAY read timeout");
      return;
    }
    remaining -= got;

    const size_t avail = carry + got;
    spkWriteMono(reinterpret_cast<const int16_t*>(g_spkInBuf),
                 avail / sizeof(int16_t));

    // Keep an unpaired trailing byte for the next round.
    carry = avail & 1;
    if (carry) g_spkInBuf[0] = g_spkInBuf[avail - 1];
  }

  // Let the last frames actually reach the speaker, then clear.
  delay(50);
  i2s_zero_dma_buffer(I2S_NUM_0);
  Serial.println("OK");
}
// Dispatch one complete text command line (without its line terminator):
//   "PING"          → replies "PONG"
//   "REC <ms>"      → handleRec, for 1..60000 ms
//   "PLAY <bytes>"  → handlePlay, for 1..16 MiB
// Anything else, or an out-of-range argument, is answered with an ERR line.
static void handleLine(const char* line) {
  if (strcmp(line, "PING") == 0) {
    Serial.println("PONG");
    return;
  }

  const bool isRec  = (strncmp(line, "REC ", 4) == 0);
  const bool isPlay = !isRec && (strncmp(line, "PLAY ", 5) == 0);

  if (isRec) {
    const long ms = atol(line + 4);
    if (ms > 0 && ms <= 60000) {
      handleRec(static_cast<uint32_t>(ms));
    } else {
      sendErr("REC bad duration");
    }
  } else if (isPlay) {
    const long bytes = atol(line + 5);
    if (bytes > 0 && bytes <= 16 * 1024 * 1024) {
      handlePlay(static_cast<uint32_t>(bytes));
    } else {
      sendErr("PLAY bad size");
    }
  } else {
    sendErr("unknown command");
  }
}
// ──────────────────────────────────────────────────────────
// Arduino entry points
// ──────────────────────────────────────────────────────────
void setup() {
HW_COMM.begin(HW_SERIAL_BAUD);
// Give the host a beat to open the port after auto-reset.
// Bump the UART RX buffer WAY above the 256-byte default so we
// can absorb a full PLAY payload (up to a few tens of KB) without
// losing bytes if the host floods us.
Serial.setRxBufferSize(16 * 1024);
Serial.begin(921600);
delay(50);
eyes.begin();
if (!audioBegin()) {
sendErr("I2S init failed");
} else {
sendLog("I2S ready");
}
decoder.onFrame(handleFrame);
lastHeartbeatMs = millis();
logLine("robot-hardware ready");
Serial.println("READY");
}
void loop() {
// Drain whatever the host has sent since the last tick.
while (HW_COMM.available() > 0) {
int b = HW_COMM.read();
if (b < 0) break;
decoder.feed(static_cast<uint8_t>(b));
while (Serial.available() > 0) {
const int c = Serial.read();
if (c < 0) break;
if (c == '\r') continue;
if (c == '\n') {
g_line[g_lineLen] = 0;
if (g_lineLen > 0) handleLine(g_line);
g_lineLen = 0;
continue;
}
// Heartbeat watchdog: if we haven't heard from the host in a
// while, slip into a sleepy animation so the robot doesn't
// look frozen. Any incoming frame resets this.
const uint32_t now = millis();
if (!idleMode && (now - lastHeartbeatMs) > HW_HEARTBEAT_TIMEOUT_MS) {
idleMode = true;
eyes.show(Emotion::SLEEPY);
if (g_lineLen < sizeof(g_line) - 1) {
g_line[g_lineLen++] = static_cast<char>(c);
} else {
g_lineLen = 0;
sendErr("line overflow");
}
}
// ---------------------------------------------------------------
// Frame dispatcher
// ---------------------------------------------------------------
// Handle one decoded frame. Every frame, whatever its type, refreshes
// lastHeartbeatMs and clears idleMode before being dispatched on frame.type.
// The userData argument is unused.
static void handleFrame(const Frame& frame, void* /*userData*/) {
lastHeartbeatMs = millis();
if (idleMode) {
idleMode = false;
}
switch (frame.type) {
case MsgType::DISPLAY_EMOTION: {
if (frame.length < 1) {
logLine("DISPLAY_EMOTION: empty payload");
return;
}
// First payload byte is the emotion code; it must be below Emotion::COUNT.
const uint8_t code = frame.payload[0];
if (code >= static_cast<uint8_t>(Emotion::COUNT)) {
logLine("DISPLAY_EMOTION: out-of-range code");
return;
}
eyes.show(static_cast<Emotion>(code));
// ACK back so the host knows it was applied.
uint8_t ackPayload[1] = {code};
FrameEncoder::writeTo(HW_COMM, MsgType::ACK, ackPayload, 1);
return;
}
case MsgType::DISPLAY_CLEAR: {
eyes.clear();
FrameEncoder::writeTo(HW_COMM, MsgType::ACK);
return;
}
case MsgType::PING: {
// Echo the payload back as PONG. Useful for latency
// measurements and proving the link is symmetric.
FrameEncoder::writeTo(HW_COMM, MsgType::PONG,
frame.payload, frame.length);
return;
}
case MsgType::STATUS: {
// Heartbeat from host — lastHeartbeatMs was already
// bumped above. Nothing else to do for v0.
return;
}
// TODO(phase2): AUDIO_UP / AUDIO_DOWN / SERVO_CMD / LED_CMD
default:
logLine("unknown frame type");
return;
}
}
// ---------------------------------------------------------------
// Diagnostic logging — wraps text in a LOG frame so the host
// can parse it without getting confused by free text on the wire.
// ---------------------------------------------------------------
// Send `line` to the host as the payload of a LOG frame on HW_COMM.
// At most MAX_PAYLOAD_SIZE bytes are sent (strnlen caps the length), so a
// longer string is truncated.
static void logLine(const char* line) {
const size_t len = strnlen(line, MAX_PAYLOAD_SIZE);
FrameEncoder::writeTo(HW_COMM, MsgType::LOG,
reinterpret_cast<const uint8_t*>(line),
static_cast<uint16_t>(len));
}