// ti-pote/apps/robot-client/src/services/wake-word.service.ts
// 2026-04-09 13:19:51 +02:00
//
// 269 lines
// 8.7 KiB
// TypeScript

import { ChildProcess, spawn } from 'node:child_process';
import { EventEmitter } from 'node:events';
import { type WakeWordConfig, type AudioConfig } from '../config/index.js';
import { type HardwareService } from '../hardware/index.js';
import { createLogger, type Logger } from '../utils/index.js';
/**
 * Listener signatures for the events a {@link WakeWordService} emits.
 */
export interface WakeWordServiceEvents {
  /** The detector process printed a `DETECTED` line on stdout. */
  detected: () => void;

  /** The detector process printed `READY` on stderr. */
  ready: () => void;

  /** The detector child process reported an error. */
  error: (error: Error) => void;
}
/**
 * Wake word detection service.
 *
 * Supervises a Python subprocess that performs the actual detection and
 * re-emits its output as events: `detected` (a `DETECTED` line on stdout),
 * `ready` (a `READY` line on stderr) and `error` (the child process failed).
 *
 * Two operating modes, selected by whether a HardwareService is passed
 * to the constructor:
 *
 *   1. **ALSA mode** (no HardwareService)
 *      The Python subprocess opens PyAudio on `audioConfig.captureDevice`
 *      and reads the mic directly. Pause releases the ALSA device so
 *      arecord (the AlsaAudioService) can use it during conversation.
 *
 *   2. **ESP32 mode** (HardwareService provided)
 *      The Python subprocess reads raw S16 mono PCM from stdin. We
 *      subscribe to `hardware.on('audio_up')` and pipe every mic chunk
 *      coming off the UART straight into the Python process. Control
 *      commands (PAUSE/RESUME/RESET/QUIT) go over a separate pipe at
 *      fd 3 because stdin is busy carrying audio.
 *
 * The model is loaded once at startup; pause/resume is cheap and
 * does not reload it.
 *
 * If the subprocess exits with a non-zero code it is restarted after two
 * seconds. `stop()` cancels any such pending restart.
 */
export class WakeWordService extends EventEmitter {
  /** The child currently being supervised, or null when none is running. */
  private process: ChildProcess | null = null;
  private readonly logger: Logger;
  private _isListening = false;
  private _isPaused = false;
  /** Set when the child reports STREAM_CLOSED after a PAUSE (ALSA mode). */
  private _streamClosed = false;
  private readonly usesHardware: boolean;
  /** Pending automatic restart scheduled after an unexpected exit, if any. */
  private restartTimer: ReturnType<typeof setTimeout> | null = null;

  /** Latched forwarder so we can detach it on stop / error. */
  private readonly forwardMicChunk = (chunk: Buffer): void => {
    const stdin = this.process?.stdin;
    if (!stdin || stdin.destroyed) return;
    stdin.write(chunk, (err) => {
      if (err && (err as NodeJS.ErrnoException).code === 'EPIPE') {
        this.logger.warn('Wake word process stdin pipe broken — detaching audio');
        this.detachHardware();
      }
    });
  };

  /**
   * @param wakeWordConfig Interpreter path, script path, model name and threshold.
   * @param audioConfig Sample rate and (ALSA mode) capture device.
   * @param hardware When provided, selects ESP32 mode and is used as the audio source.
   */
  constructor(
    private readonly wakeWordConfig: WakeWordConfig,
    private readonly audioConfig: AudioConfig,
    private readonly hardware: HardwareService | null = null,
  ) {
    super();
    this.logger = createLogger('wake-word', 'info');
    this.usesHardware = hardware !== null;
  }

  /** True while a detector process is running and not paused. */
  get isListening(): boolean {
    return this._isListening && !this._isPaused;
  }

  /**
   * Spawn the detector process. If one is already running this only
   * resumes it when paused.
   */
  start(): void {
    if (this.process) {
      if (this._isPaused) this.resume();
      return;
    }
    // An explicit start supersedes any automatic restart still pending.
    this.clearRestartTimer();

    this.logger.info(
      {
        mode: this.usesHardware ? 'esp32' : 'alsa',
        model: this.wakeWordConfig.modelName,
        threshold: this.wakeWordConfig.threshold,
      },
      'Starting wake word detection',
    );

    const args = [
      this.wakeWordConfig.scriptPath,
      '--model', this.wakeWordConfig.modelName,
      '--threshold', String(this.wakeWordConfig.threshold),
      '--sample-rate', String(this.audioConfig.sampleRate),
    ];
    if (this.usesHardware) {
      args.push('--input', 'stdin', '--control-fd', '3');
    } else {
      args.push('--input', 'alsa', '--device', this.audioConfig.captureDevice);
    }

    // stdio layout:
    //   0: stdin  — audio in (ESP32 mode) or control (ALSA mode)
    //   1: stdout — DETECTED events
    //   2: stderr — status & log lines
    //   3: extra  — control pipe (ESP32 mode only)
    const stdio: ('pipe' | 'ignore')[] = this.usesHardware
      ? ['pipe', 'pipe', 'pipe', 'pipe']
      : ['pipe', 'pipe', 'pipe'];

    const child = spawn(this.wakeWordConfig.pythonPath, args, { stdio });
    this.process = child;
    this._isListening = true;
    this._isPaused = false;

    this.attachLineReader(child.stdout, (line) => this.handleStdoutLine(line));
    this.attachLineReader(child.stderr, (line) => this.handleStderrLine(line));

    // A stream 'error' with no listener is thrown as an uncaught exception,
    // so every pipe we write to needs one (a dying child yields EPIPE).
    const onPipeError = (pipe: string) => (err: Error): void => {
      this.logger.debug({ err, pipe }, 'Wake word pipe error');
      if (pipe === 'stdin' && this.process === child) this.detachHardware();
    };
    child.stdin?.on('error', onPipeError('stdin'));
    this.controlPipeOf(child)?.on('error', onPipeError('control'));

    child.on('error', (err) => {
      this.logger.error({ err }, 'Wake word process error');
      // A child that stop() already released (or that was superseded) has no
      // state left to clean up, and the service's listeners may be gone.
      if (this.process !== child) return;
      this._isListening = false;
      this.detachHardware();
      if (child.pid === undefined) {
        // Spawn itself failed: no 'exit' will follow, so release the handle
        // here or every later start() would be a silent no-op.
        this.process = null;
        this._isPaused = false;
      }
      this.emit('error', new Error(`Wake word process failed: ${err.message}`));
    });

    child.on('exit', (code) => {
      // Ignore children that were stopped deliberately or replaced.
      if (this.process !== child) return;
      this._isListening = false;
      this._isPaused = false;
      this.detachHardware();
      this.process = null;
      if (code !== 0 && code !== null) {
        this.logger.warn({ code }, 'Wake word process exited unexpectedly');
        this.restartTimer = setTimeout(() => {
          this.restartTimer = null;
          this.logger.info('Restarting wake word detection...');
          this.start();
        }, 2000);
      }
    });

    // In ESP32 mode, start piping mic audio from the UART.
    this.attachHardware();
  }

  /**
   * Pause wake word detection.
   *
   * In ALSA mode we must wait for STREAM_CLOSED so arecord can reclaim
   * the device. In ESP32 mode the audio flow never stops — we just
   * tell the Python process to ignore detections.
   *
   * @returns A promise that resolves once the device is released, the
   *          process is gone, or two seconds have elapsed.
   */
  pause(): Promise<void> {
    if (!this.process || this._isPaused) return Promise.resolve();
    this._isPaused = true;
    this._streamClosed = false;
    this.writeControl('PAUSE');

    if (this.usesHardware) {
      // No physical device to release — resolve immediately.
      return Promise.resolve();
    }

    return new Promise((resolve) => {
      // Tear down both timers together so neither outlives the promise.
      const finish = (): void => {
        clearInterval(poll);
        clearTimeout(deadline);
        resolve();
      };
      const poll = setInterval(() => {
        if (this._streamClosed || !this.process) finish();
      }, 50);
      const deadline = setTimeout(finish, 2000);
    });
  }

  /** Resume detection after a pause; no-op when not paused. */
  resume(): void {
    if (!this.process || !this._isPaused) return;
    this._isPaused = false;
    this.writeControl('RESUME');
    this.logger.info('🎤 Resuming wake word listening...');
  }

  /**
   * Shut the detector down: ask it to QUIT, SIGTERM it if it is still alive
   * after 500 ms, cancel any pending automatic restart, and remove every
   * listener registered on this service.
   */
  stop(): void {
    this.clearRestartTimer();

    const child = this.process;
    if (child) {
      this.writeControl('QUIT');
      this.detachHardware();
      // Release the handle now so the exit handler neither restarts this
      // child nor clobbers the state of a later start().
      this.process = null;
      const killTimer = setTimeout(() => {
        if (child.exitCode === null && child.signalCode === null) {
          child.kill('SIGTERM');
        }
      }, 500);
      child.once('exit', () => clearTimeout(killTimer));
    }

    this._isListening = false;
    this._isPaused = false;
    this.removeAllListeners();
  }

  // ──────────────────────────────────────────────────────────
  // Internals
  // ──────────────────────────────────────────────────────────

  /**
   * Build a newline-delimited reader. Chunk boundaries are arbitrary, so the
   * trailing partial line is held back until its newline arrives (or until
   * `flush()` is called at end of stream). Lines are trimmed; blank lines
   * are dropped.
   */
  private createLineReader(
    onLine: (line: string) => void,
  ): { feed: (chunk: Buffer | string) => void; flush: () => void } {
    let pending = '';
    const deliver = (raw: string): void => {
      const line = raw.trim();
      if (line) onLine(line);
    };
    return {
      feed: (chunk) => {
        pending += chunk.toString();
        const parts = pending.split('\n');
        pending = parts.pop() ?? '';
        parts.forEach((part) => deliver(part));
      },
      flush: () => {
        const rest = pending;
        pending = '';
        deliver(rest);
      },
    };
  }

  /** Wire a line reader onto one of the child's output streams. */
  private attachLineReader(
    stream: NodeJS.ReadableStream | null,
    onLine: (line: string) => void,
  ): void {
    if (!stream) return;
    const reader = this.createLineReader(onLine);
    stream.on('data', (chunk: Buffer) => reader.feed(chunk));
    stream.on('end', () => reader.flush());
  }

  /** stdout carries detection events; anything else is debug noise. */
  private handleStdoutLine(line: string): void {
    if (line === 'DETECTED') {
      this.logger.info('🎙️ Wake word detected!');
      this.emit('detected');
    } else {
      this.logger.debug({ line }, 'Wake word stdout');
    }
  }

  /** stderr carries status tokens and free-form log lines from the child. */
  private handleStderrLine(msg: string): void {
    switch (msg) {
      case 'READY':
        this.logger.info('🎤 Wake word engine ready — listening...');
        this.emit('ready');
        return;
      case 'PAUSED':
        this._streamClosed = false;
        this.logger.debug('Wake word paused');
        return;
      case 'STREAM_CLOSED':
        this._streamClosed = true;
        this.logger.debug('Wake word audio stream closed');
        return;
      case 'RESUMED':
        this.logger.debug('Wake word resumed');
        return;
      case 'STREAM_REOPENED':
        this.logger.debug('Wake word audio stream reopened');
        return;
      default:
        break;
    }

    if (msg.startsWith('Loading wake word model')) {
      this.logger.info('⏳ Loading wake word model...');
    } else if (msg.startsWith('Wake word model loaded')) {
      this.logger.info('✅ Wake word model loaded');
    } else if (
      msg.startsWith('Matched device') ||
      msg.startsWith('Using device') ||
      msg.startsWith('Listening')
    ) {
      this.logger.info(`🔊 ${msg}`);
    } else {
      this.logger.warn('Wake word stderr: %s', msg);
    }
  }

  /**
   * The fd 3 control pipe of `child` (ESP32 mode only), or null.
   * stdio[3] is a Node Writable (net.Socket) stream when spawned as 'pipe'.
   */
  private controlPipeOf(
    child: ChildProcess,
  ): (NodeJS.WritableStream & { destroyed?: boolean }) | null {
    if (!this.usesHardware) return null;
    return (child.stdio[3] ?? null) as unknown as
      | (NodeJS.WritableStream & { destroyed?: boolean })
      | null;
  }

  /**
   * Write a text control command. In ALSA mode that goes to stdin;
   * in ESP32 mode stdin carries audio so commands travel over the
   * extra pipe at fd 3 (process.stdio[3]).
   */
  private writeControl(cmd: string): void {
    const child = this.process;
    if (!child) return;
    const line = `${cmd}\n`;
    if (this.usesHardware) {
      const control = this.controlPipeOf(child);
      if (control && !control.destroyed) {
        control.write(line);
      }
    } else {
      const stdin = child.stdin;
      if (stdin && !stdin.destroyed) {
        stdin.write(line);
      }
    }
  }

  /** Cancel a scheduled automatic restart, if one is pending. */
  private clearRestartTimer(): void {
    if (this.restartTimer !== null) {
      clearTimeout(this.restartTimer);
      this.restartTimer = null;
    }
  }

  /** Subscribe the mic forwarder exactly once (ESP32 mode only). */
  private attachHardware(): void {
    if (this.usesHardware && this.hardware) {
      this.hardware.off('audio_up', this.forwardMicChunk);
      this.hardware.on('audio_up', this.forwardMicChunk);
    }
  }

  /** Unsubscribe the mic forwarder; safe to call repeatedly. */
  private detachHardware(): void {
    if (this.usesHardware && this.hardware) {
      this.hardware.off('audio_up', this.forwardMicChunk);
    }
  }
}