diff --git a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts index d27b22e..6a2de95 100644 --- a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts +++ b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts @@ -130,6 +130,10 @@ export class RobotGateway implements OnGatewayConnection, OnGatewayDisconnect, I this.connectedDevices.get(deviceId)?.emit('audio_chunk', { data: base64 }); } + sendResponseText(deviceId: string, text: string, audioBase64?: string) { + this.connectedDevices.get(deviceId)?.emit('response_text', { text, audio: audioBase64 }); + } + sendStatus(deviceId: string, state: RobotState) { this.connectedDevices.get(deviceId)?.emit('status', { state }); } diff --git a/apps/backend/src/core/ports/outbound/device-gateway.port.ts b/apps/backend/src/core/ports/outbound/device-gateway.port.ts index b3ebbe4..f7218d1 100644 --- a/apps/backend/src/core/ports/outbound/device-gateway.port.ts +++ b/apps/backend/src/core/ports/outbound/device-gateway.port.ts @@ -1,5 +1,6 @@ export interface IDeviceGatewayPort { sendAudioChunk(deviceId: string, chunk: Buffer): void; + sendResponseText(deviceId: string, text: string, audioBase64?: string): void; sendStatus(deviceId: string, state: 'listening' | 'thinking' | 'speaking' | 'idle'): void; sendNotification(deviceId: string, payload: Record): void; isDeviceConnected(deviceId: string): boolean; diff --git a/apps/backend/src/core/services/conversation.service.ts b/apps/backend/src/core/services/conversation.service.ts index 0333122..1a75d79 100644 --- a/apps/backend/src/core/services/conversation.service.ts +++ b/apps/backend/src/core/services/conversation.service.ts @@ -109,12 +109,14 @@ export class ConversationService implements IConversationPort { if (responseText) { this.deviceGateway.sendStatus(deviceId, 'speaking'); - const audioBuffer = await this.ttsPort.synthesize(responseText); - this.logger.debug(`TTS complete: ${audioBuffer.length} bytes`); - this.deviceGateway.sendAudioChunk(deviceId, audioBuffer); - } + const pcm = await this.ttsPort.synthesize(responseText); + const wav = this.pcmToWav(pcm, 16000); + const audioBase64 = wav.toString('base64'); - this.deviceGateway.sendStatus(deviceId, 'idle'); + this.deviceGateway.sendResponseText(deviceId, responseText, audioBase64); + } else { + this.deviceGateway.sendStatus(deviceId, 'idle'); + } } catch (error) { this.logger.error(`Error processing conversation for ${deviceId}:`, error); this.deviceGateway.sendStatus(deviceId, 'idle'); @@ -123,6 +125,32 @@ export class ConversationService implements IConversationPort { return finalText; } + private pcmToWav(pcm: Buffer, sampleRate: number): Buffer { + const numChannels = 1; + const bitsPerSample = 16; + const byteRate = sampleRate * numChannels * (bitsPerSample / 8); + const blockAlign = numChannels * (bitsPerSample / 8); + const dataSize = pcm.length; + const headerSize = 44; + + const header = Buffer.alloc(headerSize); + header.write('RIFF', 0); + header.writeUInt32LE(dataSize + headerSize - 8, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); // subchunk1 size + header.writeUInt16LE(1, 20); // PCM format + header.writeUInt16LE(numChannels, 22); + header.writeUInt32LE(sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(bitsPerSample, 34); + header.write('data', 36); + header.writeUInt32LE(dataSize, 40); + + return Buffer.concat([header, pcm]); + } + interrupt(deviceId: string): void { const session = this.activeSessions.get(deviceId); if (!session) return; diff --git a/apps/simulator/src/App.tsx b/apps/simulator/src/App.tsx index e7d7ae9..9a50873 100644 --- a/apps/simulator/src/App.tsx +++ b/apps/simulator/src/App.tsx @@ -37,7 +37,14 @@ function App() { emit('speech_end'); }, [emit]); - const { recording, start: startMic, stop: stopMic } = useMicrophone({ onAudioChunk, onSpeechEnd }); + const { recording, start: startMic, stop: stopMic, silentStop } = useMicrophone({ onAudioChunk, onSpeechEnd }); + + // Stop mic when Ti-Pote starts thinking/speaking + useEffect(() => { + if (recording && (state === 'thinking' || state === 'speaking')) { + silentStop(); + } + }, [state, recording, silentStop]); // Auto-restart listening when Ti-Pote finishes speaking useEffect(() => { @@ -45,8 +52,11 @@ function App() { prevStateRef.current = state; if (conversationActive && state === 'idle' && (prevState === 'speaking' || prevState === 'thinking')) { - emit('wake_word_detected'); - startMic(); + const timer = setTimeout(() => { + emit('wake_word_detected'); + startMic(); + }, 500); + return () => clearTimeout(timer); } }, [state, conversationActive, emit, startMic]); diff --git a/apps/simulator/src/hooks/useAudioPlayer.ts b/apps/simulator/src/hooks/useAudioPlayer.ts index a4d2304..ec0d432 100644 --- a/apps/simulator/src/hooks/useAudioPlayer.ts +++ b/apps/simulator/src/hooks/useAudioPlayer.ts @@ -4,10 +4,12 @@ const SAMPLE_RATE = 16000; export function useAudioPlayer() { const contextRef = useRef(null); + const nextStartTimeRef = useRef(0); const getContext = useCallback(() => { if (!contextRef.current || contextRef.current.state === 'closed') { contextRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + nextStartTimeRef.current = 0; } if (contextRef.current.state === 'suspended') { contextRef.current.resume(); @@ -28,6 +30,8 @@ export function useAudioPlayer() { // Ensure even byte count for Int16 const evenLength = bytes.length - (bytes.length % 2); + if (evenLength < 2) return; + const int16 = new Int16Array(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + evenLength)); // Int16 PCM → Float32 @@ -42,17 +46,26 @@ export function useAudioPlayer() { const source = ctx.createBufferSource(); source.buffer = buffer; source.connect(ctx.destination); - source.start(); + + // Schedule seamlessly after the previous chunk + const now = ctx.currentTime; + const startTime = Math.max(now, nextStartTimeRef.current); + source.start(startTime); + nextStartTimeRef.current = startTime + buffer.duration; }, [getContext], ); - const flush = useCallback(() => {}, []); + const flush = useCallback(() => { + // Reset scheduling for next conversation turn + nextStartTimeRef.current = 0; + }, []); const stop = useCallback(() => { if (contextRef.current) { contextRef.current.close(); contextRef.current = null; + nextStartTimeRef.current = 0; } }, []); diff --git a/apps/simulator/src/hooks/useMicrophone.ts b/apps/simulator/src/hooks/useMicrophone.ts index 74bbb6c..c25dea6 100644 --- a/apps/simulator/src/hooks/useMicrophone.ts +++ b/apps/simulator/src/hooks/useMicrophone.ts @@ -3,9 +3,7 @@ import { useRef, useState, useCallback } from 'react'; interface UseMicrophoneOptions { onAudioChunk: (chunk: ArrayBuffer, sampleRate: number) => void; onSpeechEnd: () => void; - /** Silence duration in ms before triggering speech end (default: 1500) */ silenceTimeout?: number; - /** RMS threshold below which audio is considered silence (default: 0.01) */ silenceThreshold?: number; } @@ -39,7 +37,7 @@ export function useMicrophone({ contextRef.current = null; streamRef.current = null; hasSpeechRef.current = false; - stoppedRef.current = false; + stoppedRef.current = true; setRecording(false); }, [clearSilenceTimer]); @@ -65,14 +63,12 @@ export function useMicrophone({ const float32 = e.inputBuffer.getChannelData(0); - // Calculate RMS volume let sum = 0; for (let i = 0; i < float32.length; i++) { sum += float32[i] * float32[i]; } const rms = Math.sqrt(sum / float32.length); - // Convert and send audio const int16 = new Int16Array(float32.length); for (let i = 0; i < float32.length; i++) { const s = Math.max(-1, Math.min(1, float32[i])); @@ -80,12 +76,10 @@ export function useMicrophone({ } onAudioChunk(int16.buffer, context.sampleRate); - // VAD logic if (rms > silenceThreshold) { hasSpeechRef.current = true; clearSilenceTimer(); } else if (hasSpeechRef.current && !silenceTimerRef.current) { - // Speech detected before, now silence — start countdown silenceTimerRef.current = setTimeout(() => { if (stoppedRef.current) return; stoppedRef.current = true; @@ -103,12 +97,18 @@ export function useMicrophone({ } }, [onAudioChunk, onSpeechEnd, silenceTimeout, silenceThreshold, clearSilenceTimer, cleanup]); + // Stop and emit speech_end const stop = useCallback(() => { if (stoppedRef.current) return; - stoppedRef.current = true; cleanup(); onSpeechEnd(); }, [onSpeechEnd, cleanup]); - return { recording, start, stop }; + // Stop silently — no speech_end emitted + const silentStop = useCallback(() => { + if (stoppedRef.current) return; + cleanup(); + }, [cleanup]); + + return { recording, start, stop, silentStop }; } diff --git a/apps/simulator/src/hooks/useSocket.ts b/apps/simulator/src/hooks/useSocket.ts index 083df99..2d32d25 100644 --- a/apps/simulator/src/hooks/useSocket.ts +++ b/apps/simulator/src/hooks/useSocket.ts @@ -1,6 +1,5 @@ import { useRef, useState, useCallback } from 'react'; import { io, Socket } from 'socket.io-client'; -import { useAudioPlayer } from './useAudioPlayer'; export type RobotState = 'disconnected' | 'idle' | 'listening' | 'thinking' | 'speaking'; @@ -16,12 +15,35 @@ export function useSocket() { const [state, setState] = useState('disconnected'); const [connected, setConnected] = useState(false); const [logs, setLogs] = useState([]); - const audioPlayer = useAudioPlayer(); const addLog = useCallback((direction: LogEntry['direction'], event: string, data?: string) => { setLogs((prev) => [...prev.slice(-200), { timestamp: new Date(), direction, event, data }]); }, []); + const audioRef = useRef(null); + + const playAudio = useCallback((audioBase64: string) => { + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current = null; + } + + const audio = new Audio(`data:audio/wav;base64,${audioBase64}`); + audioRef.current = audio; + + audio.onended = () => { + audioRef.current = null; + setState('idle'); + }; + + audio.onerror = () => { + audioRef.current = null; + setState('idle'); + }; + + audio.play(); + }, []); + const connect = useCallback( (serverUrl: string, deviceToken: string) => { if (socketRef.current) { @@ -52,31 +74,35 @@ export function useSocket() { socket.on('status', (payload: { state: RobotState }) => { setState(payload.state); addLog('in', 'status', payload.state); - if (payload.state === 'idle') { - audioPlayer.flush(); + }); + + socket.on('response_text', (payload: { text: string; audio?: string }) => { + addLog('in', 'response_text', payload.text); + if (payload.audio) { + playAudio(payload.audio); + } else { + setState('idle'); } }); socket.on('audio_chunk', (payload: { data: string }) => { - addLog('in', 'audio_chunk', `${payload.data?.length ?? 0} chars (base64)`); - if (payload.data) { - audioPlayer.playChunk(payload.data); - } + addLog('in', 'audio_chunk', `${payload.data?.length ?? 0} chars`); }); socket.on('notification', (payload: Record) => { addLog('in', 'notification', JSON.stringify(payload)); }); - socket.on('response_start', () => addLog('in', 'response_start')); - socket.on('response_end', () => addLog('in', 'response_end')); - socketRef.current = socket; }, - [addLog], + [addLog, playAudio], ); const disconnect = useCallback(() => { + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current = null; + } socketRef.current?.disconnect(); socketRef.current = null; setConnected(false);