From 2be22da2ff35fbefa5d67abb54cadc10142485f1 Mon Sep 17 00:00:00 2001 From: ordinarthur <@arthurbarre.js@gmail.com> Date: Tue, 14 Apr 2026 02:31:30 +0200 Subject: [PATCH] add voxtral --- apps/backend/package.json | 4 +- .../inbound/websocket/robot.gateway.ts | 14 +- .../outbound/llm/anthropic.adapter.ts | 7 +- .../adapters/outbound/llm/openai.adapter.ts | 7 +- .../outbound/tts/elevenlabs.adapter.ts | 7 +- .../adapters/outbound/tts/mistral.adapter.ts | 122 ++++++++++++++++++ apps/backend/src/app.module.ts | 10 +- .../src/core/services/conversation.service.ts | 16 ++- pnpm-lock.yaml | 51 ++++++++ 9 files changed, 204 insertions(+), 34 deletions(-) create mode 100644 apps/backend/src/adapters/outbound/tts/mistral.adapter.ts diff --git a/apps/backend/package.json b/apps/backend/package.json index eab20af..5f68d36 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -22,13 +22,15 @@ "migration:revert": "pnpm typeorm migration:revert -d src/config/typeorm.config.ts" }, "dependencies": { + "@ai-sdk/mistral": "^3.0.30", "@anthropic-ai/sdk": "^0.80.0", "@deepgram/sdk": "^5.0.0", "@mastra/core": "^1.17.0", + "@mistralai/mistralai": "^2.2.0", "@nestjs/common": "^11.1.17", "@nestjs/config": "^4.0.3", - "@nestjs/event-emitter": "^3.0.0", "@nestjs/core": "^11.1.17", + "@nestjs/event-emitter": "^3.0.0", "@nestjs/jwt": "^11.0.2", "@nestjs/passport": "^11.0.5", "@nestjs/platform-express": "^11.1.17", diff --git a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts index 2546385..cfaa163 100644 --- a/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts +++ b/apps/backend/src/adapters/inbound/websocket/robot.gateway.ts @@ -121,20 +121,8 @@ export class RobotGateway implements OnGatewayConnection, OnGatewayDisconnect, I @ConnectedSocket() client: AuthenticatedSocket, @MessageBody() message: AudioChunkMessage, ) { - const chunk = message.data; - // Debug: log audio chunk info to diagnose STT issues - if (!this._audioLogCount) this._audioLogCount = 0; - this._audioLogCount++; - if (this._audioLogCount <= 3 || this._audioLogCount % 100 === 0) { - this.logger.debug( - `Audio chunk #${this._audioLogCount}: type=${typeof chunk}, isBuffer=${Buffer.isBuffer(chunk)}, ` + - `constructor=${chunk?.constructor?.name}, length=${chunk?.length ?? chunk?.byteLength ?? 'N/A'}, ` + - `sampleRate=${message.sampleRate}`, - ); - } - this.conversationPort.processAudioChunk(client.data.deviceId, chunk, message.sampleRate); + this.conversationPort.processAudioChunk(client.data.deviceId, message.data, message.sampleRate); } - private _audioLogCount = 0; @SubscribeMessage('speech_end') async handleSpeechEnd(@ConnectedSocket() client: AuthenticatedSocket) { diff --git a/apps/backend/src/adapters/outbound/llm/anthropic.adapter.ts b/apps/backend/src/adapters/outbound/llm/anthropic.adapter.ts index 4e52b40..48ecdef 100644 --- a/apps/backend/src/adapters/outbound/llm/anthropic.adapter.ts +++ b/apps/backend/src/adapters/outbound/llm/anthropic.adapter.ts @@ -16,11 +16,8 @@ export class AnthropicAdapter implements ILLMPort { private readonly model: string; constructor(private readonly configService: ConfigService) { - const apiKey = this.configService.get('ANTHROPIC_API_KEY'); - if (!apiKey) { - throw new Error('ANTHROPIC_API_KEY is not set'); - } - this.client = new Anthropic({ apiKey }); + const apiKey = this.configService.get('ANTHROPIC_API_KEY', ''); + this.client = new Anthropic({ apiKey: apiKey || 'unused' }); this.model = this.configService.get('ANTHROPIC_MODEL', 'claude-sonnet-4-20250514'); } diff --git a/apps/backend/src/adapters/outbound/llm/openai.adapter.ts b/apps/backend/src/adapters/outbound/llm/openai.adapter.ts index d0a7e33..d7d8778 100644 --- a/apps/backend/src/adapters/outbound/llm/openai.adapter.ts +++ b/apps/backend/src/adapters/outbound/llm/openai.adapter.ts @@ -16,11 +16,8 @@ export class OpenAIAdapter implements ILLMPort { private readonly model: string; constructor(private readonly configService: ConfigService) { - const apiKey = this.configService.get('OPENAI_API_KEY'); - if (!apiKey) { - throw new Error('OPENAI_API_KEY is not set'); - } - this.client = new OpenAI({ apiKey }); + const apiKey = this.configService.get('OPENAI_API_KEY', ''); + this.client = new OpenAI({ apiKey: apiKey || 'unused' }); this.model = this.configService.get('OPENAI_MODEL', 'gpt-4o'); } diff --git a/apps/backend/src/adapters/outbound/tts/elevenlabs.adapter.ts b/apps/backend/src/adapters/outbound/tts/elevenlabs.adapter.ts index 1946db7..e4a7482 100644 --- a/apps/backend/src/adapters/outbound/tts/elevenlabs.adapter.ts +++ b/apps/backend/src/adapters/outbound/tts/elevenlabs.adapter.ts @@ -10,11 +10,8 @@ export class ElevenLabsAdapter implements ITTSPort { private readonly defaultVoiceId: string; constructor(private readonly configService: ConfigService) { - const apiKey = this.configService.get('ELEVENLABS_API_KEY'); - if (!apiKey) { - throw new Error('ELEVENLABS_API_KEY is not set'); - } - this.client = new ElevenLabsClient({ apiKey }); + const apiKey = this.configService.get('ELEVENLABS_API_KEY', ''); + this.client = new ElevenLabsClient({ apiKey: apiKey || 'unused' }); this.defaultVoiceId = this.configService.get('ELEVENLABS_VOICE_ID', 'pFZP5JQG7iQjIQuC4Bku'); } diff --git a/apps/backend/src/adapters/outbound/tts/mistral.adapter.ts b/apps/backend/src/adapters/outbound/tts/mistral.adapter.ts new file mode 100644 index 0000000..2da68a4 --- /dev/null +++ b/apps/backend/src/adapters/outbound/tts/mistral.adapter.ts @@ -0,0 +1,122 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { Mistral } from '@mistralai/mistralai'; +import { ITTSPort } from '../../../core/ports/outbound/tts.port'; + +@Injectable() +export class MistralTTSAdapter implements ITTSPort { + private readonly logger = new Logger(MistralTTSAdapter.name); + private readonly client: Mistral; + private readonly voiceId: string; + private readonly model: string; + + constructor(private readonly configService: ConfigService) { + const apiKey = this.configService.get('MISTRAL_API_KEY'); + if (!apiKey) { + throw new Error('MISTRAL_API_KEY is not set'); + } + this.client = new Mistral({ apiKey }); + this.voiceId = this.configService.get('MISTRAL_TTS_VOICE', 'fr_marie_neutral'); + this.model = this.configService.get('MISTRAL_TTS_MODEL', 'voxtral-mini-tts-2603'); + } + + async synthesize(text: string, voice?: string): Promise { + const result = await this.client.audio.speech.complete({ + model: this.model, + input: text, + responseFormat: 'wav', + stream: false, + voiceId: voice || this.voiceId, + }); + + const wavBuffer = Buffer.from(result.audioData, 'base64'); + + // Extract raw PCM from WAV (skip 44-byte header) and resample to 16kHz + // if needed. Voxtral outputs 24kHz by default. + const pcm = this.extractPcmFromWav(wavBuffer); + return pcm; + } + + async synthesizeStream( + text: string, + voice?: string, + onChunk?: (chunk: Buffer) => void, + ): Promise { + // Voxtral doesn't support true streaming; synthesize and emit as a single chunk. + const pcm = await this.synthesize(text, voice); + onChunk?.(pcm); + } + + /** + * Extract raw PCM data from a WAV buffer and resample to 16kHz mono S16LE + * if the source sample rate differs. + */ + private extractPcmFromWav(wav: Buffer): Buffer { + // Parse WAV header + const sampleRate = wav.readUInt32LE(24); + const bitsPerSample = wav.readUInt16LE(34); + const numChannels = wav.readUInt16LE(22); + + // Find the 'data' chunk + let dataOffset = 12; + while (dataOffset < wav.length - 8) { + const chunkId = wav.toString('ascii', dataOffset, dataOffset + 4); + const chunkSize = wav.readUInt32LE(dataOffset + 4); + if (chunkId === 'data') { + dataOffset += 8; + break; + } + dataOffset += 8 + chunkSize; + } + + let pcm = wav.subarray(dataOffset); + + this.logger.debug( + `WAV: ${sampleRate}Hz, ${bitsPerSample}bit, ${numChannels}ch, ${pcm.length} bytes PCM`, + ); + + // Convert to mono if stereo + if (numChannels === 2 && bitsPerSample === 16) { + const monoSamples = pcm.length / 4; + const mono = Buffer.alloc(monoSamples * 2); + for (let i = 0; i < monoSamples; i++) { + const left = pcm.readInt16LE(i * 4); + const right = pcm.readInt16LE(i * 4 + 2); + mono.writeInt16LE(Math.round((left + right) / 2), i * 2); + } + pcm = mono; + } + + // Resample to 16kHz if needed (simple linear interpolation) + if (sampleRate !== 16000) { + pcm = this.resample(pcm, sampleRate, 16000); + } + + return pcm; + } + + /** + * Simple linear-interpolation resampler for 16-bit mono PCM. + */ + private resample(pcm: Buffer, fromRate: number, toRate: number): Buffer { + const ratio = fromRate / toRate; + const srcSamples = pcm.length / 2; + const dstSamples = Math.floor(srcSamples / ratio); + const out = Buffer.alloc(dstSamples * 2); + + for (let i = 0; i < dstSamples; i++) { + const srcPos = i * ratio; + const srcIdx = Math.floor(srcPos); + const frac = srcPos - srcIdx; + + const s0 = pcm.readInt16LE(Math.min(srcIdx, srcSamples - 1) * 2); + const s1 = pcm.readInt16LE(Math.min(srcIdx + 1, srcSamples - 1) * 2); + const sample = Math.round(s0 + frac * (s1 - s0)); + + out.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2); + } + + this.logger.debug(`Resampled ${fromRate}→${toRate}Hz: ${srcSamples}→${dstSamples} samples`); + return out; + } +} diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index c4327f6..1046aee 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -32,6 +32,7 @@ import { DeepgramAdapter } from './adapters/outbound/stt/deepgram.adapter'; import { AnthropicAdapter } from './adapters/outbound/llm/anthropic.adapter'; import { OpenAIAdapter } from './adapters/outbound/llm/openai.adapter'; import { ElevenLabsAdapter } from './adapters/outbound/tts/elevenlabs.adapter'; +import { MistralTTSAdapter } from './adapters/outbound/tts/mistral.adapter'; import { RedisAdapter } from './adapters/outbound/cache/redis.adapter'; import { CONVERSATION_PORT } from './core/ports/inbound/conversation.port'; import { HEALTH_TELEMETRY_PORT } from './core/ports/inbound/health-telemetry.port'; @@ -108,7 +109,14 @@ import { CACHE_PORT } from './core/ports/outbound/cache.port'; }, { provide: TTS_PORT, - useClass: ElevenLabsAdapter, + inject: [ConfigService], + useFactory: (configService: ConfigService) => { + const provider = configService.get('TTS_PROVIDER', 'elevenlabs'); + if (provider === 'mistral') { + return new MistralTTSAdapter(configService); + } + return new ElevenLabsAdapter(configService); + }, }, { provide: CACHE_PORT, diff --git a/apps/backend/src/core/services/conversation.service.ts b/apps/backend/src/core/services/conversation.service.ts index 5344564..ef2cf56 100644 --- a/apps/backend/src/core/services/conversation.service.ts +++ b/apps/backend/src/core/services/conversation.service.ts @@ -45,10 +45,18 @@ export class ConversationService implements IConversationPort { private readonly configService: ConfigService, ) { const provider = this.configService.get('LLM_PROVIDER', 'anthropic'); - const model = - provider === 'openai' - ? `openai/${this.configService.get('OPENAI_MODEL', 'gpt-4o')}` - : `anthropic/${this.configService.get('ANTHROPIC_MODEL', 'claude-sonnet-4-20250514')}`; + let model: string; + switch (provider) { + case 'openai': + model = `openai/${this.configService.get('OPENAI_MODEL', 'gpt-4o')}`; + break; + case 'mistral': + model = `mistral/${this.configService.get('MISTRAL_MODEL', 'ministral-3b-latest')}`; + break; + default: + model = `anthropic/${this.configService.get('ANTHROPIC_MODEL', 'claude-sonnet-4-20250514')}`; + break; + } this.agent = new Agent({ id: 'ti-pote', diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f8a9519..641ebf3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -10,6 +10,9 @@ importers: apps/backend: dependencies: + '@ai-sdk/mistral': + specifier: ^3.0.30 + version: 3.0.30(zod@4.3.6) '@anthropic-ai/sdk': specifier: ^0.80.0 version: 0.80.0(zod@4.3.6) @@ -19,6 +22,9 @@ importers: '@mastra/core': specifier: ^1.17.0 version: 1.17.0(@standard-community/standard-json@0.3.5(@standard-schema/spec@1.1.0)(@types/json-schema@7.0.15)(quansync@0.2.11)(zod-to-json-schema@3.25.2(zod@4.3.6))(zod@4.3.6))(@standard-community/standard-openapi@0.2.9(@standard-community/standard-json@0.3.5(@standard-schema/spec@1.1.0)(@types/json-schema@7.0.15)(quansync@0.2.11)(zod-to-json-schema@3.25.2(zod@4.3.6))(zod@4.3.6))(@standard-schema/spec@1.1.0)(openapi-types@12.1.3)(zod@4.3.6))(@types/json-schema@7.0.15)(openapi-types@12.1.3)(zod@4.3.6) + '@mistralai/mistralai': + specifier: ^2.2.0 + version: 2.2.0 '@nestjs/common': specifier: ^11.1.17 version: 11.1.17(class-transformer@0.5.1)(class-validator@0.15.1)(reflect-metadata@0.2.2)(rxjs@7.8.2) @@ -284,6 +290,12 @@ packages: resolution: {integrity: sha512-VTDuRS5V0ATbJ/LkaQlisMnTAeYKXAK6scMguVBstf+KIBQ7HIuKhiXLv+G/hvejkV+THoXzoNifInAkU81P1g==} engines: {node: '>=18'} + '@ai-sdk/mistral@3.0.30': + resolution: {integrity: sha512-+j4IXRSk9E661cFSafmIr+XHOzwjFagawwzMOlSqwL6U4Sq4PCFLDF+oHbX5NUqNjUL7FD1zi/9lBIfa41pUvw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + '@ai-sdk/provider-utils@2.2.8': resolution: {integrity: sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==} engines: {node: '>=18'} @@ -302,6 +314,12 @@ packages: peerDependencies: zod: ^3.25.76 || ^4.1.8 + '@ai-sdk/provider-utils@4.0.23': + resolution: {integrity: sha512-z8GlDaCmRSDlqkMF2f4/RFgWxdarvIbyuk+m6WXT1LYgsnGiXRJGTD2Z1+SDl3LqtFuRtGX1aghYvQLoHL/9pg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + '@ai-sdk/provider@1.1.3': resolution: {integrity: sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==} engines: {node: '>=18'} @@ -318,6 +336,10 @@ packages: resolution: {integrity: sha512-2Xmoq6DBJqmSl80U6V9z5jJSJP7ehaJJQMy2iFUqTay06wdCqTnPVBBQbtEL8RCChenL+q5DC5H5WzU3vV3v8w==} engines: {node: '>=18'} + '@ai-sdk/provider@3.0.8': + resolution: {integrity: sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ==} + engines: {node: '>=18'} + '@ai-sdk/ui-utils@1.2.11': resolution: {integrity: sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==} engines: {node: '>=18'} @@ -1205,6 +1227,9 @@ packages: peerDependencies: zod: ^3.25.0 || ^4.0.0 + '@mistralai/mistralai@2.2.0': + resolution: {integrity: sha512-JQUGIXjFWnw/J9LpTSf/ZXwVW3Sh8FBAcfTo5QvAHqkl4CfSiIwnjRJhMoAFcP6ncCe84YPU1ncDGX+p3OXnfg==} + '@modelcontextprotocol/sdk@1.28.0': resolution: {integrity: sha512-gmloF+i+flI8ouQK7MWW4mOwuMh4RePBuPFAEPC6+pdqyWOUMDOixb6qZ69owLJpz6XmyllCouc4t8YWO+E2Nw==} engines: {node: '>=18'} @@ -5573,6 +5598,12 @@ snapshots: transitivePeerDependencies: - supports-color + '@ai-sdk/mistral@3.0.30(zod@4.3.6)': + dependencies: + '@ai-sdk/provider': 3.0.8 + '@ai-sdk/provider-utils': 4.0.23(zod@4.3.6) + zod: 4.3.6 + '@ai-sdk/provider-utils@2.2.8(zod@4.3.6)': dependencies: '@ai-sdk/provider': 1.1.3 @@ -5594,6 +5625,13 @@ snapshots: eventsource-parser: 3.0.6 zod: 4.3.6 + '@ai-sdk/provider-utils@4.0.23(zod@4.3.6)': + dependencies: + '@ai-sdk/provider': 3.0.8 + '@standard-schema/spec': 1.1.0 + eventsource-parser: 3.0.6 + zod: 4.3.6 + '@ai-sdk/provider@1.1.3': dependencies: json-schema: 0.4.0 @@ -5610,6 +5648,10 @@ snapshots: dependencies: json-schema: 0.4.0 + '@ai-sdk/provider@3.0.8': + dependencies: + json-schema: 0.4.0 + '@ai-sdk/ui-utils@1.2.11(zod@4.3.6)': dependencies: '@ai-sdk/provider': 1.1.3 @@ -6529,6 +6571,15 @@ snapshots: zod-from-json-schema-v3: zod-from-json-schema@0.0.5 zod-to-json-schema: 3.25.2(zod@4.3.6) + '@mistralai/mistralai@2.2.0': + dependencies: + ws: 8.20.0 + zod: 4.3.6 + zod-to-json-schema: 3.25.2(zod@4.3.6) + transitivePeerDependencies: + - bufferutil + - utf-8-validate + '@modelcontextprotocol/sdk@1.28.0(zod@4.3.6)': dependencies: '@hono/node-server': 1.19.11(hono@4.12.9)