freedge/backend/src/ai/transcriber.ts
ordinarthur 9dbd7e0ba9 feat(ai): quality-first recipe pipeline with structured outputs
Overhauls the backend AI module to produce better recipes and better
images, more reliably and at lower cost.

New src/ai/ module:
- prompts.ts: long 'Chef Antoine' system prompt (~1500 tokens) with
  explicit originality rules, technical precision requirements, vocal
  transcription handling, and 3 few-shot style examples. Long enough
  to benefit from OpenAI's automatic prompt caching (-50% on cached
  portion from the 2nd call onward).
- recipe-generator.ts: uses Structured Outputs (json_schema strict).
  Rich schema: titre, description, origine_inspiration, ingredients
  with quantity/notes/complement flag, numbered etapes with per-step
  duration, conseils array, accord_boisson. No more JSON.parse crashes.
- image-generator.ts: switched from dall-e-3 to gpt-image-1 (medium
  quality by default). Much better photographic realism. Dedicated
  magazine-style prompt (editorial food photography, 45-deg overhead,
  natural light, stoneware). Slugify preserves extended Latin chars
  (cote-de-boeuf not c-te-de-b-uf).
- transcriber.ts: migrated from whisper-1 to gpt-4o-mini-transcribe
  (50% cheaper, better on French). Includes a context prompt to bias
  toward culinary vocabulary.
- cost.ts: centralized pricing table + helpers. Every OpenAI call now
  emits a structured log with model, durationMs, costUsd, usage, and
  cacheHit flag.
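The accent-preserving slugify mentioned above can be sketched as follows. This is an illustrative version only (the real helper lives in image-generator.ts and may differ): it maps the œ/æ ligatures explicitly, because unlike accented letters they do not decompose under Unicode normalization, then strips combining marks from the NFD form.

```typescript
// Hypothetical sketch of an accent-aware slugify; the actual implementation
// in image-generator.ts is not shown in this commit.
function slugify(input: string): string {
  return input
    // Ligatures have no canonical decomposition, so map them by hand
    .replace(/[œŒ]/g, 'oe')
    .replace(/[æÆ]/g, 'ae')
    // NFD splits accented letters into base letter + combining mark…
    .normalize('NFD')
    // …then the combining marks (U+0300–U+036F) can be dropped
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    // Collapse everything non-alphanumeric into single hyphens
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');
}

console.log(slugify('Côte de bœuf')); // "cote-de-boeuf"
```

A naive `replace(/[^a-z0-9]+/g, '-')` alone would have produced the "c-te-de-b-uf" output the commit message calls out.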

Plugin refactor:
- plugins/ai.ts now delegates to src/ai/* and only keeps the Fastify
  decoration glue + storage fallback for audio.
- OpenAI client configured with maxRetries=3, timeout=60s.
- Image generation runs in parallel with the recipe flatten/serialize
  step (minor speedup, ~0.5s).
- flattenRecipe() converts the rich structured recipe into the legacy
  flat RecipeData shape (for Prisma columns) while preserving the
  structured form in recipeData.structured.
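The shape flattenRecipe() bridges can be sketched like this. The field names below are assumptions for illustration (the real StructuredRecipe/RecipeData types live in src/ai/recipe-generator.ts and are not shown in this commit): each rich ingredient and step is rendered to a legacy display string, while the full structured object is kept alongside.

```typescript
// Hypothetical shapes, assumed for illustration only.
interface StructuredRecipe {
  titre: string;
  description: string;
  ingredients: { nom: string; quantite: string; notes?: string }[];
  etapes: { numero: number; texte: string; duree_min?: number }[];
  conseils: string[];
}

interface RecipeData {
  title: string;
  description: string;
  ingredients: string[]; // legacy flat form: one display string per ingredient
  steps: string[];
  structured: StructuredRecipe; // rich form preserved for future frontends
}

function flattenRecipe(recipe: StructuredRecipe): RecipeData {
  return {
    title: recipe.titre,
    description: recipe.description,
    ingredients: recipe.ingredients.map(
      (i) => `${i.quantite} ${i.nom}${i.notes ? ` (${i.notes})` : ''}`
    ),
    steps: recipe.etapes.map(
      (e) => `${e.numero}. ${e.texte}${e.duree_min ? ` (${e.duree_min} min)` : ''}`
    ),
    structured: recipe,
  };
}
```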

Routes:
- recipes.ts stores the structured JSON in generatedRecipe (instead
  of the lossy flattened form), enabling future frontends to render
  rich recipes with per-ingredient notes and step timers.

Env vars:
- OPENAI_TRANSCRIBE_MODEL, OPENAI_IMAGE_MODEL, OPENAI_IMAGE_QUALITY,
  OPENAI_IMAGE_SIZE, OPENAI_MAX_RETRIES, OPENAI_TIMEOUT_MS
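As a .env example, using the defaults this commit states where it states them; the OPENAI_IMAGE_SIZE value is a placeholder, since no default is named here:

```ini
OPENAI_TRANSCRIBE_MODEL=gpt-4o-mini-transcribe
OPENAI_IMAGE_MODEL=gpt-image-1
OPENAI_IMAGE_QUALITY=medium
OPENAI_IMAGE_SIZE=1024x1024   # placeholder; commit does not state a default
OPENAI_MAX_RETRIES=3
OPENAI_TIMEOUT_MS=60000
```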

Cost per recipe (estimated):
- Before: ~$0.044 (whisper $0.003 + 4o-mini $0.0004 + dall-e-3 $0.04)
- After : ~$0.018 (4o-mini-transcribe $0.0015 + 4o-mini $0.0004
  + gpt-image-1 medium $0.0165), i.e. about 59% cheaper.
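The transcription figures above are consistent with OpenAI's published per-minute audio pricing applied to roughly 30 seconds of audio, so the cost.ts helper imported below can be sketched like this (a sketch under that pricing assumption, not the actual cost.ts):

```typescript
// Assumed per-minute rates: $0.006/min for whisper-1 and $0.003/min for
// gpt-4o-mini-transcribe (which also matches the "50% cheaper" claim).
const AUDIO_USD_PER_MINUTE: Record<string, number> = {
  'whisper-1': 0.006,
  'gpt-4o-mini-transcribe': 0.003,
};

export function computeTranscribeCost(model: string, minutes: number): number {
  // Unknown models cost $0 rather than throwing, so logging never breaks a call
  const rate = AUDIO_USD_PER_MINUTE[model] ?? 0;
  return rate * minutes;
}

console.log(computeTranscribeCost('gpt-4o-mini-transcribe', 0.5)); // 0.0015
```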

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:46:18 +02:00


import type { OpenAI } from 'openai';
import type { FastifyBaseLogger } from 'fastify';
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as os from 'node:os';
import { computeTranscribeCost, type CallLog } from './cost';

const TRANSCRIBE_MODEL = process.env.OPENAI_TRANSCRIBE_MODEL || 'gpt-4o-mini-transcribe';
export interface TranscribeOptions {
  audioPath: string;
  userId?: string;
  /** ISO-639-1 language hint (default: 'fr') to improve accuracy */
  language?: string;
}

export interface TranscribeResult {
  text: string;
  log: CallLog;
}

interface TempFile {
  path: string;
  cleanup: () => void;
}
export async function downloadToTemp(
  url: string,
  extension = '.mp3'
): Promise<TempFile> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Échec téléchargement audio: ${response.statusText}`);
  }
  const buffer = Buffer.from(await response.arrayBuffer());
  const tempFilePath = path.join(os.tmpdir(), `freedge-audio-${Date.now()}${extension}`);
  fs.writeFileSync(tempFilePath, buffer);
  return {
    path: tempFilePath,
    cleanup: () => {
      try {
        fs.unlinkSync(tempFilePath);
      } catch {
        /* ignore */
      }
    },
  };
}
/**
 * Transcribes a local audio file to text.
 *
 * Uses `gpt-4o-mini-transcribe` by default (50% cheaper than whisper-1
 * and better on Romance languages).
 */
export async function transcribeAudio(
  openai: OpenAI,
  logger: FastifyBaseLogger,
  options: TranscribeOptions
): Promise<TranscribeResult> {
  const start = Date.now();

  const stat = fs.statSync(options.audioPath);
  if (stat.size === 0) {
    throw new Error('Fichier audio vide');
  }

  const transcription = await openai.audio.transcriptions.create({
    file: fs.createReadStream(options.audioPath),
    model: TRANSCRIBE_MODEL,
    language: options.language || 'fr',
    // Short context prompt to bias the model toward culinary vocabulary
    prompt:
      "Énumération d'ingrédients culinaires en français. Vocabulaire de cuisine.",
  });

  const text = transcription.text.trim();
  if (!text) {
    throw new Error('Transcription vide');
  }

  // Estimate audio duration from file size (rough, but good enough for
  // cost tracking): 1 minute of MP3 at 128 kbps ≈ 1 MB.
  const estimatedMinutes = Math.max(0.1, stat.size / (1024 * 1024));
  const cost = computeTranscribeCost(TRANSCRIBE_MODEL, estimatedMinutes);

  const log: CallLog = {
    userId: options.userId,
    operation: 'transcription',
    model: TRANSCRIBE_MODEL,
    durationMs: Date.now() - start,
    costUsd: cost,
    usage: { audioMinutes: estimatedMinutes },
  };
  logger.info(log, 'openai_audio_transcribed');

  return { text, log };
}