rubis/apps/api/commands/ocr_validate.ts

import { BaseCommand, flags } from '@adonisjs/core/ace'
import type { CommandOptions } from '@adonisjs/core/types/ace'
import { readFile, readdir } from 'node:fs/promises'
import { extname, join, basename } from 'node:path'
import drive from '@adonisjs/drive/services/main'
import { randomUUID } from 'node:crypto'

import { getOcrProvider } from '#services/ocr/index'
import type { OcrResult } from '#services/ocr/ocr_provider'

/**
 * Commande de validation OCR — mesure la qualité d'extraction du
 * provider courant sur un set de factures réelles avec ground truth.
 *
 * Usage :
 *
 *   # Avec le provider courant (.env) :
 *   node ace ocr:validate
 *
 *   # Forcer Mistral (vrai OCR) :
 *   OCR_PROVIDER=mistral MISTRAL_API_KEY=... node ace ocr:validate
 *
 *   # Avec un dossier custom :
 *   node ace ocr:validate --fixtures-dir=path/to/pdfs
 *
 *   # JSON report :
 *   node ace ocr:validate --out=ocr-report.json
 *
 * Format des fixtures :
 *   - `<name>.pdf` (ou .png/.jpg) : facture à OCRiser
 *   - `<name>.expected.json` : ground truth avec :
 *       {
 *         "expected": {
 *           "clientName": "...",
 *           "clientEmail": "..." | null,
 *           "numero": "...",
 *           "amountTtcCents": 124000,
 *           "issueDate": "2024-04-15",   // YYYY-MM-DD
 *           "dueDate": "2024-05-15"
 *         },
 *         "notes": "facture B2B classique"   // libre, ignoré par la commande
 *       }
 *
 * Tolérances :
 *   - amountTtcCents : exact (la précision financière compte)
 *   - issueDate / dueDate : jour exact (heure ignorée)
 *   - numero : exact (case-insensitive, trim)
 *   - clientName : exact (case-insensitive, trim) OU similarité Jaccard sur les mots ≥ 85 % (Levenshtein non implémenté)
 *   - clientEmail : exact (lowercased) ou null
 *
 * Pour ajouter une facture au bench :
 *   1. Dépose `e2e/fixtures/invoices/ma-facture.pdf`
 *   2. Crée `e2e/fixtures/invoices/ma-facture.expected.json` avec les
 *      valeurs lisibles sur la facture
 *   3. Relance la commande
 */

/**
 * Ground-truth values for one invoice fixture, i.e. the content of the
 * `expected` key of a `<name>.expected.json` file. These are the six fields
 * compared against the OCR output.
 */
type ExpectedFields = {
  clientName: string
  // `null` means the invoice shows no client e-mail.
  clientEmail: string | null
  // Presumably the invoice number as printed on the document.
  numero: string
  // Amount including tax, expressed in cents (compared for strict equality).
  amountTtcCents: number
  issueDate: string // YYYY-MM-DD
  // Same textual format as `issueDate`; only the first 10 characters are compared.
  dueDate: string
}

/** Shape of a `<name>.expected.json` ground-truth file once parsed. */
type ExpectedFile = {
  // Values the OCR output is compared against.
  expected: ExpectedFields
  // Free-form remark for humans; never read by the command.
  notes?: string
}

/** Outcome of comparing one ground-truth field with the value returned by the OCR provider. */
type FieldComparison = {
  // Name of the compared field.
  field: keyof ExpectedFields
  // Ground-truth value.
  expected: string | number | null
  // Value extracted by the provider.
  got: string | number | null
  // Whether `got` is within the tolerance defined for this field.
  match: boolean
  // Optional human-readable explanation printed next to the row.
  reason?: string
  // Confidence reported by the provider for this field; printed with two decimals.
  confidence?: number
}

/** Result recorded for one fixture that went through extraction and comparison. */
type FixtureResult = {
  // Fixture file name (directory entry, not a full path).
  filename: string
  // Elapsed time measured for the extraction step, in milliseconds.
  durationMs: number
  // One row per compared field.
  fields: FieldComparison[]
  /** True when every field matches within its tolerance. */
  allMatch: boolean
}

// File extensions (compared lower-cased) that mark a directory entry as an invoice fixture to OCR.
const SUPPORTED_EXT = new Set(['.pdf', '.png', '.jpg', '.jpeg'])

/**
 * Ace command `ocr:validate`: runs the configured OCR provider over every
 * invoice fixture of a directory and compares the extracted fields with the
 * sibling `<name>.expected.json` ground truth.
 *
 * The exit code is set to 1 when:
 *   - the fixtures directory cannot be read or contains no supported file;
 *   - a ground-truth file exists but is malformed;
 *   - an extraction fails (after retries);
 *   - at least one compared field is out of tolerance.
 * A fixture without any ground-truth file is skipped with a warning and does
 * not affect the exit code.
 */
export default class OcrValidate extends BaseCommand {
  static commandName = 'ocr:validate'
  static description =
    "Bench OCR : compare l'extraction du provider courant à des ground truth (e2e/fixtures/invoices/)"

  static options: CommandOptions = {
    startApp: true,
  }

  @flags.string({
    description: "Dossier des fixtures (default: e2e/fixtures/invoices)",
  })
  declare fixturesDir: string

  @flags.string({
    description: 'Path du rapport JSON en sortie (optionnel)',
  })
  declare out: string

  @flags.number({
    description:
      "Délai en ms entre deux appels provider (anti rate-limit). Default: 1500 ms pour Mistral free tier.",
  })
  declare delayMs: number

  /**
   * Command entry point: lists the fixtures, runs the provider on each one
   * that has a ground truth, prints per-fixture and aggregated results and
   * optionally writes a JSON report to `--out`.
   */
  async run() {
    // The working directory is apps/api when the command runs (pnpm --filter
    // or ace directly), so the default resolves to e2e/fixtures/invoices at
    // the monorepo root.
    const dir = this.fixturesDir ?? '../../e2e/fixtures/invoices'
    const provider = getOcrProvider()
    this.logger.info(`→ Provider OCR : ${provider.constructor.name}`)
    this.logger.info(`→ Fixtures dir : ${dir}`)

    let entries: string[]
    try {
      entries = await readdir(dir)
    } catch {
      this.logger.error(
        `Dossier introuvable : ${dir}. Créer le dossier et y poser tes PDFs + .expected.json.`
      )
      this.exitCode = 1
      return
    }

    const pdfs = entries.filter((e) => SUPPORTED_EXT.has(extname(e).toLowerCase()))
    if (pdfs.length === 0) {
      this.logger.warning(
        `Aucun PDF/PNG/JPG dans ${dir}. Voir le format dans le header de cette commande.`
      )
      this.exitCode = 1
      return
    }

    this.logger.info(`→ ${pdfs.length} fixture(s) à valider\n`)

    const results: FixtureResult[] = []
    // Fixtures that could not be evaluated (bad ground truth or failed
    // extraction). They are excluded from the stats but must fail the run.
    const failedFixtures: string[] = []
    const driveDisk = drive.use()
    const delayMs = this.delayMs ?? 1500
    let providerCalled = false

    for (const pdf of pdfs) {
      const expectedPath = join(dir, basename(pdf, extname(pdf)) + '.expected.json')
      const expected = await this.loadGroundTruth(expectedPath, pdf)
      if (expected === 'missing') continue
      if (expected === 'invalid') {
        failedFixtures.push(pdf)
        continue
      }

      // Throttle only between real provider calls: a skipped fixture did not
      // consume any rate-limit budget.
      if (providerCalled && delayMs > 0) {
        await this.pause(delayMs)
      }
      providerCalled = true

      const buffer = await readFile(join(dir, pdf))
      // Temporary upload to the current disk (MinIO in dev) so the provider
      // (Mistral) can download the file again, as it does in production.
      const storageKey = `ocr-validate/${randomUUID()}/${pdf}`

      let extraction: { ocrResult: OcrResult; durationMs: number } | null
      try {
        await driveDisk.put(storageKey, buffer)
        extraction = await this.extractWithRetry(provider, storageKey, pdf)
      } finally {
        // Best-effort cleanup, executed even when the upload or the
        // extraction throws unexpectedly.
        await driveDisk.delete(storageKey).catch(() => {})
      }

      if (!extraction) {
        failedFixtures.push(pdf)
        continue
      }

      const { ocrResult, durationMs } = extraction
      const comparisons = compareFields(expected, ocrResult)
      const allMatch = comparisons.every((c) => c.match)
      results.push({ filename: pdf, durationMs, fields: comparisons, allMatch })

      this.printFixtureResult(pdf, durationMs, comparisons, allMatch)
    }

    this.printSummary(results)
    if (failedFixtures.length > 0) {
      this.logger.error(
        `✗ ${failedFixtures.length} fixture(s) en erreur (hors stats) : ${failedFixtures.join(', ')}`
      )
    }

    if (this.out) {
      const { writeFile } = await import('node:fs/promises')
      await writeFile(
        this.out,
        JSON.stringify({ provider: provider.constructor.name, results, failedFixtures }, null, 2)
      )
      this.logger.info(`✔ Rapport JSON écrit : ${this.out}`)
    }

    // Exit 1 when a fixture mismatched or could not be evaluated — useful in CI.
    if (failedFixtures.length > 0 || results.some((r) => !r.allMatch)) {
      this.exitCode = 1
    }
  }

  /**
   * Reads and parses the ground-truth file of a fixture.
   *
   * @param expectedPath Path of the `<name>.expected.json` file.
   * @param pdf Fixture file name, used in log messages only.
   * @returns The expected fields, `'missing'` when the file cannot be read,
   *   or `'invalid'` when it is not valid JSON or has no `expected` object.
   */
  private async loadGroundTruth(
    expectedPath: string,
    pdf: string
  ): Promise<ExpectedFields | 'missing' | 'invalid'> {
    let raw: string
    try {
      raw = await readFile(expectedPath, 'utf-8')
    } catch {
      this.logger.warning(`⚠ Pas de ${expectedPath} — skip ${pdf}`)
      return 'missing'
    }

    try {
      const parsed = JSON.parse(raw) as Partial<ExpectedFile> | null
      const expected = parsed?.expected
      if (typeof expected === 'object' && expected !== null) return expected
      this.logger.error(`✗ ${expectedPath} invalide : clé "expected" absente — skip ${pdf}`)
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err)
      this.logger.error(`✗ ${expectedPath} invalide : ${msg} — skip ${pdf}`)
    }
    return 'invalid'
  }

  /**
   * Calls the provider, retrying on rate-limit errors (message containing
   * `429` or `Rate limit`) with a linear backoff of 30 s, 60 s then 90 s
   * (at most 3 retries, i.e. 3 minutes of waiting before giving up).
   *
   * @returns The OCR result together with the duration of the successful
   *   attempt only (backoff sleeps and failed attempts are excluded so the
   *   reported latency reflects the provider), or `null` when the extraction
   *   failed; the failure is logged here.
   */
  private async extractWithRetry(
    provider: ReturnType<typeof getOcrProvider>,
    storageKey: string,
    filename: string
  ): Promise<{ ocrResult: OcrResult; durationMs: number } | null> {
    const maxRetries = 3
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      const startedAt = Date.now()
      try {
        const ocrResult = await provider.extract({ storageKey, filename })
        return { ocrResult, durationMs: Date.now() - startedAt }
      } catch (err) {
        // A rejection is not guaranteed to be an Error instance.
        const msg = err instanceof Error ? err.message : String(err)
        const isRateLimit = msg.includes('429') || msg.includes('Rate limit')
        if (!isRateLimit || attempt === maxRetries) {
          this.logger.error(`✗ ${filename} — extraction throw : ${msg}`)
          return null
        }
        const waitSec = 30 + attempt * 30
        this.logger.warning(
          `⏸ ${filename} — 429 rate limit, retry dans ${waitSec}s (attempt ${attempt + 1}/${maxRetries})`
        )
        await this.pause(waitSec * 1000)
      }
    }
    return null
  }

  /** Resolves after `ms` milliseconds. */
  private pause(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }

  /** Prints the status line of a fixture followed by one line per compared field. */
  private printFixtureResult(
    filename: string,
    durationMs: number,
    fields: FieldComparison[],
    allMatch: boolean
  ) {
    const status = allMatch ? '✔' : '✗'
    this.logger.info(`\n${status} ${filename}  (${durationMs} ms)`)
    for (const f of fields) {
      const icon = f.match ? '  ✓' : '  ✗'
      const conf = f.confidence !== undefined ? ` conf=${f.confidence.toFixed(2)}` : ''
      const reason = f.reason ? `  [${f.reason}]` : ''
      this.logger.info(
        `${icon} ${f.field.padEnd(16)} expected=${formatValue(f.expected)}  got=${formatValue(
          f.got
        )}${conf}${reason}`
      )
    }
  }

  /**
   * Prints aggregated statistics: number of fully matching invoices,
   * field-level accuracy, average latency, then the accuracy per field.
   */
  private printSummary(results: FixtureResult[]) {
    const total = results.length
    const fullPass = results.filter((r) => r.allMatch).length
    const totalFields = results.reduce((sum, r) => sum + r.fields.length, 0)
    const matchedFields = results.reduce(
      (sum, r) => sum + r.fields.filter((f) => f.match).length,
      0
    )
    // Guard every ratio against an empty result set.
    const fieldAccuracy = totalFields > 0 ? (matchedFields / totalFields) * 100 : 0
    const docAccuracy = total > 0 ? (fullPass / total) * 100 : 0
    const avgLatency =
      results.reduce((sum, r) => sum + r.durationMs, 0) / Math.max(total, 1)

    this.logger.info('\n────────────────────────────────────────────────────────────')
    this.logger.info(`Total factures        : ${total}`)
    this.logger.info(`Factures 100 % match  : ${fullPass}  (${docAccuracy.toFixed(1)} %)`)
    this.logger.info(
      `Champs match (total)  : ${matchedFields}/${totalFields}  (${fieldAccuracy.toFixed(1)} %)`
    )
    this.logger.info(`Latence moyenne       : ${avgLatency.toFixed(0)} ms / facture`)
    this.logger.info('────────────────────────────────────────────────────────────\n')

    // Per-field breakdown.
    const fieldStats = new Map<string, { ok: number; total: number }>()
    for (const r of results) {
      for (const f of r.fields) {
        const s = fieldStats.get(f.field) ?? { ok: 0, total: 0 }
        s.total += 1
        if (f.match) s.ok += 1
        fieldStats.set(f.field, s)
      }
    }
    this.logger.info('Précision par champ :')
    for (const [field, s] of fieldStats) {
      const pct = (s.ok / s.total) * 100
      this.logger.info(`  ${field.padEnd(18)} ${s.ok}/${s.total}  (${pct.toFixed(1)} %)`)
    }
  }
}

// ---------------------------------------------------------------------------
// Comparison
// ---------------------------------------------------------------------------

/**
 * Builds the comparison rows for one fixture, one row per ground-truth field
 * and always in the same order: clientName, clientEmail, numero,
 * amountTtcCents, issueDate, dueDate.
 *
 * @param expected Ground-truth values read from the `.expected.json` file.
 * @param got Result returned by the OCR provider; each field is read through
 *   its `value` and `confidence` properties.
 * @returns The six rows; only the clientName row carries a `reason`, set when
 *   the name does not match.
 */
function compareFields(expected: ExpectedFields, got: OcrResult): FieldComparison[] {
  const ocr = got.fields
  const nameMatches = matchesName(expected.clientName, ocr.clientName.value)

  const nameRow: FieldComparison = {
    field: 'clientName',
    expected: expected.clientName,
    got: ocr.clientName.value,
    confidence: ocr.clientName.confidence,
    match: nameMatches,
    reason: nameMatches ? undefined : 'fuzzy similarity < 85 %',
  }

  const emailRow: FieldComparison = {
    field: 'clientEmail',
    expected: expected.clientEmail,
    got: ocr.clientEmail.value,
    confidence: ocr.clientEmail.confidence,
    match: matchesEmail(expected.clientEmail, ocr.clientEmail.value),
  }

  const numeroRow: FieldComparison = {
    field: 'numero',
    expected: expected.numero,
    got: ocr.numero.value,
    confidence: ocr.numero.confidence,
    match: matchesString(expected.numero, ocr.numero.value),
  }

  // Amounts are compared for strict equality: no tolerance on cents.
  const amountRow: FieldComparison = {
    field: 'amountTtcCents',
    expected: expected.amountTtcCents,
    got: ocr.amountTtcCents.value,
    confidence: ocr.amountTtcCents.confidence,
    match: expected.amountTtcCents === ocr.amountTtcCents.value,
  }

  const issueDateRow: FieldComparison = {
    field: 'issueDate',
    expected: expected.issueDate,
    got: ocr.issueDate.value,
    confidence: ocr.issueDate.confidence,
    match: matchesDate(expected.issueDate, ocr.issueDate.value),
  }

  const dueDateRow: FieldComparison = {
    field: 'dueDate',
    expected: expected.dueDate,
    got: ocr.dueDate.value,
    confidence: ocr.dueDate.confidence,
    match: matchesDate(expected.dueDate, ocr.dueDate.value),
  }

  return [nameRow, emailRow, numeroRow, amountRow, issueDateRow, dueDateRow]
}

/**
 * Case-insensitive, whitespace-trimmed string equality.
 *
 * Both inputs originate from unvalidated data (a parsed JSON ground truth and
 * a provider response), so anything that is not a string — `null`,
 * `undefined`, a number — is treated as a mismatch instead of throwing and
 * aborting the whole bench.
 *
 * @param a Expected value.
 * @param b Extracted value.
 * @returns True when both are strings that are equal after trim + lower-casing.
 */
function matchesString(a: string | null | undefined, b: string | null | undefined): boolean {
  if (typeof a !== 'string' || typeof b !== 'string') return false
  return a.trim().toLowerCase() === b.trim().toLowerCase()
}

/**
 * Compares two optional e-mail addresses.
 *
 * @param a Expected address, or `null` when none is expected.
 * @param b Extracted address, or `null` when none was extracted.
 * @returns True when both are `null`, or when both are set and equal after
 *   trim + lower-casing; false when only one of them is `null`.
 */
function matchesEmail(a: string | null, b: string | null): boolean {
  // With at least one null, the pair matches only if the other is null too.
  if (a === null || b === null) return a === b
  const normalize = (email: string): string => email.trim().toLowerCase()
  return normalize(a) === normalize(b)
}

/**
 * Day-level date comparison on textual dates.
 *
 * `a` is expected as `YYYY-MM-DD` and `b` as an ISO 8601 string; only the
 * first 10 characters of each are compared, so any time part is ignored. No
 * timezone conversion is performed.
 *
 * A value that is not a string (for instance a date the provider could not
 * extract, or a malformed ground truth) is a mismatch rather than a crash.
 *
 * @param a Expected date.
 * @param b Extracted date.
 * @returns True when both are strings sharing the same first 10 characters.
 */
function matchesDate(a: string | null | undefined, b: string | null | undefined): boolean {
  if (typeof a !== 'string' || typeof b !== 'string') return false
  return a.slice(0, 10) === b.slice(0, 10)
}

/**
 * Fuzzy comparison of client names.
 *
 * Names match when they are equal after trim + lower-casing, or when the
 * Jaccard similarity of their word sets is at least 0.85. Words are split on
 * whitespace and single-character tokens are dropped, which makes the
 * comparison insensitive to word order, repeated spaces and stray one-letter
 * tokens. Note that with a 0.85 threshold a single extra or differing word is
 * only tolerated when the names share about six words or more, so short names
 * effectively need identical word sets.
 *
 * A value that is not a string (e.g. a name the provider could not extract)
 * is a mismatch rather than a crash.
 *
 * @param a Expected name.
 * @param b Extracted name.
 * @returns True when the names are considered equivalent.
 */
function matchesName(a: string | null | undefined, b: string | null | undefined): boolean {
  if (typeof a !== 'string' || typeof b !== 'string') return false

  const expectedName = a.trim().toLowerCase()
  const extractedName = b.trim().toLowerCase()
  if (expectedName === extractedName) return true

  const tokenize = (name: string): Set<string> =>
    new Set(name.split(/\s+/).filter((token) => token.length > 1))
  const expectedTokens = tokenize(expectedName)
  const extractedTokens = tokenize(extractedName)
  // Without any usable token on one side there is nothing to compare.
  if (expectedTokens.size === 0 || extractedTokens.size === 0) return false

  let shared = 0
  for (const token of expectedTokens) {
    if (extractedTokens.has(token)) shared += 1
  }
  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const unionSize = expectedTokens.size + extractedTokens.size - shared
  return shared / unionSize >= 0.85
}

/**
 * Renders a compared value for the console output.
 *
 * @param v Value to display.
 * @returns The decimal form for a number, the literal `null` for `null`, and
 *   the JSON-quoted form for a string.
 */
function formatValue(v: string | number | null): string {
  if (typeof v === 'number') return `${v}`
  return v === null ? 'null' : JSON.stringify(v)
}