import { BaseCommand, flags } from '@adonisjs/core/ace'
import type { CommandOptions } from '@adonisjs/core/types/ace'
import { readFile, readdir, writeFile } from 'node:fs/promises'
import { extname, join, basename } from 'node:path'
import drive from '@adonisjs/drive/services/main'
import { randomUUID } from 'node:crypto'
import { getOcrProvider } from '#services/ocr/index'
import type { OcrResult } from '#services/ocr/ocr_provider'

/**
 * OCR validation command — measures the extraction quality of the current
 * provider against a set of real invoices with ground truth.
 *
 * Usage:
 *
 *   # With the current provider (.env):
 *   node ace ocr:validate
 *
 *   # Force Mistral (real OCR):
 *   OCR_PROVIDER=mistral MISTRAL_API_KEY=... node ace ocr:validate
 *
 *   # With a custom directory:
 *   node ace ocr:validate --fixtures-dir=path/to/pdfs
 *
 *   # JSON report:
 *   node ace ocr:validate --out=ocr-report.json
 *
 * Fixture format:
 *   - `<name>.pdf` (or .png/.jpg/.jpeg): the invoice to OCR
 *   - `<name>.expected.json`: the ground truth, shaped as:
 *     {
 *       "expected": {
 *         "clientName": "...",
 *         "clientEmail": "..." | null,
 *         "numero": "...",
 *         "amountTtcCents": 124000,
 *         "issueDate": "2024-04-15",   // YYYY-MM-DD
 *         "dueDate": "2024-05-15"
 *       },
 *       "notes": "classic B2B invoice"  // free text, ignored by the command
 *     }
 *
 * Tolerances:
 *   - amountTtcCents       : exact (financial precision matters)
 *   - issueDate / dueDate  : same day (time part ignored)
 *   - numero               : exact (case-insensitive, trimmed)
 *   - clientName           : Levenshtein <= 3 OR word-level Jaccard >= 85 %
 *   - clientEmail          : exact (lowercased) or null
 *
 * Exit code is 1 when the fixtures directory is unusable, when at least one
 * fixture has a mismatching field, or when at least one extraction failed.
 *
 * To add an invoice to the bench:
 *   1. Drop `e2e/fixtures/invoices/my-invoice.pdf`
 *   2. Create `e2e/fixtures/invoices/my-invoice.expected.json` with the
 *      values that can be read on the invoice
 *   3. Re-run the command
 */

type ExpectedFields = {
  clientName: string
  clientEmail: string | null
  numero: string
  amountTtcCents: number
  issueDate: string // YYYY-MM-DD
  dueDate: string
}

type ExpectedFile = {
  expected: ExpectedFields
  notes?: string
}

type FieldComparison = {
  field: keyof ExpectedFields
  expected: string | number | null
  got: string | number | null
  match: boolean
  reason?: string
  confidence?: number
}

type FixtureResult = {
  filename: string
  durationMs: number
  fields: FieldComparison[]
  /** True when every field matches within its tolerance. */
  allMatch: boolean
}

/** Outcome of a successful provider call. */
type Extraction = {
  result: OcrResult
  /** Duration of the successful attempt only (retry waits excluded). */
  durationMs: number
}

type OcrProviderInstance = ReturnType<typeof getOcrProvider>

const SUPPORTED_EXT = new Set(['.pdf', '.png', '.jpg', '.jpeg'])

// The cwd when the command runs is apps/api (pnpm --filter or ace directly),
// so the default points to e2e/fixtures/invoices at the monorepo root.
const DEFAULT_FIXTURES_DIR = '../../e2e/fixtures/invoices'
const DEFAULT_DELAY_MS = 1500

// Rate-limit retry policy: linear back-off of 30 s, 60 s, 90 s
// (3 minutes of waiting at most before giving up on a fixture).
const MAX_RETRIES = 3
const RETRY_STEP_SEC = 30

// Tolerances applied to clientName.
const NAME_MAX_EDIT_DISTANCE = 3
const NAME_MIN_JACCARD = 0.85

export default class OcrValidate extends BaseCommand {
  static commandName = 'ocr:validate'
  static description =
    "Bench OCR : compare l'extraction du provider courant à des ground truth (e2e/fixtures/invoices/)"

  static options: CommandOptions = {
    startApp: true,
  }

  @flags.string({
    description: "Dossier des fixtures (default: e2e/fixtures/invoices)",
  })
  declare fixturesDir: string

  @flags.string({
    description: 'Path du rapport JSON en sortie (optionnel)',
  })
  declare out: string

  @flags.number({
    description:
      "Délai en ms entre deux appels provider (anti rate-limit). Default: 1500 ms pour Mistral free tier.",
  })
  declare delayMs: number

  /**
   * Runs every fixture through the OCR provider, prints a per-fixture and a
   * global report, optionally writes a JSON report, and sets the exit code.
   */
  async run() {
    const dir = this.fixturesDir ?? DEFAULT_FIXTURES_DIR
    const provider = getOcrProvider()
    this.logger.info(`→ Provider OCR : ${provider.constructor.name}`)
    this.logger.info(`→ Fixtures dir : ${dir}`)

    let entries: string[]
    try {
      entries = await readdir(dir)
    } catch {
      this.logger.error(
        `Dossier introuvable : ${dir}. Créer le dossier et y poser tes PDFs + .expected.json.`
      )
      this.exitCode = 1
      return
    }

    // Sorted so that console output and JSON reports are stable across runs
    // (readdir gives no ordering guarantee).
    const pdfs = entries.filter((e) => SUPPORTED_EXT.has(extname(e).toLowerCase())).sort()
    if (pdfs.length === 0) {
      this.logger.warning(
        `Aucun PDF/PNG/JPG dans ${dir}. Voir le format dans le header de cette commande.`
      )
      this.exitCode = 1
      return
    }

    this.logger.info(`→ ${pdfs.length} fixture(s) à valider\n`)

    const results: FixtureResult[] = []
    /** Fixtures for which the upload or the extraction failed. */
    const failures: string[] = []
    const driveDisk = drive.use()
    const delayMs = this.delayMs ?? DEFAULT_DELAY_MS
    let providerCalled = false

    for (const pdf of pdfs) {
      const expectedPath = join(dir, basename(pdf, extname(pdf)) + '.expected.json')
      const expected = await this.loadExpected(expectedPath, pdf)
      if (expected === null) continue

      // Throttle only between two real provider calls: skipped fixtures
      // do not hit the provider, so they do not need the delay.
      if (providerCalled && delayMs > 0) {
        await sleep(delayMs)
      }
      providerCalled = true

      // Temporary upload to the current disk (MinIO in dev) so that the
      // provider (Mistral) can download the file again, as in production.
      const storageKey = `ocr-validate/${randomUUID()}/${pdf}`
      let extraction: Extraction | null = null
      try {
        await driveDisk.put(storageKey, await readFile(join(dir, pdf)))
        extraction = await this.extractWithRetry(provider, storageKey, pdf)
      } catch (err) {
        // extractWithRetry never throws: this is a read/upload failure.
        this.logger.error(`✗ ${pdf} — upload throw : ${errorMessage(err)}`)
      } finally {
        // Best-effort cleanup: a leftover temp object must not fail the bench.
        await driveDisk.delete(storageKey).catch(() => {})
      }

      if (extraction === null) {
        failures.push(pdf)
        continue
      }

      const comparisons = compareFields(expected, extraction.result)
      const allMatch = comparisons.every((c) => c.match)
      results.push({
        filename: pdf,
        durationMs: extraction.durationMs,
        fields: comparisons,
        allMatch,
      })
      this.printFixtureResult(pdf, extraction.durationMs, comparisons, allMatch)
    }

    this.printSummary(results, failures.length)

    if (this.out) {
      await writeFile(
        this.out,
        JSON.stringify({ provider: provider.constructor.name, results, failures }, null, 2)
      )
      this.logger.info(`✔ Rapport JSON écrit : ${this.out}`)
    }

    // Exit 1 if a fixture failed — useful in CI. A fixture that could not be
    // extracted at all is a failure too, even though it has no comparison.
    if (failures.length > 0 || results.some((r) => !r.allMatch)) {
      this.exitCode = 1
    }
  }

  /**
   * Reads and validates a `.expected.json` ground-truth file.
   *
   * @returns the expected fields, or `null` (after logging a warning) when
   *   the file is missing, is not valid JSON, or lacks a complete `expected`
   *   object.
   */
  private async loadExpected(
    expectedPath: string,
    fixture: string
  ): Promise<ExpectedFields | null> {
    let raw: string
    try {
      raw = await readFile(expectedPath, 'utf-8')
    } catch {
      this.logger.warning(`⚠ Pas de ${expectedPath} — skip ${fixture}`)
      return null
    }

    let parsed: unknown
    try {
      parsed = JSON.parse(raw)
    } catch (err) {
      this.logger.warning(
        `⚠ ${expectedPath} invalide (JSON malformé : ${errorMessage(err)}) — skip ${fixture}`
      )
      return null
    }

    if (!isExpectedFile(parsed)) {
      this.logger.warning(
        `⚠ ${expectedPath} invalide (objet "expected" manquant ou incomplet) — skip ${fixture}`
      )
      return null
    }
    return parsed.expected
  }

  /**
   * Calls the provider, retrying on rate-limit errors (message containing
   * `429` or `Rate limit`) with a linear back-off. Useful on the Mistral free
   * tier where the limit is not linear.
   *
   * @returns the OCR result with the duration of the successful attempt, or
   *   `null` (after logging the error) when the extraction failed. Never throws.
   */
  private async extractWithRetry(
    provider: OcrProviderInstance,
    storageKey: string,
    filename: string
  ): Promise<Extraction | null> {
    for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
      const startedAt = Date.now()
      try {
        const result = await provider.extract({ storageKey, filename })
        return { result, durationMs: Date.now() - startedAt }
      } catch (err) {
        const msg = errorMessage(err)
        const isRateLimit = msg.includes('429') || msg.includes('Rate limit')
        if (!isRateLimit || attempt === MAX_RETRIES) {
          this.logger.error(`✗ ${filename} — extraction throw : ${msg}`)
          return null
        }
        const waitSec = RETRY_STEP_SEC * (attempt + 1)
        this.logger.warning(
          `⏸ ${filename} — 429 rate limit, retry dans ${waitSec}s (attempt ${attempt + 1}/${MAX_RETRIES})`
        )
        await sleep(waitSec * 1000)
      }
    }
    return null
  }

  /** Prints the status line of one fixture followed by one line per field. */
  private printFixtureResult(
    filename: string,
    durationMs: number,
    fields: FieldComparison[],
    allMatch: boolean
  ) {
    const status = allMatch ? '✔' : '✗'
    this.logger.info(`\n${status} ${filename} (${durationMs} ms)`)
    for (const f of fields) {
      const icon = f.match ? ' ✓' : ' ✗'
      const conf = typeof f.confidence === 'number' ? ` conf=${f.confidence.toFixed(2)}` : ''
      const reason = f.reason ? ` [${f.reason}]` : ''
      this.logger.info(
        `${icon} ${f.field.padEnd(16)} expected=${formatValue(f.expected)} got=${formatValue(f.got)}${conf}${reason}`
      )
    }
  }

  /**
   * Prints the global accuracy/latency summary and the per-field accuracy.
   *
   * @param results fixtures that were extracted and compared
   * @param extractionFailures number of fixtures whose extraction failed
   *   (they are not part of `results`)
   */
  private printSummary(results: FixtureResult[], extractionFailures = 0) {
    const total = results.length
    const fullPass = results.filter((r) => r.allMatch).length
    const totalFields = results.reduce((sum, r) => sum + r.fields.length, 0)
    const matchedFields = results.reduce(
      (sum, r) => sum + r.fields.filter((f) => f.match).length,
      0
    )
    const fieldAccuracy = totalFields > 0 ? (matchedFields / totalFields) * 100 : 0
    const docAccuracy = total > 0 ? (fullPass / total) * 100 : 0
    const avgLatency = results.reduce((sum, r) => sum + r.durationMs, 0) / Math.max(total, 1)

    this.logger.info('\n────────────────────────────────────────────────────────────')
    this.logger.info(`Total factures : ${total}`)
    this.logger.info(`Factures 100 % match : ${fullPass} (${docAccuracy.toFixed(1)} %)`)
    this.logger.info(
      `Champs match (total) : ${matchedFields}/${totalFields} (${fieldAccuracy.toFixed(1)} %)`
    )
    this.logger.info(`Latence moyenne : ${avgLatency.toFixed(0)} ms / facture`)
    if (extractionFailures > 0) {
      this.logger.info(`Extractions en échec : ${extractionFailures}`)
    }
    this.logger.info('────────────────────────────────────────────────────────────\n')

    // Per-field breakdown.
    const fieldStats = new Map<keyof ExpectedFields, { ok: number; total: number }>()
    for (const r of results) {
      for (const f of r.fields) {
        const s = fieldStats.get(f.field) ?? { ok: 0, total: 0 }
        s.total += 1
        if (f.match) s.ok += 1
        fieldStats.set(f.field, s)
      }
    }

    this.logger.info('Précision par champ :')
    for (const [field, s] of fieldStats) {
      const pct = (s.ok / s.total) * 100
      this.logger.info(` ${field.padEnd(18)} ${s.ok}/${s.total} (${pct.toFixed(1)} %)`)
    }
  }
}

// ---------------------------------------------------------------------------
// Utilities
// ---------------------------------------------------------------------------

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/** Runtime check that a parsed JSON value has the `ExpectedFile` shape. */
function isExpectedFile(value: unknown): value is ExpectedFile {
  if (typeof value !== 'object' || value === null) return false
  const expected = (value as { expected?: unknown }).expected
  if (typeof expected !== 'object' || expected === null) return false
  const e = expected as Record<string, unknown>
  return (
    typeof e.clientName === 'string' &&
    (e.clientEmail === null || typeof e.clientEmail === 'string') &&
    typeof e.numero === 'string' &&
    typeof e.amountTtcCents === 'number' &&
    typeof e.issueDate === 'string' &&
    typeof e.dueDate === 'string'
  )
}

// ---------------------------------------------------------------------------
// Comparison
// ---------------------------------------------------------------------------

/**
 * Compares each ground-truth field with the value extracted by the provider,
 * applying the per-field tolerance.
 *
 * @returns one comparison per expected field, in a fixed order.
 */
function compareFields(expected: ExpectedFields, got: OcrResult): FieldComparison[] {
  const nameMatches = matchesName(expected.clientName, got.fields.clientName.value)
  return [
    {
      field: 'clientName',
      expected: expected.clientName,
      got: got.fields.clientName.value,
      confidence: got.fields.clientName.confidence,
      match: nameMatches,
      reason: nameMatches ? undefined : 'Levenshtein > 3 and Jaccard < 85 %',
    },
    {
      field: 'clientEmail',
      expected: expected.clientEmail,
      got: got.fields.clientEmail.value,
      confidence: got.fields.clientEmail.confidence,
      match: matchesEmail(expected.clientEmail, got.fields.clientEmail.value),
    },
    {
      field: 'numero',
      expected: expected.numero,
      got: got.fields.numero.value,
      confidence: got.fields.numero.confidence,
      match: matchesString(expected.numero, got.fields.numero.value),
    },
    {
      field: 'amountTtcCents',
      expected: expected.amountTtcCents,
      got: got.fields.amountTtcCents.value,
      confidence: got.fields.amountTtcCents.confidence,
      match: expected.amountTtcCents === got.fields.amountTtcCents.value,
    },
    {
      field: 'issueDate',
      expected: expected.issueDate,
      got: got.fields.issueDate.value,
      confidence: got.fields.issueDate.confidence,
      match: matchesDate(expected.issueDate, got.fields.issueDate.value),
    },
    {
      field: 'dueDate',
      expected: expected.dueDate,
      got: got.fields.dueDate.value,
      confidence: got.fields.dueDate.confidence,
      match: matchesDate(expected.dueDate, got.fields.dueDate.value),
    },
  ]
}

/** Case-insensitive, trimmed equality. A missing value never matches. */
function matchesString(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null || b == null) return false
  return a.trim().toLowerCase() === b.trim().toLowerCase()
}

/** Both absent, or equal once trimmed and lowercased. */
function matchesEmail(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null && b == null) return true
  if (a == null || b == null) return false
  return a.trim().toLowerCase() === b.trim().toLowerCase()
}

/**
 * Same-day comparison: `a` is YYYY-MM-DD and `b` is an ISO 8601 string, so
 * only the first 10 characters are compared. A missing value never matches.
 */
function matchesDate(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null || b == null) return false
  return a.slice(0, 10) === b.slice(0, 10)
}

/**
 * Fuzzy client-name comparison: the names match when, once trimmed and
 * lowercased, they are equal, OR their edit distance is at most 3, OR the
 * Jaccard similarity of their word sets is at least 85 %.
 *
 * NOTE(review): an absolute edit distance of 3 is lenient for very short
 * names (3-4 characters) — confirm this is the intended tolerance.
 */
function matchesName(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null || b == null) return false
  const an = a.trim().toLowerCase()
  const bn = b.trim().toLowerCase()
  if (an === bn) return true
  // An empty extraction must never pass through the fuzzy tolerances.
  if (an.length === 0 || bn.length === 0) return false

  // The length difference is a lower bound of the edit distance, so the
  // quadratic computation is skipped when it cannot succeed.
  if (
    Math.abs(an.length - bn.length) <= NAME_MAX_EDIT_DISTANCE &&
    levenshtein(an, bn) <= NAME_MAX_EDIT_DISTANCE
  ) {
    return true
  }

  // Word-level Jaccard similarity; single-character tokens are ignored.
  const aTokens = new Set(an.split(/\s+/).filter((t) => t.length > 1))
  const bTokens = new Set(bn.split(/\s+/).filter((t) => t.length > 1))
  if (aTokens.size === 0 || bTokens.size === 0) return false
  const shared = [...aTokens].filter((t) => bTokens.has(t)).length
  const union = new Set([...aTokens, ...bTokens]).size
  return shared / union >= NAME_MIN_JACCARD
}

/** Levenshtein edit distance (insert/delete/substitute), two-row DP. */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0
  if (a.length === 0) return b.length
  if (b.length === 0) return a.length

  let previous = Array.from({ length: b.length + 1 }, (_, j) => j)
  for (let i = 1; i <= a.length; i++) {
    const current = [i]
    for (let j = 1; j <= b.length; j++) {
      const substitution = a[i - 1] === b[j - 1] ? 0 : 1
      current.push(
        Math.min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + substitution)
      )
    }
    previous = current
  }
  return previous[b.length]
}

/** Formats a compared value for console output (strings are JSON-quoted). */
function formatValue(v: string | number | null): string {
  if (v === null) return 'null'
  if (typeof v === 'number') return String(v)
  return JSON.stringify(v)
}