rubis/apps/api/commands/ocr_validate.ts
ordinarthur 4d0cab8b33
All checks were successful
Build & Deploy API / build-and-deploy (push) Successful in 1m19s
feat(ocr): retry exponential backoff sur 429 dans ocr:validate
La free tier Mistral a un rate limit non-linéaire (parfois 4-5 req/min
acceptées, parfois 1 req/2min selon la charge). Un délai fixe entre
calls ne suffit pas — on retry max 3× avec backoff 30s, 60s, 90s.

Combiné avec --delay-ms (espacement nominal entre calls), ça permet
de tenir tout un bench même si le quota se serre en cours de route.

Bench réel observé sur 10 factures variées (templates Boulangerie,
Mercier moderne, Mercier ancien, retards 5j/30j/90j/180j) :

  - amountTtcCents : 10/10 (100 %)  ← précision financière parfaite
  - clientEmail    : 10/10 (100 %)
  - numero         :  9/10 (90 %)   ← 1 hallucination "FOUT"
  - issueDate      :  9/10 (90 %)   ← même facture, 1970-01-01 fallback
  - dueDate        :  9/10 (90 %)   ← idem
  - clientName     :  8/10 (80 %)   ← 2 fails : Mistral inclut contact
  - Latence moy.   : 9.5 s/facture (avec delay 7s)
  - 8/10 factures 100 % match (80 %)
  - 91.7 % accuracy globale champs

Insights actionnables :
  - amountTtcCents et clientEmail sont fiables → ok pour auto-validate
  - clientName : ajouter au prompt "ne pas inclure le contact (M./Mme)"
  - 1 facture sur 10 fait halluciner Mistral (FOUT + dates 1970) →
    afficher "à vérifier" dans la UI quand confidence < 0.5 sur dates

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 16:17:56 +02:00

373 lines
13 KiB
TypeScript

import { BaseCommand, flags } from '@adonisjs/core/ace'
import type { CommandOptions } from '@adonisjs/core/types/ace'
import { readFile, readdir } from 'node:fs/promises'
import { extname, join, basename } from 'node:path'
import drive from '@adonisjs/drive/services/main'
import { randomUUID } from 'node:crypto'
import { getOcrProvider } from '#services/ocr/index'
import type { OcrResult } from '#services/ocr/ocr_provider'
/**
* Commande de validation OCR — mesure la qualité d'extraction du
* provider courant sur un set de factures réelles avec ground truth.
*
* Usage :
*
* # Avec le provider courant (.env) :
* node ace ocr:validate
*
* # Forcer Mistral (vrai OCR) :
* OCR_PROVIDER=mistral MISTRAL_API_KEY=... node ace ocr:validate
*
* # Avec un dossier custom :
* node ace ocr:validate --fixtures-dir=path/to/pdfs
*
* # JSON report :
* node ace ocr:validate --out=ocr-report.json
*
* Format des fixtures :
* - `<name>.pdf` (ou .png/.jpg) : facture à OCRiser
* - `<name>.expected.json` : ground truth avec :
* {
* "expected": {
* "clientName": "...",
* "clientEmail": "..." | null,
* "numero": "...",
* "amountTtcCents": 124000,
* "issueDate": "2024-04-15", // YYYY-MM-DD
* "dueDate": "2024-05-15"
* },
* "notes": "facture B2B classique" // libre, ignoré par la commande
* }
*
* Tolérances :
* - amountTtcCents : exact (la précision financière compte)
* - issueDate / dueDate : jour exact (heure ignorée)
* - numero : exact (case-insensitive, trim)
* - clientName : exact (insensible à la casse, trim) OU similarité Jaccard sur les mots ≥ 85 %
* - clientEmail : exact (lowercased) ou null
*
* Pour ajouter une facture au bench :
* 1. Dépose `e2e/fixtures/invoices/ma-facture.pdf`
* 2. Crée `e2e/fixtures/invoices/ma-facture.expected.json` avec les
* valeurs lisibles sur la facture
* 3. Relance la commande
*/
/**
 * Ground-truth values for one invoice, as stored under the `expected` key
 * of a `<name>.expected.json` fixture file. Each property is compared with
 * the matching field of the OCR result by `compareFields`.
 */
type ExpectedFields = {
// Compared case-insensitively, with a word-level fuzzy fallback (see matchesName).
clientName: string
// May be null; null only matches a null extraction (see matchesEmail).
clientEmail: string | null
// Compared after trim + lowercase (see matchesString).
numero: string
// Compared with strict equality; presumably the tax-inclusive total in cents — confirm.
amountTtcCents: number
issueDate: string // YYYY-MM-DD
// Only the first 10 characters are compared (see matchesDate).
dueDate: string
}
/**
 * Shape the command assumes for a parsed `<name>.expected.json` file.
 * The JSON is cast to this type without runtime validation.
 */
type ExpectedFile = {
// Ground truth used for the comparison.
expected: ExpectedFields
// Free-form text; never read by the command.
notes?: string
}
/**
 * Outcome of comparing one expected field with the value extracted by the
 * OCR provider.
 */
type FieldComparison = {
// Name of the compared field.
field: keyof ExpectedFields
// Value from the ground-truth file.
expected: string | number | null
// Value returned by the provider for this field.
got: string | number | null
// True when `got` is accepted under the field's tolerance.
match: boolean
// Optional explanation of a mismatch (compareFields only sets it for clientName).
reason?: string
// Confidence copied from the provider result for this field.
confidence?: number
}
/**
 * Result of running the OCR provider on a single fixture file.
 */
type FixtureResult = {
// Fixture file name (relative to the fixtures directory).
filename: string
// Time measured around the provider call, in milliseconds.
durationMs: number
// One comparison entry per expected field.
fields: FieldComparison[]
/** True when every field matches within its tolerance. */
allMatch: boolean
}
// File extensions (lowercase, with leading dot) accepted as invoice fixtures;
// directory entries are filtered on their lowercased `extname` against this set.
const SUPPORTED_EXT = new Set(['.pdf', '.png', '.jpg', '.jpeg'])
/**
 * Ace command `ocr:validate`: runs the current OCR provider over every
 * fixture of a directory and compares the extracted fields with the ground
 * truth stored in the sibling `<name>.expected.json` file.
 *
 * The exit code is set to 1 when the fixtures directory cannot be read, when
 * it contains no supported file, when at least one fixture does not fully
 * match, or when at least one extraction failed (so a broken provider can
 * never produce a green CI run).
 */
export default class OcrValidate extends BaseCommand {
  static commandName = 'ocr:validate'
  static description =
    "Bench OCR : compare l'extraction du provider courant à des ground truth (e2e/fixtures/invoices/)"

  static options: CommandOptions = {
    startApp: true,
  }

  @flags.string({
    description: "Dossier des fixtures (default: e2e/fixtures/invoices)",
  })
  declare fixturesDir: string

  @flags.string({
    description: 'Path du rapport JSON en sortie (optionnel)',
  })
  declare out: string

  @flags.number({
    description:
      "Délai en ms entre deux appels provider (anti rate-limit). Default: 1500 ms pour Mistral free tier.",
  })
  declare delayMs: number

  /**
   * Entry point: lists the fixtures, runs the provider on each of them,
   * prints per-fixture and aggregated results, and optionally writes a JSON
   * report to the path given by `--out`.
   */
  async run() {
    // The cwd when the command runs is apps/api (pnpm --filter or direct
    // ace call), so the default points to e2e/fixtures/invoices at the
    // monorepo root.
    const dir = this.fixturesDir ?? '../../e2e/fixtures/invoices'
    const provider = getOcrProvider()
    this.logger.info(`→ Provider OCR : ${provider.constructor.name}`)
    this.logger.info(`→ Fixtures dir : ${dir}`)

    let entries: string[]
    try {
      entries = await readdir(dir)
    } catch {
      this.logger.error(
        `Dossier introuvable : ${dir}. Créer le dossier et y poser tes PDFs + .expected.json.`
      )
      this.exitCode = 1
      return
    }

    const pdfs = entries.filter((e) => SUPPORTED_EXT.has(extname(e).toLowerCase()))
    if (pdfs.length === 0) {
      this.logger.warning(
        `Aucun PDF/PNG/JPG dans ${dir}. Voir le format dans le header de cette commande.`
      )
      this.exitCode = 1
      return
    }
    this.logger.info(`${pdfs.length} fixture(s) à valider\n`)

    const results: FixtureResult[] = []
    // Fixtures whose extraction threw: they produce no comparison, but they
    // must still make the command fail.
    let failedExtractions = 0
    const driveDisk = drive.use()
    const delayMs = this.delayMs ?? 1500

    for (const [idx, pdf] of pdfs.entries()) {
      // Nominal spacing between two fixtures (anti rate-limit).
      if (idx > 0 && delayMs > 0) {
        await this.pauseFor(delayMs)
      }

      const pdfPath = join(dir, pdf)
      const expectedPath = join(dir, basename(pdf, extname(pdf)) + '.expected.json')

      let expected: ExpectedFields
      try {
        const json = JSON.parse(await readFile(expectedPath, 'utf-8')) as ExpectedFile
        expected = json.expected
      } catch {
        this.logger.warning(`⚠ Pas de ${expectedPath} — skip ${pdf}`)
        continue
      }
      // The JSON is not schema-validated: without this guard a file lacking
      // the `expected` key would crash later, in the field comparison.
      if (typeof expected !== 'object' || expected === null) {
        this.logger.warning(`⚠ ${expectedPath} sans clé "expected" — skip ${pdf}`)
        continue
      }

      const buffer = await readFile(pdfPath)

      // Temporary upload to the current disk (MinIO in dev) so that the
      // provider (Mistral) can download the file again, as in production.
      const storageKey = `ocr-validate/${randomUUID()}/${pdf}`
      await driveDisk.put(storageKey, buffer)

      let outcome: { result: OcrResult; durationMs: number } | null
      try {
        outcome = await this.extractWithRetry(provider, storageKey, pdf)
      } finally {
        // Best-effort cleanup, executed even if something unexpected throws.
        await driveDisk.delete(storageKey).catch(() => {})
      }

      if (!outcome || !outcome.result) {
        failedExtractions += 1
        continue
      }

      const comparisons = compareFields(expected, outcome.result)
      const allMatch = comparisons.every((c) => c.match)
      results.push({
        filename: pdf,
        durationMs: outcome.durationMs,
        fields: comparisons,
        allMatch,
      })
      this.printFixtureResult(pdf, outcome.durationMs, comparisons, allMatch)
    }

    this.printSummary(results)
    if (failedExtractions > 0) {
      this.logger.error(
        `${failedExtractions} extraction(s) en échec (hors statistiques ci-dessus)`
      )
    }

    if (this.out) {
      const { writeFile } = await import('node:fs/promises')
      await writeFile(
        this.out,
        JSON.stringify(
          { provider: provider.constructor.name, results, failedExtractions },
          null,
          2
        )
      )
      this.logger.info(`✔ Rapport JSON écrit : ${this.out}`)
    }

    // Exit 1 when a fixture mismatched or could not be extracted — useful in CI.
    if (failedExtractions > 0 || results.some((r) => !r.allMatch)) {
      this.exitCode = 1
    }
  }

  /**
   * Calls the provider, retrying on rate-limit errors (message containing
   * "429" or "Rate limit") with a linear backoff of 30 s, 60 s then 90 s
   * (at most 3 retries, i.e. 3 minutes of waiting before giving up).
   *
   * @returns the OCR result and the duration of the successful attempt only
   *   (backoff sleeps and failed attempts are excluded so that they do not
   *   distort the latency statistics), or `null` when the extraction failed.
   */
  private async extractWithRetry(
    provider: ReturnType<typeof getOcrProvider>,
    storageKey: string,
    filename: string
  ): Promise<{ result: OcrResult; durationMs: number } | null> {
    const maxRetries = 3
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      const startedAt = Date.now()
      try {
        const result = await provider.extract({ storageKey, filename })
        return { result, durationMs: Date.now() - startedAt }
      } catch (err) {
        // Anything can be thrown: do not assume an Error instance.
        const msg = err instanceof Error ? err.message : String(err)
        const isRateLimit = msg.includes('429') || msg.includes('Rate limit')
        if (!isRateLimit || attempt === maxRetries) {
          this.logger.error(`${filename} — extraction throw : ${msg}`)
          return null
        }
        const waitSec = 30 + attempt * 30
        this.logger.warning(
          `${filename} — 429 rate limit, retry dans ${waitSec}s (attempt ${attempt + 1}/${maxRetries})`
        )
        await this.pauseFor(waitSec * 1000)
      }
    }
    return null
  }

  /** Resolves after `ms` milliseconds. */
  private pauseFor(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }

  /**
   * Logs the outcome of one fixture: a status line followed by one line per
   * compared field (expected value, extracted value, confidence, reason).
   */
  private printFixtureResult(
    filename: string,
    durationMs: number,
    fields: FieldComparison[],
    allMatch: boolean
  ) {
    const status = allMatch ? '✔' : '✗'
    this.logger.info(`\n${status} ${filename} (${durationMs} ms)`)
    for (const f of fields) {
      const icon = f.match ? ' ✓' : ' ✗'
      const conf = f.confidence !== undefined ? ` conf=${f.confidence.toFixed(2)}` : ''
      const reason = f.reason ? ` [${f.reason}]` : ''
      this.logger.info(
        `${icon} ${f.field.padEnd(16)} expected=${formatValue(f.expected)} got=${formatValue(
          f.got
        )}${conf}${reason}`
      )
    }
  }

  /**
   * Logs the aggregated statistics: number of invoices, fully matching
   * invoices, matched fields, average latency, then the accuracy per field.
   */
  private printSummary(results: FixtureResult[]) {
    const total = results.length
    const fullPass = results.filter((r) => r.allMatch).length
    const totalFields = results.reduce((sum, r) => sum + r.fields.length, 0)
    const matchedFields = results.reduce(
      (sum, r) => sum + r.fields.filter((f) => f.match).length,
      0
    )
    // Guard every ratio against an empty result set.
    const fieldAccuracy = totalFields > 0 ? (matchedFields / totalFields) * 100 : 0
    const docAccuracy = total > 0 ? (fullPass / total) * 100 : 0
    const avgLatency =
      results.reduce((sum, r) => sum + r.durationMs, 0) / Math.max(total, 1)

    this.logger.info('\n────────────────────────────────────────────────────────────')
    this.logger.info(`Total factures : ${total}`)
    this.logger.info(`Factures 100 % match : ${fullPass} (${docAccuracy.toFixed(1)} %)`)
    this.logger.info(
      `Champs match (total) : ${matchedFields}/${totalFields} (${fieldAccuracy.toFixed(1)} %)`
    )
    this.logger.info(`Latence moyenne : ${avgLatency.toFixed(0)} ms / facture`)
    this.logger.info('────────────────────────────────────────────────────────────\n')

    // Per-field breakdown.
    const fieldStats = new Map<string, { ok: number; total: number }>()
    for (const r of results) {
      for (const f of r.fields) {
        const s = fieldStats.get(f.field) ?? { ok: 0, total: 0 }
        s.total += 1
        if (f.match) s.ok += 1
        fieldStats.set(f.field, s)
      }
    }
    this.logger.info('Précision par champ :')
    for (const [field, s] of fieldStats) {
      const pct = (s.ok / s.total) * 100
      this.logger.info(` ${field.padEnd(18)} ${s.ok}/${s.total} (${pct.toFixed(1)} %)`)
    }
  }
}
// ---------------------------------------------------------------------------
// Comparison
// ---------------------------------------------------------------------------
/**
 * Builds the list of per-field comparisons between the ground truth and the
 * provider output, in a fixed order: clientName, clientEmail, numero,
 * amountTtcCents, issueDate, dueDate.
 *
 * Each entry carries the expected value, the extracted value, the provider
 * confidence and the match verdict under the field's tolerance. Only the
 * clientName entry carries a `reason`, filled when the name does not match.
 */
function compareFields(expected: ExpectedFields, got: OcrResult): FieldComparison[] {
  const { clientName, clientEmail, numero, amountTtcCents, issueDate, dueDate } = got.fields

  // Evaluated once and reused for both the verdict and the reason.
  const nameOk = matchesName(expected.clientName, clientName.value)

  const rows: FieldComparison[] = []

  rows.push({
    field: 'clientName',
    expected: expected.clientName,
    got: clientName.value,
    confidence: clientName.confidence,
    match: nameOk,
    reason: nameOk ? undefined : 'fuzzy similarity < 85 %',
  })

  rows.push({
    field: 'clientEmail',
    expected: expected.clientEmail,
    got: clientEmail.value,
    confidence: clientEmail.confidence,
    match: matchesEmail(expected.clientEmail, clientEmail.value),
  })

  rows.push({
    field: 'numero',
    expected: expected.numero,
    got: numero.value,
    confidence: numero.confidence,
    match: matchesString(expected.numero, numero.value),
  })

  rows.push({
    field: 'amountTtcCents',
    expected: expected.amountTtcCents,
    got: amountTtcCents.value,
    confidence: amountTtcCents.confidence,
    // Amounts tolerate no deviation at all.
    match: expected.amountTtcCents === amountTtcCents.value,
  })

  rows.push({
    field: 'issueDate',
    expected: expected.issueDate,
    got: issueDate.value,
    confidence: issueDate.confidence,
    match: matchesDate(expected.issueDate, issueDate.value),
  })

  rows.push({
    field: 'dueDate',
    expected: expected.dueDate,
    got: dueDate.value,
    confidence: dueDate.confidence,
    match: matchesDate(expected.dueDate, dueDate.value),
  })

  return rows
}
/**
 * Tells whether two strings are equal once surrounding whitespace is removed
 * and case is ignored.
 */
function matchesString(a: string, b: string): boolean {
  const [left, right] = [a, b].map((text) => text.trim().toLowerCase())
  return left === right
}
/**
 * Compares two optional e-mail addresses: two nulls match, a null never
 * matches a string, and two strings match when equal after trim + lowercase.
 */
function matchesEmail(a: string | null, b: string | null): boolean {
  // As soon as one side is null, the only possible match is null on both sides.
  if (a === null || b === null) {
    return a === b
  }
  const left = a.trim().toLowerCase()
  const right = b.trim().toLowerCase()
  return left === right
}
/**
 * Compares two dates at day precision by looking only at their first ten
 * characters (the `YYYY-MM-DD` prefix); whatever follows — for instance the
 * time part of an ISO 8601 string — is ignored.
 */
function matchesDate(a: string, b: string): boolean {
  const dayPrefixLength = 10 // length of 'YYYY-MM-DD'
  return a.slice(0, dayPrefixLength) === b.slice(0, dayPrefixLength)
}
/**
 * Fuzzy comparison of two client names.
 *
 * The names match when they are equal after trim + lowercase, or when the
 * Jaccard similarity of their word sets (whitespace-separated words longer
 * than one character) is at least 0.85. If either word set is empty the
 * names do not match, unless they were already equal.
 */
function matchesName(a: string, b: string): boolean {
  const normalize = (raw: string): string => raw.trim().toLowerCase()
  const wordsOf = (text: string): Set<string> =>
    new Set(text.split(/\s+/).filter((word) => word.length > 1))

  const left = normalize(a)
  const right = normalize(b)
  if (left === right) {
    return true
  }

  const leftWords = wordsOf(left)
  const rightWords = wordsOf(right)
  if (leftWords.size === 0 || rightWords.size === 0) {
    return false
  }

  // |A ∩ B| by direct counting, |A ∪ B| by inclusion–exclusion.
  let shared = 0
  for (const word of leftWords) {
    if (rightWords.has(word)) shared += 1
  }
  const combined = leftWords.size + rightWords.size - shared

  return shared / combined >= 0.85
}
/**
 * Renders a field value for the console report: `null` becomes the word
 * "null", numbers are printed as is, and strings are JSON-quoted so that
 * whitespace and special characters stay visible.
 */
function formatValue(v: string | number | null): string {
  if (v === null) {
    return 'null'
  }
  return typeof v === 'number' ? `${v}` : JSON.stringify(v)
}