rubis/apps/api/commands/ocr_validate.ts
ordinarthur 2f96238efe
All checks were successful
Build & Deploy API / build-and-deploy (push) Successful in 1m21s
feat(ocr): throttle --delay-ms + script generate-expected pour ground truth
Améliorations sur la commande de bench OCR validée avec 5 factures
réelles via Mistral (100 % accuracy obtenue sur l'échantillon test) :

  - Option `--delay-ms` (default 1500 ms) entre 2 appels provider pour
    éviter le rate limit Mistral (1300 free tier ≈ 1 req/s). Permet de
    benchmark les 27 factures sans HTTP 429.
  - Script `e2e/fixtures/invoices/generate-expected.mjs` qui parse les
    PDFs via `pdftotext -layout` (poppler-utils) et génère
    automatiquement les <name>.expected.json :
      • Numéro F2026-XXXX
      • Dates DD/MM/YYYY ou format long ("21 avril 2026")
      • Montant TTC en cents (gère séparateur milliers "2 775,02")
      • clientName en gérant 3 templates :
          - "DOIT : <Nom>"
          - "Facturé à :" en colonne droite
          - "ADRESSÉE À ... ÉCHÉANCE" côte à côte
    Re-générable, idempotent (skip si .expected.json existe déjà).

Le .gitignore du dossier reste sur `*` exclude pour ne pas commit les
PDFs (cohérent avec assets/test-invoices/ déjà ignoré racine), mais
autorise le script `generate-expected.mjs` (reproductible, sans secret).

Workflow utilisateur :
  1. Pose tes PDFs dans e2e/fixtures/invoices/
  2. `node generate-expected.mjs` génère les ground truth en lot
  3. Vérifie/corrige à la main si besoin (parser pas 100 % parfait sur
     tous les templates exotiques)
  4. `OCR_PROVIDER=mistral pnpm ocr:validate` lance le bench réel

Résultat baseline observé sur 5 factures Mistral en mode réel :
  - clientName     5/5  (100 %)
  - clientEmail    5/5  (100 %)
  - numero         5/5  (100 %)
  - amountTtcCents 5/5  (100 %)
  - issueDate      5/5  (100 %)
  - dueDate        5/5  (100 %)
  - Latence moyenne : 3,1 s / facture

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 16:05:37 +02:00

358 lines
12 KiB
TypeScript

import { BaseCommand, flags } from '@adonisjs/core/ace'
import type { CommandOptions } from '@adonisjs/core/types/ace'
import { readFile, readdir } from 'node:fs/promises'
import { extname, join, basename } from 'node:path'
import drive from '@adonisjs/drive/services/main'
import { randomUUID } from 'node:crypto'
import { getOcrProvider } from '#services/ocr/index'
import type { OcrResult } from '#services/ocr/ocr_provider'
/**
* Commande de validation OCR — mesure la qualité d'extraction du
* provider courant sur un set de factures réelles avec ground truth.
*
* Usage :
*
* # Avec le provider courant (.env) :
* node ace ocr:validate
*
* # Forcer Mistral (vrai OCR) :
* OCR_PROVIDER=mistral MISTRAL_API_KEY=... node ace ocr:validate
*
* # Avec un dossier custom :
* node ace ocr:validate --fixtures-dir=path/to/pdfs
*
* # JSON report :
* node ace ocr:validate --out=ocr-report.json
*
* Format des fixtures :
* - `<name>.pdf` (ou .png/.jpg) : facture à OCRiser
* - `<name>.expected.json` : ground truth avec :
* {
* "expected": {
* "clientName": "...",
* "clientEmail": "..." | null,
* "numero": "...",
* "amountTtcCents": 124000,
* "issueDate": "2024-04-15", // YYYY-MM-DD
* "dueDate": "2024-05-15"
* },
* "notes": "facture B2B classique" // libre, ignoré par la commande
* }
*
* Tolérances :
* - amountTtcCents : exact (la précision financière compte)
* - issueDate / dueDate : jour exact (heure ignorée)
* - numero : exact (case-insensitive, trim)
* - clientName : exact (case-insensitive, trim) OU similarité Jaccard sur les mots ≥ 85 % (pas de Levenshtein)
* - clientEmail : exact (lowercased) ou null
*
* Pour ajouter une facture au bench :
* 1. Dépose `e2e/fixtures/invoices/ma-facture.pdf`
* 2. Crée `e2e/fixtures/invoices/ma-facture.expected.json` avec les
* valeurs lisibles sur la facture
* 3. Relance la commande
*/
/** Ground-truth values read from the `expected` key of a `<name>.expected.json` file. */
interface ExpectedFields {
  clientName: string
  clientEmail: string | null
  numero: string
  /** Total amount including tax, in cents. */
  amountTtcCents: number
  /** Formatted as YYYY-MM-DD. */
  issueDate: string
  dueDate: string
}

/** Shape of a `<name>.expected.json` file. */
interface ExpectedFile {
  expected: ExpectedFields
  /** Free-form text; not read by the command. */
  notes?: string
}

/** Outcome of comparing one extracted field with its ground truth. */
interface FieldComparison {
  field: keyof ExpectedFields
  expected: string | number | null
  got: string | number | null
  match: boolean
  /** Optional explanation attached to a mismatch. */
  reason?: string
  /** Confidence reported by the provider for this field, when available. */
  confidence?: number
}

/** Per-fixture bench result. */
interface FixtureResult {
  filename: string
  /** Wall-clock time spent in the provider call, in milliseconds. */
  durationMs: number
  fields: FieldComparison[]
  /** True when every field matches within its tolerance. */
  allMatch: boolean
}
/** Lower-case file extensions (leading dot included) accepted as OCR fixtures. */
const SUPPORTED_EXT = new Set<string>([
  '.pdf',
  '.png',
  '.jpg',
  '.jpeg',
])
/**
 * Ace command `ocr:validate`: runs the configured OCR provider over a folder
 * of invoice fixtures and compares each extraction with the matching
 * `<name>.expected.json` ground truth.
 *
 * The exit code is set to 1 when:
 * - the fixtures folder cannot be read or contains no supported file;
 * - at least one compared fixture has a mismatching field;
 * - at least one fixture could not be processed at all (invalid ground-truth
 *   file, upload error, or provider error).
 *
 * A fixture without any `.expected.json` is skipped with a warning and does
 * not affect the exit code.
 */
export default class OcrValidate extends BaseCommand {
  static commandName = 'ocr:validate'
  static description =
    "Bench OCR : compare l'extraction du provider courant à des ground truth (e2e/fixtures/invoices/)"

  static options: CommandOptions = {
    startApp: true,
  }

  /** Fixtures folder; `run` falls back to `../../e2e/fixtures/invoices` when omitted. */
  @flags.string({
    description: "Dossier des fixtures (default: e2e/fixtures/invoices)",
  })
  declare fixturesDir: string

  /** Optional path of the JSON report to write. */
  @flags.string({
    description: 'Path du rapport JSON en sortie (optionnel)',
  })
  declare out: string

  /** Pause between two provider calls, in milliseconds; `run` defaults it to 1500. */
  @flags.number({
    description:
      "Délai en ms entre deux appels provider (anti rate-limit). Default: 1500 ms pour Mistral free tier.",
  })
  declare delayMs: number

  /**
   * Command entry point: lists the fixtures, runs the provider on each one
   * that has a ground truth, prints per-fixture and aggregated results, and
   * optionally writes a JSON report.
   */
  async run() {
    // The default is relative to the working directory; the original author
    // notes the command runs from apps/api, so it targets the monorepo root.
    const dir = this.fixturesDir ?? '../../e2e/fixtures/invoices'
    const provider = getOcrProvider()
    this.logger.info(`→ Provider OCR : ${provider.constructor.name}`)
    this.logger.info(`→ Fixtures dir : ${dir}`)

    let entries: string[]
    try {
      entries = await readdir(dir)
    } catch {
      this.logger.error(
        `Dossier introuvable : ${dir}. Créer le dossier et y poser tes PDFs + .expected.json.`
      )
      this.exitCode = 1
      return
    }

    const pdfs = entries.filter((e) => SUPPORTED_EXT.has(extname(e).toLowerCase()))
    if (pdfs.length === 0) {
      this.logger.warning(
        `Aucun PDF/PNG/JPG dans ${dir}. Voir le format dans le header de cette commande.`
      )
      this.exitCode = 1
      return
    }
    this.logger.info(`${pdfs.length} fixture(s) à valider\n`)

    const results: FixtureResult[] = []
    const driveDisk = drive.use()
    const delayMs = this.delayMs ?? 1500
    // Fixtures that have a ground-truth file but could not be benchmarked.
    let failures = 0
    // The throttle only matters once the provider has actually been called.
    let providerCalled = false

    for (const pdf of pdfs) {
      const pdfPath = join(dir, pdf)
      const expectedPath = join(dir, basename(pdf, extname(pdf)) + '.expected.json')

      // A missing ground truth is a deliberate, non-fatal skip.
      let rawExpected: string
      try {
        rawExpected = await readFile(expectedPath, 'utf-8')
      } catch {
        this.logger.warning(`⚠ Pas de ${expectedPath} — skip ${pdf}`)
        continue
      }

      // A ground truth that exists but is unusable is an error, not a skip:
      // otherwise a corrupted file silently shrinks the bench.
      let expected: ExpectedFields
      try {
        const parsed: unknown = JSON.parse(rawExpected)
        const candidate = (parsed as Partial<ExpectedFile> | null)?.expected
        if (typeof candidate !== 'object' || candidate === null) {
          throw new Error('clé "expected" absente ou invalide')
        }
        expected = candidate
      } catch (err) {
        this.logger.error(`${expectedPath} invalide : ${this.describeError(err)} — skip ${pdf}`)
        failures += 1
        continue
      }

      if (providerCalled && delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs))
      }

      // Temporary upload to the current disk so the provider can fetch the
      // file through its storage key.
      const storageKey = `ocr-validate/${randomUUID()}/${pdf}`
      try {
        await driveDisk.put(storageKey, await readFile(pdfPath))
      } catch (err) {
        this.logger.error(`${pdf} — upload impossible : ${this.describeError(err)}`)
        failures += 1
        continue
      }

      let ocrResult: OcrResult
      let durationMs: number
      providerCalled = true
      const startedAt = Date.now()
      try {
        ocrResult = await provider.extract({ storageKey, filename: pdf })
        durationMs = Date.now() - startedAt
      } catch (err) {
        this.logger.error(`${pdf} — extraction throw : ${this.describeError(err)}`)
        failures += 1
        continue
      } finally {
        // Best-effort cleanup of the temporary object, on success and failure.
        await driveDisk.delete(storageKey).catch(() => {})
      }

      const comparisons = compareFields(expected, ocrResult)
      const allMatch = comparisons.every((c) => c.match)
      results.push({ filename: pdf, durationMs, fields: comparisons, allMatch })
      this.printFixtureResult(pdf, durationMs, comparisons, allMatch)
    }

    this.printSummary(results)
    if (failures > 0) {
      this.logger.error(
        `${failures} fixture(s) non traitée(s) (ground truth invalide, upload ou extraction en erreur)`
      )
    }

    if (this.out) {
      const { writeFile } = await import('node:fs/promises')
      await writeFile(this.out, JSON.stringify({ provider: provider.constructor.name, results }, null, 2))
      this.logger.info(`✔ Rapport JSON écrit : ${this.out}`)
    }

    // Non-zero exit for CI when a fixture mismatched or could not be processed.
    if (failures > 0 || results.some((r) => !r.allMatch)) {
      this.exitCode = 1
    }
  }

  /** Returns a printable message for any thrown value. */
  private describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err)
  }

  /**
   * Logs the status line of one fixture followed by one line per compared field.
   *
   * @param filename Fixture file name.
   * @param durationMs Provider call duration in milliseconds.
   * @param fields Field-by-field comparisons.
   * @param allMatch Whether every field matched.
   */
  private printFixtureResult(
    filename: string,
    durationMs: number,
    fields: FieldComparison[],
    allMatch: boolean
  ) {
    const status = allMatch ? '✔' : '✗'
    this.logger.info(`\n${status} ${filename} (${durationMs} ms)`)
    for (const f of fields) {
      const icon = f.match ? ' ✓' : ' ✗'
      const conf = f.confidence !== undefined ? ` conf=${f.confidence.toFixed(2)}` : ''
      const reason = f.reason ? ` [${f.reason}]` : ''
      this.logger.info(
        `${icon} ${f.field.padEnd(16)} expected=${formatValue(f.expected)} got=${formatValue(
          f.got
        )}${conf}${reason}`
      )
    }
  }

  /**
   * Logs aggregated metrics: document-level and field-level accuracy, average
   * latency, then accuracy per field.
   *
   * @param results Results of the fixtures that were actually compared.
   */
  private printSummary(results: FixtureResult[]) {
    const total = results.length
    const fullPass = results.filter((r) => r.allMatch).length
    const totalFields = results.reduce((sum, r) => sum + r.fields.length, 0)
    const matchedFields = results.reduce(
      (sum, r) => sum + r.fields.filter((f) => f.match).length,
      0
    )
    // Guards keep the percentages at 0 instead of NaN when nothing was compared.
    const fieldAccuracy = totalFields > 0 ? (matchedFields / totalFields) * 100 : 0
    const docAccuracy = total > 0 ? (fullPass / total) * 100 : 0
    const avgLatency =
      results.reduce((sum, r) => sum + r.durationMs, 0) / Math.max(total, 1)

    this.logger.info('\n────────────────────────────────────────────────────────────')
    this.logger.info(`Total factures : ${total}`)
    this.logger.info(`Factures 100 % match : ${fullPass} (${docAccuracy.toFixed(1)} %)`)
    this.logger.info(
      `Champs match (total) : ${matchedFields}/${totalFields} (${fieldAccuracy.toFixed(1)} %)`
    )
    this.logger.info(`Latence moyenne : ${avgLatency.toFixed(0)} ms / facture`)
    this.logger.info('────────────────────────────────────────────────────────────\n')

    // Per-field breakdown.
    const fieldStats = new Map<string, { ok: number; total: number }>()
    for (const r of results) {
      for (const f of r.fields) {
        const s = fieldStats.get(f.field) ?? { ok: 0, total: 0 }
        s.total += 1
        if (f.match) s.ok += 1
        fieldStats.set(f.field, s)
      }
    }
    this.logger.info('Précision par champ :')
    for (const [field, s] of fieldStats) {
      const pct = (s.ok / s.total) * 100
      this.logger.info(` ${field.padEnd(18)} ${s.ok}/${s.total} (${pct.toFixed(1)} %)`)
    }
  }
}
// ---------------------------------------------------------------------------
// Comparison
// ---------------------------------------------------------------------------
/**
 * Builds the field-by-field comparison between a ground truth and an OCR
 * result, in a fixed order: clientName, clientEmail, numero, amountTtcCents,
 * issueDate, dueDate.
 *
 * Only the clientName entry carries a `reason`, set when the name does not match.
 *
 * @param expected Ground-truth values.
 * @param got Provider output; each field exposes `value` and `confidence`.
 * @returns One comparison per ground-truth field.
 */
function compareFields(expected: ExpectedFields, got: OcrResult): FieldComparison[] {
  const extracted = got.fields
  // The name matcher is pure, so a single evaluation feeds both `match` and `reason`.
  const nameMatches = matchesName(expected.clientName, extracted.clientName.value)

  const clientName: FieldComparison = {
    field: 'clientName',
    expected: expected.clientName,
    got: extracted.clientName.value,
    confidence: extracted.clientName.confidence,
    match: nameMatches,
    reason: nameMatches ? undefined : 'fuzzy similarity < 85 %',
  }
  const clientEmail: FieldComparison = {
    field: 'clientEmail',
    expected: expected.clientEmail,
    got: extracted.clientEmail.value,
    confidence: extracted.clientEmail.confidence,
    match: matchesEmail(expected.clientEmail, extracted.clientEmail.value),
  }
  const numero: FieldComparison = {
    field: 'numero',
    expected: expected.numero,
    got: extracted.numero.value,
    confidence: extracted.numero.confidence,
    match: matchesString(expected.numero, extracted.numero.value),
  }
  const amountTtcCents: FieldComparison = {
    field: 'amountTtcCents',
    expected: expected.amountTtcCents,
    got: extracted.amountTtcCents.value,
    confidence: extracted.amountTtcCents.confidence,
    // Amounts are compared strictly: no tolerance on money.
    match: expected.amountTtcCents === extracted.amountTtcCents.value,
  }
  const issueDate: FieldComparison = {
    field: 'issueDate',
    expected: expected.issueDate,
    got: extracted.issueDate.value,
    confidence: extracted.issueDate.confidence,
    match: matchesDate(expected.issueDate, extracted.issueDate.value),
  }
  const dueDate: FieldComparison = {
    field: 'dueDate',
    expected: expected.dueDate,
    got: extracted.dueDate.value,
    confidence: extracted.dueDate.confidence,
    match: matchesDate(expected.dueDate, extracted.dueDate.value),
  }

  return [clientName, clientEmail, numero, amountTtcCents, issueDate, dueDate]
}
/**
 * Compares two strings, ignoring surrounding whitespace and letter case.
 *
 * @param a First string.
 * @param b Second string.
 * @returns True when both strings are equal once trimmed and lower-cased.
 */
function matchesString(a: string, b: string): boolean {
  const left = a.trim().toLowerCase()
  const right = b.trim().toLowerCase()
  return left === right
}
/**
 * Compares two optional e-mail addresses.
 *
 * @param a First address, or null when absent.
 * @param b Second address, or null when absent.
 * @returns True when both are null, or when both are strings that are equal
 *   once trimmed and lower-cased; false when only one of them is null.
 */
function matchesEmail(a: string | null, b: string | null): boolean {
  if (a === null || b === null) {
    // With at least one side null, they match only if both are null.
    return a === b
  }
  return a.trim().toLowerCase() === b.trim().toLowerCase()
}
/**
 * Compares two date strings on their first 10 characters (the `YYYY-MM-DD`
 * part), so any time component that follows is ignored.
 *
 * NOTE(review): no time-zone conversion is done; this presumably expects both
 * values to express the day in the same zone — confirm against the provider output.
 *
 * @param a Ground-truth date, formatted as YYYY-MM-DD.
 * @param b Extracted date, expected to start with YYYY-MM-DD.
 * @returns True when both day prefixes are identical.
 */
function matchesDate(a: string, b: string): boolean {
  const dayOf = (value: string): string => value.slice(0, 10)
  return dayOf(a) === dayOf(b)
}
/**
 * Fuzzy comparison of two client names.
 *
 * Both names are Unicode-normalized (NFC), trimmed and lower-cased. They match
 * when the normalized strings are equal, or when the Jaccard similarity of
 * their word sets is at least 0.85. Words of a single character are ignored.
 *
 * NFC normalization makes canonically equivalent spellings compare equal: a
 * precomposed "é" and "e" followed by a combining acute accent are the same
 * name, but differ code unit by code unit.
 *
 * Note that the 0.85 threshold is strict: one extra word such as a legal
 * suffix is only tolerated on names of six or more shared words.
 *
 * @param a Ground-truth name.
 * @param b Extracted name.
 * @returns True when the names are considered the same client.
 */
function matchesName(a: string, b: string): boolean {
  const canonical = (raw: string): string => raw.normalize('NFC').trim().toLowerCase()
  const an = canonical(a)
  const bn = canonical(b)
  if (an === bn) return true

  // Word sets, split on any whitespace run so spacing differences do not matter.
  const toTokens = (text: string): Set<string> =>
    new Set(text.split(/\s+/).filter((t) => t.length > 1))
  const aTokens = toTokens(an)
  const bTokens = toTokens(bn)
  // Without any usable word on one side there is nothing to compare.
  if (aTokens.size === 0 || bTokens.size === 0) return false

  const shared = [...aTokens].filter((t) => bTokens.has(t)).length
  const unionSize = new Set([...aTokens, ...bTokens]).size
  return shared / unionSize >= 0.85
}
/**
 * Renders a compared value for the console output.
 *
 * @param v Value to display.
 * @returns `'null'` for null, the decimal form for a number, and the
 *   JSON-quoted form for a string.
 */
function formatValue(v: string | number | null): string {
  if (typeof v === 'number') {
    return String(v)
  }
  return v === null ? 'null' : JSON.stringify(v)
}