All checks were successful
Build & Deploy API / build-and-deploy (push) Successful in 1m21s
Améliorations sur la commande de bench OCR validée avec 5 factures
réelles via Mistral (100 % accuracy obtenue sur l'échantillon test) :
- Option `--delay-ms` (default 1500 ms) entre 2 appels provider pour
éviter le rate limit Mistral (1300 free tier ≈ 1 req/s). Permet de
benchmark les 27 factures sans HTTP 429.
- Script `e2e/fixtures/invoices/generate-expected.mjs` qui parse les
PDFs via `pdftotext -layout` (poppler-utils) et génère
automatiquement les <name>.expected.json :
• Numéro F2026-XXXX
• Dates DD/MM/YYYY ou format long ("21 avril 2026")
• Montant TTC en cents (gère séparateur milliers "2 775,02")
• clientName en gérant 3 templates :
- "DOIT : <Nom>"
- "Facturé à :" en colonne droite
- "ADRESSÉE À ... ÉCHÉANCE" côte à côte
Re-générable, idempotent (skip si .expected.json existe déjà).
Le .gitignore du dossier continue d'exclure `*` pour ne pas committer les
PDFs (cohérent avec assets/test-invoices/, déjà ignoré à la racine), mais
autorise le script `generate-expected.mjs` (reproductible, sans secret).
Workflow utilisateur :
1. Pose tes PDFs dans e2e/fixtures/invoices/
2. `node generate-expected.mjs` génère les ground truth en lot
3. Vérifie/corrige à la main si besoin (le parser n'est pas fiable à 100 %
sur tous les templates exotiques)
4. `OCR_PROVIDER=mistral pnpm ocr:validate` lance le bench réel
Résultat baseline observé sur 5 factures Mistral en mode réel :
- clientName 5/5 (100 %)
- clientEmail 5/5 (100 %)
- numero 5/5 (100 %)
- amountTtcCents 5/5 (100 %)
- issueDate 5/5 (100 %)
- dueDate 5/5 (100 %)
- Latence moyenne : 3,1 s / facture
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
358 lines
12 KiB
TypeScript
358 lines
12 KiB
TypeScript
import { BaseCommand, flags } from '@adonisjs/core/ace'
|
|
import type { CommandOptions } from '@adonisjs/core/types/ace'
|
|
import { readFile, readdir } from 'node:fs/promises'
|
|
import { extname, join, basename } from 'node:path'
|
|
import drive from '@adonisjs/drive/services/main'
|
|
import { randomUUID } from 'node:crypto'
|
|
|
|
import { getOcrProvider } from '#services/ocr/index'
|
|
import type { OcrResult } from '#services/ocr/ocr_provider'
|
|
|
|
/**
|
|
* Commande de validation OCR — mesure la qualité d'extraction du
|
|
* provider courant sur un set de factures réelles avec ground truth.
|
|
*
|
|
* Usage :
|
|
*
|
|
* # Avec le provider courant (.env) :
|
|
* node ace ocr:validate
|
|
*
|
|
* # Forcer Mistral (vrai OCR) :
|
|
* OCR_PROVIDER=mistral MISTRAL_API_KEY=... node ace ocr:validate
|
|
*
|
|
* # Avec un dossier custom :
|
|
* node ace ocr:validate --fixtures-dir=path/to/pdfs
|
|
*
|
|
* # JSON report :
|
|
* node ace ocr:validate --out=ocr-report.json
|
|
*
|
|
* Format des fixtures :
|
|
* - `<name>.pdf` (ou .png/.jpg) : facture à OCRiser
|
|
* - `<name>.expected.json` : ground truth avec :
|
|
* {
|
|
* "expected": {
|
|
* "clientName": "...",
|
|
* "clientEmail": "..." | null,
|
|
* "numero": "...",
|
|
* "amountTtcCents": 124000,
|
|
* "issueDate": "2024-04-15", // YYYY-MM-DD
|
|
* "dueDate": "2024-05-15"
|
|
* },
|
|
* "notes": "facture B2B classique" // libre, ignoré par la commande
|
|
* }
|
|
*
|
|
* Tolérances :
|
|
* - amountTtcCents : exact (la précision financière compte)
|
|
* - issueDate / dueDate : jour exact (heure ignorée)
|
|
* - numero : exact (case-insensitive, trim)
|
|
* - clientName : Levenshtein ≤ 3 OR similarity Jaccard ≥ 85 %
|
|
* - clientEmail : exact (lowercased) ou null
|
|
*
|
|
* Pour ajouter une facture au bench :
|
|
* 1. Dépose `e2e/fixtures/invoices/ma-facture.pdf`
|
|
* 2. Crée `e2e/fixtures/invoices/ma-facture.expected.json` avec les
|
|
* valeurs lisibles sur la facture
|
|
* 3. Relance la commande
|
|
*/
|
|
|
|
/** Ground-truth values for one invoice, as stored under the `expected` key of a `.expected.json` file. */
interface ExpectedFields {
  /** Customer name as printed on the invoice. */
  clientName: string
  /** Customer e-mail, or `null` when the ground truth has none. */
  clientEmail: string | null
  /** Invoice number. */
  numero: string
  /** Total amount including tax, expressed in cents. */
  amountTtcCents: number
  /** Issue date, formatted as YYYY-MM-DD. */
  issueDate: string
  /** Due date (the file header example uses the same YYYY-MM-DD format). */
  dueDate: string
}
|
|
|
|
/** Shape of a `<name>.expected.json` ground-truth file. */
interface ExpectedFile {
  /** Values each extracted field is compared against. */
  expected: ExpectedFields
  /** Free-form remark; the command never reads it. */
  notes?: string
}
|
|
|
|
/** Outcome of comparing a single extracted field with its ground truth. */
interface FieldComparison {
  /** Name of the compared field. */
  field: keyof ExpectedFields
  /** Ground-truth value. */
  expected: string | number | null
  /** Value returned by the OCR provider. */
  got: string | number | null
  /** Whether `got` is accepted for `expected` under the field's tolerance. */
  match: boolean
  /** Optional explanation attached to a mismatch. */
  reason?: string
  /** Confidence reported by the provider for this field, when available. */
  confidence?: number
}
|
|
|
|
/** Bench result for one fixture file. */
interface FixtureResult {
  /** Fixture file name (not the full path). */
  filename: string
  /** Wall-clock time spent in the provider call, in milliseconds. */
  durationMs: number
  /** One comparison per validated field. */
  fields: FieldComparison[]
  /** True when every field matches within its tolerance. */
  allMatch: boolean
}
|
|
|
|
/** Lower-cased file extensions that are treated as invoice fixtures. */
const SUPPORTED_EXT = new Set<string>([
  '.pdf',
  '.png',
  '.jpg',
  '.jpeg',
])
|
|
|
|
/**
 * Ace command `ocr:validate`.
 *
 * Runs the configured OCR provider on every supported file of a fixtures
 * directory, compares the extracted fields with the sibling
 * `<name>.expected.json` ground truth, prints a per-fixture and a global
 * summary, and optionally writes a JSON report.
 *
 * The exit code is set to 1 when:
 *  - the fixtures directory cannot be read or contains no supported file;
 *  - at least one fixture has a mismatching field;
 *  - at least one fixture could not be evaluated (unreadable or invalid
 *    ground truth, upload or extraction error).
 *
 * A fixture without any `.expected.json` file is skipped with a warning and
 * does not affect the exit code.
 */
export default class OcrValidate extends BaseCommand {
  static commandName = 'ocr:validate'

  static description =
    "Bench OCR : compare l'extraction du provider courant à des ground truth (e2e/fixtures/invoices/)"

  static options: CommandOptions = {
    startApp: true,
  }

  @flags.string({
    description: "Dossier des fixtures (default: e2e/fixtures/invoices)",
  })
  declare fixturesDir: string

  @flags.string({
    description: 'Path du rapport JSON en sortie (optionnel)',
  })
  declare out: string

  @flags.number({
    description:
      "Délai en ms entre deux appels provider (anti rate-limit). Default: 1500 ms pour Mistral free tier.",
  })
  declare delayMs: number

  /**
   * Entry point of the command: discovers fixtures, benches each of them
   * sequentially, then prints the summary and sets the exit code.
   */
  async run() {
    // Per the original author's note, the command runs with apps/api as cwd,
    // so the default is relative to it and targets the monorepo root.
    const dir = this.fixturesDir ?? '../../e2e/fixtures/invoices'
    const provider = getOcrProvider()
    this.logger.info(`→ Provider OCR : ${provider.constructor.name}`)
    this.logger.info(`→ Fixtures dir : ${dir}`)

    let entries: string[]
    try {
      entries = await readdir(dir)
    } catch {
      this.logger.error(
        `Dossier introuvable : ${dir}. Créer le dossier et y poser tes PDFs + .expected.json.`
      )
      this.exitCode = 1
      return
    }

    // Sorted so that console output and JSON reports are stable between runs,
    // whatever order the file system lists the directory in.
    const pdfs = entries.filter((e) => SUPPORTED_EXT.has(extname(e).toLowerCase())).sort()
    if (pdfs.length === 0) {
      this.logger.warning(
        `Aucun PDF/PNG/JPG dans ${dir}. Voir le format dans le header de cette commande.`
      )
      this.exitCode = 1
      return
    }

    this.logger.info(`→ ${pdfs.length} fixture(s) à valider\n`)

    const results: FixtureResult[] = []
    // Fixtures that could not be evaluated at all (bad ground truth, upload
    // or extraction error). They must fail the run just like a mismatch.
    const errored: string[] = []
    const driveDisk = drive.use()
    const delayMs = this.delayMs ?? 1500
    let providerCalled = false

    for (const pdf of pdfs) {
      const pdfPath = join(dir, pdf)
      const expectedPath = join(dir, basename(pdf, extname(pdf)) + '.expected.json')

      let expected: ExpectedFields
      try {
        const parsed = JSON.parse(await readFile(expectedPath, 'utf-8')) as Partial<ExpectedFile> | null
        if (!parsed || typeof parsed.expected !== 'object' || parsed.expected === null) {
          throw new Error('clé "expected" absente ou invalide')
        }
        expected = parsed.expected
      } catch (err) {
        if ((err as { code?: string } | null)?.code === 'ENOENT') {
          // No ground truth for this file: deliberate, non-failing skip.
          this.logger.warning(`⚠ Pas de ${expectedPath} — skip ${pdf}`)
        } else {
          // The file exists but cannot be used: report the real cause.
          const message = err instanceof Error ? err.message : String(err)
          this.logger.error(`✗ ${expectedPath} — ground truth invalide : ${message}`)
          errored.push(pdf)
        }
        continue
      }

      // Throttle only between two fixtures that actually reach the provider;
      // skipped fixtures do not consume any rate-limit budget.
      if (providerCalled && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs))
      }

      // Temporary upload to the current disk so the provider fetches the
      // file from storage, as the original comment says it does in prod.
      const storageKey = `ocr-validate/${randomUUID()}/${pdf}`
      let ocrResult: OcrResult
      let durationMs: number
      try {
        const buffer = await readFile(pdfPath)
        await driveDisk.put(storageKey, buffer)

        providerCalled = true
        const t0 = Date.now()
        ocrResult = await provider.extract({ storageKey, filename: pdf })
        durationMs = Date.now() - t0
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err)
        this.logger.error(`✗ ${pdf} — extraction throw : ${message}`)
        errored.push(pdf)
        continue
      } finally {
        // Best-effort cleanup of the temporary object, on success and failure.
        await driveDisk.delete(storageKey).catch(() => {})
      }

      const comparisons = compareFields(expected, ocrResult)
      const allMatch = comparisons.every((c) => c.match)
      results.push({ filename: pdf, durationMs, fields: comparisons, allMatch })

      this.printFixtureResult(pdf, durationMs, comparisons, allMatch)
    }

    this.printSummary(results)

    if (errored.length > 0) {
      this.logger.error(`${errored.length} fixture(s) en erreur : ${errored.join(', ')}`)
    }

    if (this.out) {
      const { writeFile } = await import('node:fs/promises')
      await writeFile(
        this.out,
        JSON.stringify({ provider: provider.constructor.name, results, errored }, null, 2)
      )
      this.logger.info(`✔ Rapport JSON écrit : ${this.out}`)
    }

    // Exit 1 when a fixture mismatched or could not be evaluated (useful in CI).
    if (errored.length > 0 || results.some((r) => !r.allMatch)) {
      this.exitCode = 1
    }
  }

  /**
   * Logs the outcome of one fixture: a status line followed by one line per
   * compared field (expected value, extracted value, confidence, reason).
   */
  private printFixtureResult(
    filename: string,
    durationMs: number,
    fields: FieldComparison[],
    allMatch: boolean
  ) {
    const status = allMatch ? '✔' : '✗'
    this.logger.info(`\n${status} ${filename} (${durationMs} ms)`)
    for (const f of fields) {
      const icon = f.match ? ' ✓' : ' ✗'
      const conf = f.confidence !== undefined ? ` conf=${f.confidence.toFixed(2)}` : ''
      const reason = f.reason ? ` [${f.reason}]` : ''
      this.logger.info(
        `${icon} ${f.field.padEnd(16)} expected=${formatValue(f.expected)} got=${formatValue(
          f.got
        )}${conf}${reason}`
      )
    }
  }

  /**
   * Logs aggregate statistics over all evaluated fixtures: document-level
   * and field-level accuracy, mean provider latency, and per-field accuracy.
   */
  private printSummary(results: FixtureResult[]) {
    const total = results.length
    const fullPass = results.filter((r) => r.allMatch).length
    const totalFields = results.reduce((sum, r) => sum + r.fields.length, 0)
    const matchedFields = results.reduce(
      (sum, r) => sum + r.fields.filter((f) => f.match).length,
      0
    )
    // Guards keep the percentages at 0 (instead of NaN) on an empty result set.
    const fieldAccuracy = totalFields > 0 ? (matchedFields / totalFields) * 100 : 0
    const docAccuracy = total > 0 ? (fullPass / total) * 100 : 0
    const avgLatency =
      results.reduce((sum, r) => sum + r.durationMs, 0) / Math.max(total, 1)

    this.logger.info('\n────────────────────────────────────────────────────────────')
    this.logger.info(`Total factures : ${total}`)
    this.logger.info(`Factures 100 % match : ${fullPass} (${docAccuracy.toFixed(1)} %)`)
    this.logger.info(
      `Champs match (total) : ${matchedFields}/${totalFields} (${fieldAccuracy.toFixed(1)} %)`
    )
    this.logger.info(`Latence moyenne : ${avgLatency.toFixed(0)} ms / facture`)
    this.logger.info('────────────────────────────────────────────────────────────\n')

    // Per-field breakdown, in first-seen field order.
    const fieldStats = new Map<string, { ok: number; total: number }>()
    for (const r of results) {
      for (const f of r.fields) {
        const s = fieldStats.get(f.field) ?? { ok: 0, total: 0 }
        s.total += 1
        if (f.match) s.ok += 1
        fieldStats.set(f.field, s)
      }
    }
    this.logger.info('Précision par champ :')
    for (const [field, s] of fieldStats) {
      const pct = (s.ok / s.total) * 100
      this.logger.info(` ${field.padEnd(18)} ${s.ok}/${s.total} (${pct.toFixed(1)} %)`)
    }
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Comparison
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Builds the per-field comparison between a ground truth and an OCR result.
 *
 * @param expected Ground-truth values for the invoice.
 * @param got Result returned by the OCR provider.
 * @returns One entry per field, in the order clientName, clientEmail,
 *   numero, amountTtcCents, issueDate, dueDate.
 */
function compareFields(expected: ExpectedFields, got: OcrResult): FieldComparison[] {
  const extracted = got.fields

  // Evaluated once and reused for both `match` and `reason`.
  const nameMatches = matchesName(expected.clientName, extracted.clientName.value)

  const nameRow: FieldComparison = {
    field: 'clientName',
    expected: expected.clientName,
    got: extracted.clientName.value,
    confidence: extracted.clientName.confidence,
    match: nameMatches,
    reason: nameMatches ? undefined : 'fuzzy similarity < 85 %',
  }

  const emailRow: FieldComparison = {
    field: 'clientEmail',
    expected: expected.clientEmail,
    got: extracted.clientEmail.value,
    confidence: extracted.clientEmail.confidence,
    match: matchesEmail(expected.clientEmail, extracted.clientEmail.value),
  }

  const numeroRow: FieldComparison = {
    field: 'numero',
    expected: expected.numero,
    got: extracted.numero.value,
    confidence: extracted.numero.confidence,
    match: matchesString(expected.numero, extracted.numero.value),
  }

  // Amounts are compared strictly: no tolerance on the cent value.
  const amountRow: FieldComparison = {
    field: 'amountTtcCents',
    expected: expected.amountTtcCents,
    got: extracted.amountTtcCents.value,
    confidence: extracted.amountTtcCents.confidence,
    match: expected.amountTtcCents === extracted.amountTtcCents.value,
  }

  const issueRow: FieldComparison = {
    field: 'issueDate',
    expected: expected.issueDate,
    got: extracted.issueDate.value,
    confidence: extracted.issueDate.confidence,
    match: matchesDate(expected.issueDate, extracted.issueDate.value),
  }

  const dueRow: FieldComparison = {
    field: 'dueDate',
    expected: expected.dueDate,
    got: extracted.dueDate.value,
    confidence: extracted.dueDate.confidence,
    match: matchesDate(expected.dueDate, extracted.dueDate.value),
  }

  return [nameRow, emailRow, numeroRow, amountRow, issueRow, dueRow]
}
|
|
|
|
/**
 * Compares two strings, ignoring case and surrounding whitespace.
 *
 * @returns True when both strings are equal after trim + lower-casing.
 */
function matchesString(a: string, b: string): boolean {
  const [left, right] = [a, b].map((s) => s.trim().toLowerCase())
  return left === right
}
|
|
|
|
/**
 * Compares two optional e-mail addresses.
 *
 * @returns True when both are `null`, or when both are strings that are
 *   equal after trim + lower-casing; false when exactly one is `null`.
 */
function matchesEmail(a: string | null, b: string | null): boolean {
  if (a === null || b === null) {
    // A missing address only matches another missing address.
    return a === b
  }
  const normalize = (s: string) => s.trim().toLowerCase()
  return normalize(a) === normalize(b)
}
|
|
|
|
/**
 * Compares two dates at day granularity.
 *
 * Only the first ten characters of each string are compared, so a
 * `YYYY-MM-DD` value matches an ISO 8601 timestamp that starts with the
 * same day; any time part is ignored.
 */
function matchesDate(a: string, b: string): boolean {
  const DAY_LENGTH = 10 // length of "YYYY-MM-DD"
  return a.slice(0, DAY_LENGTH) === b.slice(0, DAY_LENGTH)
}
|
|
|
|
/**
 * Edit distance between two strings, capped at `max + 1`.
 *
 * Classic two-row dynamic programming over UTF-16 code units; stops as soon
 * as the distance is known to exceed `max`.
 *
 * @returns The Levenshtein distance when it is ≤ `max`, otherwise `max + 1`.
 */
function boundedLevenshtein(s: string, t: string, max: number): number {
  // The length difference is a lower bound of the distance.
  if (Math.abs(s.length - t.length) > max) return max + 1

  let prev: number[] = Array.from({ length: t.length + 1 }, (_, j) => j)
  for (let i = 1; i <= s.length; i++) {
    const curr: number[] = [i]
    let rowMin = i
    for (let j = 1; j <= t.length; j++) {
      const cost = s.charCodeAt(i - 1) === t.charCodeAt(j - 1) ? 0 : 1
      const value = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost)
      curr.push(value)
      if (value < rowMin) rowMin = value
    }
    // Row minima never decrease, so the final distance cannot get back under max.
    if (rowMin > max) return max + 1
    prev = curr
  }
  return Math.min(prev[t.length], max + 1)
}

/**
 * Fuzzy comparison of two customer names, after trim + lower-casing.
 *
 * Implements the tolerance documented in the file header: the names match
 * when they are equal, when their Levenshtein distance is ≤ 3 (OCR typos,
 * stray punctuation), or when the Jaccard similarity of their word sets is
 * ≥ 85 % (word order / spacing differences). One-character tokens are
 * ignored by the Jaccard step.
 *
 * Note: the edit-distance threshold is absolute, so very short names are
 * matched leniently.
 *
 * @param a Expected name.
 * @param b Extracted name.
 * @returns True when the names are considered the same customer.
 */
function matchesName(a: string, b: string): boolean {
  const an = a.trim().toLowerCase()
  const bn = b.trim().toLowerCase()
  if (an === bn) return true
  // An empty name never fuzzy-matches a non-empty one.
  if (an.length === 0 || bn.length === 0) return false

  const MAX_EDIT_DISTANCE = 3
  if (boundedLevenshtein(an, bn, MAX_EDIT_DISTANCE) <= MAX_EDIT_DISTANCE) return true

  // Jaccard similarity on word sets.
  const aTokens = new Set(an.split(/\s+/).filter((t) => t.length > 1))
  const bTokens = new Set(bn.split(/\s+/).filter((t) => t.length > 1))
  if (aTokens.size === 0 || bTokens.size === 0) return false
  const shared = [...aTokens].filter((t) => bTokens.has(t)).length
  const union = new Set([...aTokens, ...bTokens]).size
  return shared / union >= 0.85
}
|
|
|
|
/**
 * Renders a compared value for console output: numbers as plain decimal
 * text, `null` as the literal `null`, strings JSON-quoted.
 */
function formatValue(v: string | number | null): string {
  if (typeof v === 'number') {
    return String(v)
  }
  return v === null ? 'null' : JSON.stringify(v)
}
|