import json
import sys
from pathlib import Path

# Grade thresholds, highest first: the first threshold a score reaches wins.
_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]

# Residue penalties applied by _score, in reporting order:
# (key in report["residui"], points per occurrence, cap, label in the detail).
# NOTE(review): the "<br> inline" label was reconstructed -- the reviewed text
# had a raw line break where the tag should be; confirm against the real file.
_RESIDUE_PENALTIES = (
    ("backtick", 2, 20, "backtick"),
    ("dotleader", 5, 10, "dot-leader"),
    ("url", 5, 15, "url"),
    ("immagini", 5, 10, "immagini"),
    ("br_inline", 2, 15, "<br> inline"),
    ("simboli_encoding", 1, 10, "simboli encoding"),
    ("formule_inline", 1, 8, "formule inline"),
    ("footnote_markers", 1, 8, "footnote residui"),
    ("pua_markers", 2, 20, "caratteri PUA font Symbol"),
    ("formula_headers", 3, 15, "formula/esercizio come header"),
)


def _score(r: dict) -> tuple[int, list[str]]:
    """Rate a report from 0 to 100 (quality of clean.md for vectorization).

    Starts from 100 and subtracts penalties.

    Structure penalty (``structure["livello_struttura"]``):
        level 0 (absent)  -> -40
        level 1 (flat)    -> -15

    Residue penalties (counts read from ``residui``; they degrade retrieval):
        backtick                   -> -2 each (max -20)
        dot-leader                 -> -5 each (max -10)
        URL/watermark              -> -5 each (max -15)
        images                     -> -5 each (max -10)
        <br> inline                -> -2 each (max -15)
        encoding symbols           -> -1 each (max -10)
        inline formulas [N.M]      -> -1 each (max -8)
        footnote leftovers         -> -1 each (max -8)
        PUA characters             -> -2 each (max -20)
        formula/exercise as header -> -3 each (max -15)

    Anomaly penalty (count read from ``anomalie``):
        bare headers               -> -3 each (max -15)

    Args:
        r: Parsed report dictionary. Missing sections and missing counters
            are treated as empty / zero.

    Returns:
        ``(score, detail)`` where ``score`` is clamped to be at least 0 and
        ``detail`` lists one human-readable line per penalty applied.
    """
    score = 100
    detail = []

    structure = r.get("structure", {})
    anomalie = r.get("anomalie", {})
    residui = r.get("residui", {})

    livello = structure.get("livello_struttura", 0)
    if livello == 0:
        score -= 40
        detail.append("struttura assente −40")
    elif livello == 1:
        score -= 15
        detail.append("struttura piatta −15")

    for key, per_item, cap, label in _RESIDUE_PENALTIES:
        n = residui.get(key, 0)
        if n:
            p = min(cap, n * per_item)
            score -= p
            detail.append(f"{label} ×{n} −{p}")

    n_bare = anomalie.get("bare_headers", 0)
    if n_bare:
        p = min(15, n_bare * 3)
        score -= p
        detail.append(f"bare headers ×{n_bare} −{p}")

    return max(0, score), detail


def _grade(score: int) -> str:
    """Return the letter grade for *score* according to ``_GRADES``.

    Falls back to the lowest grade when the score is below every threshold
    instead of letting ``next`` raise ``StopIteration``.
    """
    return next(
        (g for threshold, g in _GRADES if score >= threshold),
        _GRADES[-1][1],
    )


def _load_report(path: Path) -> dict:
    """Load one report.json, guaranteeing that the result has a "stem" key.

    A missing file yields a placeholder flagged with ``_missing``. A report
    that lacks "stem" gets the name of its parent directory, the same value
    used for missing files.
    """
    stem = path.parent.name
    if not path.exists():
        return {"stem": stem, "_missing": True}
    report = json.loads(path.read_text(encoding="utf-8"))
    report.setdefault("stem", stem)
    return report


def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
    """Print a quality table for the reports under ``<project_root>/conversione``.

    Args:
        stems: Sub-directory names whose ``report.json`` must be shown. When
            empty, every ``conversione/*/report.json`` is used, sorted by path.
        project_root: Directory that contains the ``conversione`` folder.
        detail: When true, print the individual penalties under each row.

    Side effects:
        Writes the table, the summary and the column legend to stdout.
        Calls ``sys.exit(0)`` when no report path is available at all.
    """
    conv_dir = project_root / "conversione"
    paths = (
        [conv_dir / s / "report.json" for s in stems]
        if stems
        else sorted(conv_dir.glob("*/report.json"))
    )
    if not paths:
        print("Nessun report.json trovato in conversione/*/")
        sys.exit(0)

    rows = [_load_report(p) for p in paths]

    # Never narrower than the header word itself, so short stems stay aligned.
    col = max(len("stem"), *(len(r["stem"]) for r in rows)) + 2
    header = (
        f"{'stem':<{col}}"
        f"{'h2':>4}{'h3':>5} "
        f"{'strategia':<18}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}"
        f"{'med':>6}"
        f" {'voto':>4} grade"
    )
    sep = "─" * len(header)
    print(f"\n{header}\n{sep}")

    scores = []
    for r in rows:
        if r.get("_missing"):
            print(f"{r['stem']:<{col}} (report.json non trovato)")
            continue

        st = r.get("structure", {})
        an = r.get("anomalie", {})
        res = r.get("residui", {})
        dist = r.get("distribution", {})
        s, pen = _score(r)
        scores.append(s)

        print(
            f"{r['stem']:<{col}}"
            f"{st.get('n_h2', 0):>4}"
            f"{st.get('n_h3', 0):>5} "
            f"{st.get('strategia_chunking','?'):<18}"
            f"{an.get('bare_headers', 0):>5}"
            f"{an.get('short_sections', 0):>6}"
            f"{an.get('long_sections', 0):>7}"
            f"{res.get('backtick', 0):>5}"
            f"{res.get('br_inline', 0):>4}"
            f"{res.get('simboli_encoding', 0):>4}"
            f"{res.get('url', 0):>4}"
            f"{res.get('formula_headers', 0):>5}"
            f"{dist.get('mediana', 0):>6}"
            f" {s:>4} {_grade(s)}"
        )
        if detail and pen:
            for p in pen:
                print(f" {'':>{col}} ↳ {p}")

    print(sep)
    if scores:
        # Round once and reuse the value, so the printed average and its
        # grade can never disagree (e.g. 89.67 must not show "90/100 B").
        media = round(sum(scores) / len(scores))
        print(
            f"Documenti: {len(scores)} "
            f"Media: {media}/100 {_grade(media)} "
            f"(A≥90 B≥75 C≥60 D≥40 F<40)"
        )
    print(
        "\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch "
        "btk=backtick br=<br> inline enc=simboli encoding fhdr=formula-header med=mediana chars\n"
    )