feat(validate): support single-file flags and explicit markdown score output

2026-04-16 16:05:03 +02:00
parent 5b6940e479
commit bcf2e688aa
1 changed file with 160 additions and 39 deletions
@@ -2,56 +2,159 @@
 """
 conversione/validate.py — Validazione batch di tutti gli stem convertiti

-Legge i report.json prodotti da pipeline.py e stampa una tabella di stato
-per ogni documento, evidenziando anomalie e problemi residui.
+Legge i report.json prodotti da pipeline.py, stampa una tabella di stato
+e assegna un voto (0-100) a ogni documento per misurare la bontà del
+Markdown prodotto.

-Stato per stem:
-  ✅  nessuna anomalia critica
-  ⚠️  anomalie presenti ma documento processabile
-  ❌  struttura non rilevata o problemi gravi
+Voto:
+  90-100  A  — ottimo, pronto per il chunker
+  75-89   B  — buono, qualche sezione lunga ma accettabile
+  60-74   C  — accettabile, anomalie minori da verificare
+  40-59   D  — da rivedere, problemi strutturali o residui evidenti
+   0-39   F  — da riprocessare, struttura assente o testo corrotto

 Uso:
    python conversione/validate.py              # tutti gli stem
    python conversione/validate.py analisi1     # stem specifico
+    python conversione/validate.py --stem analisi1
+    python conversione/validate.py --analisi1   # compatibilità: equivale a --stem analisi1
 """

 import json
+import argparse
 import sys
 from pathlib import Path


-# ─── Soglie ──────────────────────────────────────────────────────────────────
+# ─── Punteggio ───────────────────────────────────────────────────────────────

-_CRITICO_STRUTTURA   = 0    # livello_struttura == 0 → testo piatto, nessun header
-_CRITICO_BACKTICK    = 50   # molti accenti non corretti → testo illeggibile
-_WARNING_BARE        = 1    # anche un solo header senza titolo è sospetto
-_WARNING_BACKTICK    = 1    # qualsiasi backtick residuo va verificato
-_WARNING_LONG_SECS   = 80   # troppe sezioni lunghe indica struttura insufficiente
+def _score(r: dict) -> int:
+    """
+    Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.

-
-def _status(r: dict) -> str:
+    Penalità:
+      - struttura assente (livello 0) / piatta (livello 1) → -40 / -15
+      - backtick residui nel testo        → -2 per occorrenza (max -30)
+      - URL / watermark residui           → -5 per occorrenza (max -15)
+      - immagini residue                  → -5 per occorrenza (max -10)
+      - dot-leader residui                → -5 per occorrenza (max -10)
+      - header senza titolo (bare)        → -3 per occorrenza (max -15)
+      - sezioni > 1500 chars oltre il 35% / 60% del totale h3 → -5 / -10
+    """
+    score     = 100
    structure = r.get("structure", {})
    anomalie  = r.get("anomalie", {})
    residui   = r.get("residui", {})

-    livello  = structure.get("livello_struttura", -1)
-    backtick = residui.get("backtick", 0)
+    livello  = structure.get("livello_struttura", 0)
+    n_h3     = max(structure.get("n_h3", 0), 1)

-    if livello <= _CRITICO_STRUTTURA or backtick >= _CRITICO_BACKTICK:
-        return "❌"
-    if (
-        anomalie.get("bare_headers", 0) >= _WARNING_BARE
-        or backtick >= _WARNING_BACKTICK
-        or anomalie.get("long_sections", 0) >= _WARNING_LONG_SECS
-    ):
-        return "⚠️ "
-    return "✅"
+    # Struttura
+    if livello == 0:
+        score -= 40
+    elif livello == 1:
+        score -= 15
+
+    # Residui nel testo
+    score -= min(30, residui.get("backtick",  0) * 2)
+    score -= min(15, residui.get("url",       0) * 5)
+    score -= min(10, residui.get("immagini",  0) * 5)
+    score -= min(10, residui.get("dotleader", 0) * 5)
+
+    # Anomalie strutturali
+    score -= min(15, anomalie.get("bare_headers", 0) * 3)
+
+    # Sezioni troppo lunghe (in % sul totale delle sezioni ###)
+    long_ratio = anomalie.get("long_sections", 0) / n_h3
+    if long_ratio > 0.6:
+        score -= 10
+    elif long_ratio > 0.35:
+        score -= 5
+
+    return max(0, score)


-def _fmt(value, width: int) -> str:
-    return str(value).ljust(width)
+def _grade(score: int) -> str:
+    if score >= 90: return "A"
+    if score >= 75: return "B"
+    if score >= 60: return "C"
+    if score >= 40: return "D"
+    return "F"


+# ─── CLI ─────────────────────────────────────────────────────────────────────
+
+def _normalize_target(token: str) -> str:
+    """
+    Normalizza un target CLI in stem:
+      - analisi1
+      - --analisi1          (compatibilità)
+      - conversione/analisi1/report.json
+      - analisi1.pdf / analisi1.md / analisi1.json
+    """
+    raw = token.strip()
+    if not raw:
+        return raw
+
+    # Compatibilità con invocazione tipo: --analisi1
+    if raw.startswith("--") and len(raw) > 2:
+        raw = raw[2:]
+
+    p = Path(raw)
+
+    # Path diretto al report
+    if p.name == "report.json" and p.parent.name:
+        return p.parent.name
+
+    name = p.name
+    if name.endswith((".pdf", ".md", ".json")):
+        name = Path(name).stem
+
+    return name
+
+
+def _parse_cli_args(argv: list[str]) -> list[str]:
+    parser = argparse.ArgumentParser(
+        description="Valida i report Markdown prodotti in conversione/<stem>/report.json"
+    )
+    parser.add_argument(
+        "targets",
+        nargs="*",
+        help="Stem, file o path da validare (es: analisi1 oppure conversione/analisi1/report.json)",
+    )
+    parser.add_argument(
+        "-s",
+        "--stem",
+        action="append",
+        default=[],
+        help="Stem specifico (ripetibile, es: --stem analisi1 --stem nietzsche)",
+    )
+
+    args, unknown = parser.parse_known_args(argv)
+
+    targets = [*args.targets, *args.stem]
+
+    # Compatibilità: `python validate.py --analisi1`
+    for tok in unknown:
+        if tok.startswith("--") and len(tok) > 2:
+            targets.append(tok[2:])
+        else:
+            parser.error(f"Argomento non riconosciuto: {tok}")
+
+    stems = []
+    seen = set()
+    for t in targets:
+        stem = _normalize_target(t)
+        if not stem or stem in seen:
+            continue
+        seen.add(stem)
+        stems.append(stem)
+
+    return stems
+
+
+# ─── Validazione ─────────────────────────────────────────────────────────────
+
 def validate(stems: list[str], project_root: Path) -> None:
    conv_dir = project_root / "conversione"

@@ -73,20 +176,23 @@ def validate(stems: list[str], project_root: Path) -> None:
        rows.append(r)

    # ── Intestazione ─────────────────────────────────────────────────────
-    col_stem    = max(len(r.get("stem", "stem")) for r in rows) + 2
+    col_stem = max(len(r.get("stem", "stem")) for r in rows) + 2
    header = (
        f"{'stem':<{col_stem}}"
        f"{'h2':>4}{'h3':>5}  "
        f"{'strategia':<20}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
-        f"  {'status'}"
+        f"  {'voto':>4}  {'grade'}"
    )
    sep = "─" * len(header)
    print()
    print(header)
    print(sep)

+    scores = []
+    scored_docs = []
+
    # ── Righe ─────────────────────────────────────────────────────────────
    for r in rows:
        if r.get("_missing"):
@@ -107,7 +213,11 @@ def validate(stems: list[str], project_root: Path) -> None:
        backtick = residui.get("backtick", 0)
        dotlead  = residui.get("dotleader", 0)
        url      = residui.get("url", 0)
-        status   = _status(r)
+
+        s = _score(r)
+        g = _grade(s)
+        scores.append(s)
+        scored_docs.append((stem, s, g))

        print(
            f"{stem:<{col_stem}}"
@@ -115,22 +225,33 @@ def validate(stems: list[str], project_root: Path) -> None:
            f"{strat:<20}"
            f"{bare:>5}{corte:>6}{lunghe:>7}"
            f"{backtick:>9}{dotlead:>8}{url:>4}"
-            f"  {status}"
+            f"  {s:>4}  {g}"
        )

+    # ── Riepilogo ─────────────────────────────────────────────────────────
    print(sep)
-    totali = len(rows)
-    ok  = sum(1 for r in rows if not r.get("_missing") and _status(r) == "✅")
-    warn = sum(1 for r in rows if not r.get("_missing") and _status(r).startswith("⚠"))
-    err = sum(1 for r in rows if not r.get("_missing") and _status(r) == "❌")
-    print(f"Totale: {totali}  ✅ {ok}  ⚠️  {warn}  ❌ {err}")
+    if scores:
+        media = sum(scores) / len(scores)
+        grade_media = _grade(int(media))
+        print(f"Documenti: {len(scores)}   "
+              f"Voto medio: {media:.0f}/100  {grade_media}   "
+              f"(A≥90  B≥75  C≥60  D≥40  F<40)")
+        if len(scored_docs) == 1:
+            stem, score, grade = scored_docs[0]
+            print(f"Voto finale Markdown ({stem}): {score}/100  {grade}")
+        else:
+            voti = ", ".join(
+                f"{stem}={score}/100 {grade}"
+                for stem, score, grade in scored_docs
+            )
+            print(f"Voti Markdown: {voti}")
    print()
-    print("Legenda colonne: bare=header senza titolo  corte=sezioni<150ch  "
-          "lunghe=sezioni>1500ch  backtick=accenti residui")
+    print("Penalità: struttura assente −40, backtick residui −2/cad, "
+          "bare headers −3/cad, sezioni >1500ch >35% −5")
    print()


 if __name__ == "__main__":
    project_root = Path(__file__).parent.parent
-    stems = sys.argv[1:]
+    stems = _parse_cli_args(sys.argv[1:])
    validate(stems, project_root)