feat(validate): scoring orientato a chunking/vettorizzazione, flag --detail

- _score() ritorna (int, list[str]) con il dettaglio delle penalità applicate
- Rimossi i criteri non pertinenti al chunking: sezioni_corte, sezioni_lunghe, mediana, p25 — il chunker le normalizza già in fase di suddivisione
- Aggiunte penalità per i residui che impattano i vettori: br_inline, simboli_encoding, formule_inline
- Flag --detail / -d per mostrare il breakdown delle penalità per documento
- Colonne della tabella aggiornate: btk, br, enc, url, med
2026-04-17 09:20:15 +02:00
parent ea721774da
commit 875a342efa
1 changed files with 78 additions and 37 deletions
@@ -15,6 +15,7 @@ Uso:
    python conversione/validate.py              # tutti gli stem
    python conversione/validate.py analisi1     # stem specifico
    python conversione/validate.py a b c        # stem multipli
+    python conversione/validate.py --detail analisi1  # mostra dettaglio penalità
 """

 import argparse
@@ -28,45 +29,72 @@ from pathlib import Path
 _GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]


-def _score(r: dict) -> int:
+def _score(r: dict) -> tuple[int, list[str]]:
    """
-    Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.
+    Calcola un punteggio 0-100 sulla qualità del clean.md ai fini della
+    suddivisione in chunk e vettorizzazione.
+    Restituisce (score, lista_penalità_applicate).

-    Penalità:
-      struttura assente / piatta  → −40 / −15
-      backtick residui            → −2/cad (max −30)
-      URL / watermark             → −5/cad (max −15)
-      immagini residue            → −5/cad (max −10)
-      dot-leader residui          → −5/cad (max −10)
-      bare headers                → −3/cad (max −15)
-      sezioni >1500ch >35/60%     → −5 / −10
+    Penalità struttura (il chunker non può operare senza header):
+      struttura assente (livello 0)    → −40
+      struttura piatta (livello 1)     → −15
+
+    Penalità residui (finiscono nei vettori e degradano il retrieval):
+      backtick                         → −2/cad  (max −20)
+      dot-leader                       → −5/cad  (max −10)
+      URL / watermark                  → −5/cad  (max −15)
+      immagini residue                 → −5/cad  (max −10)
+      <br> inline (artefatti tabelle)  → −2/cad  (max −15)
+      simboli encoding (!/" residui)   → −1/cad  (max −10)
+      formule inline [N.M]             → −1/cad  (max −8)
+
+    Penalità anomalie:
+      bare headers                     → −3/cad  (max −15)
+
+    Non penalizzate (il chunker le normalizza):
+      sezioni corte, sezioni lunghe, mediana, p25
    """
-    score    = 100
+    score  = 100
+    detail = []
    structure = r.get("structure", {})
    anomalie  = r.get("anomalie",  {})
    residui   = r.get("residui",   {})

    livello = structure.get("livello_struttura", 0)
-    n_h3    = max(structure.get("n_h3", 0), 1)

+    # ── Struttura ─────────────────────────────────────────────────────────
    if livello == 0:
        score -= 40
+        detail.append("struttura assente −40")
    elif livello == 1:
        score -= 15
+        detail.append("struttura piatta −15")

-    score -= min(30, residui.get("backtick",  0) * 2)
-    score -= min(15, residui.get("url",       0) * 5)
-    score -= min(10, residui.get("immagini",  0) * 5)
-    score -= min(10, residui.get("dotleader", 0) * 5)
-    score -= min(15, anomalie.get("bare_headers", 0) * 3)
+    # ── Residui ───────────────────────────────────────────────────────────
+    def _pen(key: str, per_item: int, cap: int, label: str) -> None:
+        n = residui.get(key, 0)
+        if n:
+            p = min(cap, n * per_item)
+            nonlocal score
+            score -= p
+            detail.append(f"{label} ×{n} −{p}")

-    long_ratio = anomalie.get("long_sections", 0) / n_h3
-    if long_ratio > 0.60:
-        score -= 10
-    elif long_ratio > 0.35:
-        score -= 5
+    _pen("backtick",         2, 20, "backtick")
+    _pen("dotleader",        5, 10, "dot-leader")
+    _pen("url",              5, 15, "url")
+    _pen("immagini",         5, 10, "immagini")
+    _pen("br_inline",        2, 15, "<br> inline")
+    _pen("simboli_encoding", 1, 10, "simboli encoding")
+    _pen("formule_inline",   1,  8, "formule inline")

-    return max(0, score)
+    # ── Anomalie ──────────────────────────────────────────────────────────
+    n_bare = anomalie.get("bare_headers", 0)
+    if n_bare:
+        p = min(15, n_bare * 3)
+        score -= p
+        detail.append(f"bare headers ×{n_bare} −{p}")
+
+    return max(0, score), detail


 def _grade(score: int) -> str:
@@ -75,7 +103,7 @@ def _grade(score: int) -> str:

 # ─── Validazione ─────────────────────────────────────────────────────────────

-def validate(stems: list[str], project_root: Path) -> None:
+def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
    conv_dir = project_root / "conversione"

    paths = (
@@ -99,9 +127,10 @@ def validate(stems: list[str], project_root: Path) -> None:
    header = (
        f"{'stem':<{col}}"
        f"{'h2':>4}{'h3':>5}  "
-        f"{'strategia':<20}"
+        f"{'strategia':<18}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
-        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
+        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}"
+        f"{'med':>6}"
        f"  {'voto':>4}  grade"
    )
    sep = "─" * len(header)
@@ -115,26 +144,33 @@ def validate(stems: list[str], project_root: Path) -> None:
            print(f"{r['stem']:<{col}}  (report.json non trovato)")
            continue

-        st  = r.get("structure", {})
-        an  = r.get("anomalie",  {})
-        res = r.get("residui",   {})
-        s   = _score(r)
+        st   = r.get("structure",    {})
+        an   = r.get("anomalie",     {})
+        res  = r.get("residui",      {})
+        dist = r.get("distribution", {})
+        s, pen = _score(r)
        scores.append(s)

        print(
            f"{r['stem']:<{col}}"
            f"{st.get('n_h2',              0):>4}"
            f"{st.get('n_h3',              0):>5}  "
-            f"{st.get('strategia_chunking','?'):<20}"
+            f"{st.get('strategia_chunking','?'):<18}"
            f"{an.get('bare_headers',      0):>5}"
            f"{an.get('short_sections',    0):>6}"
            f"{an.get('long_sections',     0):>7}"
-            f"{res.get('backtick',         0):>9}"
-            f"{res.get('dotleader',        0):>8}"
-            f"{res.get('url',             0):>4}"
+            f"{res.get('backtick',         0):>5}"
+            f"{res.get('br_inline',        0):>4}"
+            f"{res.get('simboli_encoding', 0):>4}"
+            f"{res.get('url',              0):>4}"
+            f"{dist.get('mediana',         0):>6}"
            f"  {s:>4}  {_grade(s)}"
        )

+        if detail and pen:
+            for p in pen:
+                print(f"  {'':>{col}}  ↳ {p}")
+
    # ── Riepilogo ─────────────────────────────────────────────────────────
    print(sep)
    if scores:
@@ -145,8 +181,8 @@ def validate(stems: list[str], project_root: Path) -> None:
            f"(A≥90  B≥75  C≥60  D≥40  F<40)"
        )
    print(
-        "\nPenalità: struttura assente −40, backtick −2/cad, "
-        "bare headers −3/cad, sezioni >1500ch >35% −5\n"
+        "\nColonne: bare=header vuoti  corte=sez<150ch  lunghe=sez>1500ch  "
+        "btk=backtick  br=<br>inline  enc=simboli encoding  med=mediana chars\n"
    )


@@ -163,5 +199,10 @@ if __name__ == "__main__":
        metavar="STEM",
        help="stem da validare (es: analisi1). Ometti per tutti.",
    )
+    parser.add_argument(
+        "--detail", "-d",
+        action="store_true",
+        help="mostra dettaglio penalità per ogni documento",
+    )
    args = parser.parse_args()
-    validate(args.stems, Path(__file__).parent.parent)
+    validate(args.stems, Path(__file__).parent.parent, detail=args.detail)