feat(pdf-to-md): sostituisci report.md con report.json + validate.py

pipeline.py produce conversione/<stem>/report.json invece di structure_profile.json + report.md. Il JSON contiene tutto: trasformazioni, struttura, distribuzione lunghezze sezioni, anomalie (bare_headers, short/long sections) e residui con esempi. Fix: bare_headers segnala solo gli header solo-numero con corpo vuoto o inferiore a 30 caratteri; gli header numerati con corpo lungo (aforismi) non sono anomalie. Nuovo validate.py legge tutti i report.json e stampa tabella di stato per ogni stem (✅ / ⚠️ / ❌) con soglie configurabili. README aggiornato con sezione validazione batch e struttura report.json.
2026-04-16 15:53:46 +02:00
parent 6ec54c8616
commit 5b6940e479
3 changed files with 338 additions and 17 deletions
@@ -55,26 +55,49 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
 |------|-------------|
 | `raw.md` | Markdown grezzo estratto dal PDF — **non modificare** |
 | `clean.md` | Markdown pulito e strutturato — input per il chunker |
-| `structure_profile.json` | Profilo strutturale del documento |
+| `report.json` | Metriche complete di qualità della conversione |

-### structure_profile.json
+### report.json
+
+Contiene tutto ciò che serve per valutare la conversione: statistiche
+trasformazioni, struttura rilevata, distribuzione lunghezze sezioni,
+anomalie e problemi residui con esempi.

 ```json
 {
-  "livello_struttura": 3,
-  "n_h1": 1,
-  "n_h2": 6,
-  "n_h3": 163,
-  "n_paragrafi": 213,
-  "boundary_primario": "h3",
-  "lingua_rilevata": "it",
-  "lunghezza_media_sezione": 520,
-  "strategia_chunking": "h3_aware",
-  "avvertenze": []
+  "stem": "dirittoprivato",
+  "timestamp": "2026-04-16 15:41",
+  "transforms": {
+    "n_accenti_corretti": 0,
+    "n_dotleader_rimossi": 0,
+    "toc_rimosso": false,
+    "n_sezioni_numerate": 63,
+    "riduzione_pct": 1
+  },
+  "structure": {
+    "livello_struttura": 3,
+    "n_h1": 0, "n_h2": 6, "n_h3": 163,
+    "lingua_rilevata": "it",
+    "strategia_chunking": "h3_aware",
+    "avvertenze": []
+  },
+  "distribution": { "min": 12, "p25": 312, "mediana": 681, "p75": 1197, "max": 6120 },
+  "anomalie": {
+    "bare_headers": 0,
+    "short_sections": 1,
+    "long_sections": 39,
+    "bare_headers_list": [],
+    "short_sections_list": [...],
+    "long_sections_list": [...]
+  },
+  "residui": {
+    "backtick": 0, "dotleader": 0, "url": 0, "immagini": 0,
+    "backtick_esempi": []
+  }
 }
 ```

-**`strategia_chunking`** indica come il chunker dovrebbe suddividere il documento:
+**`strategia_chunking`** indica come suddividere il documento in chunk:

 | Valore | Significato |
 |--------|-------------|
@@ -85,6 +108,44 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:

 ---

+## Validazione batch
+
+Dopo aver convertito uno o più documenti, esegui `validate.py` per ottenere
+una tabella di stato su tutti gli stem:
+
+```bash
+python conversione/validate.py
+```
+
+Output di esempio:
+
+```
+stem              h2   h3  strategia            bare corte lunghe backtick dotlead url  status
+──────────────────────────────────────────────────────────────────────────────────────────────
+analisi1          13  279  h3_aware                0    36    151       10       0   0  ⚠️
+dirittoprivato     6  163  h3_aware                0     1     39        0       0   0  ✅
+nietzsche          4  303  h3_aware                6   104    100        0       0   0  ⚠️
+──────────────────────────────────────────────────────────────────────────────────────────────
+Totale: 3  ✅ 1  ⚠️  2  ❌ 0
+```
+
+**Legenda colonne:**
+
+| Colonna | Significato | Soglia warning |
+|---------|-------------|----------------|
+| `bare` | Header solo-numero con corpo vuoto o < 30 chars (es. `### 1.` senza testo) | ≥ 1 |
+| `corte` | Sezioni con corpo < 150 chars | informativo |
+| `lunghe` | Sezioni con corpo > 1500 chars | ≥ 80 |
+| `backtick` | Backtick `` ` `` residui nel testo | ≥ 1 |
+| `dotlead` | Dot-leader residui (`. . . .`) | ≥ 1 |
+
+**Stato:**
+- ✅ nessuna anomalia critica
+- ⚠️ anomalie presenti, documento processabile ma da verificare
+- ❌ struttura non rilevata (`livello_struttura = 0`) o ≥ 50 backtick residui
+
+---
+
 ## Cosa fa la pipeline

 La pipeline esegue quattro fasi in sequenza.
@@ -30,6 +30,7 @@ import re
 import subprocess
 import sys
 import tempfile
+from datetime import datetime
 from pathlib import Path


@@ -563,14 +564,136 @@ def analyze(md_path: Path) -> dict:
    }


+# ─── Report di conversione ───────────────────────────────────────────────────
+
+def build_report(
+    stem: str,
+    out_dir: Path,
+    clean_text: str,
+    t_stats: dict,
+    profile: dict,
+    reduction: float,
+) -> Path:
+    """
+    Genera conversione/<stem>/report.json con tutte le metriche di qualità:
+    statistiche trasformazioni, struttura, distribuzione lunghezze, anomalie
+    e problemi residui. Leggibile da validate.py per la validazione batch.
+    """
+    text_lines = clean_text.split("\n")
+
+    # ── Raccolta sezioni ### con corpo ────────────────────────────────────
+    sections: list[tuple[str, str]] = []
+    cur_hdr: str | None = None
+    cur_body: list[str] = []
+    for line in text_lines:
+        if re.match(r"^### ", line):
+            if cur_hdr is not None:
+                sections.append((cur_hdr, "\n".join(cur_body).strip()))
+            cur_hdr = line
+            cur_body = []
+        elif cur_hdr is not None:
+            cur_body.append(line)
+    if cur_hdr is not None:
+        sections.append((cur_hdr, "\n".join(cur_body).strip()))
+
+    lengths = [len(body) for _, body in sections]
+
+    # ── Distribuzione lunghezze ───────────────────────────────────────────
+    def _pct(data: list[int], p: float) -> int:
+        if not data:
+            return 0
+        s = sorted(data)
+        return s[max(0, min(len(s) - 1, int(len(s) * p)))]
+
+    distribution = {
+        "min":     min(lengths) if lengths else 0,
+        "p25":     _pct(lengths, 0.25),
+        "mediana": _pct(lengths, 0.50),
+        "p75":     _pct(lengths, 0.75),
+        "max":     max(lengths) if lengths else 0,
+    }
+
+    # ── Anomalie ──────────────────────────────────────────────────────────
+    # Header solo-numero senza corpo sostanziale: anomalia solo se il corpo
+    # è vuoto o < 30 chars. Un body lungo è una sezione numerata legittima
+    # (es. aforismi numerati dove il numero è l'identificatore della sezione).
+    bare_hdrs = [
+        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
+        for hdr, body in sections
+        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
+    ]
+
+    short_secs = [
+        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
+        for (hdr, body), length in zip(sections, lengths)
+        if 0 < length < 150
+    ]
+
+    long_secs = [
+        {"header": hdr, "chars": length}
+        for (hdr, _), length in zip(sections, lengths)
+        if length > 1500
+    ]
+
+    # ── Problemi residui (max 10 esempi ciascuno) ─────────────────────────
+    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
+        hits = []
+        for i, line in enumerate(text_lines):
+            if re.search(pattern, line) and not re.match(r"^#+ ", line):
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    residui = {
+        "backtick":  _scan(r"`"),
+        "dotleader": _scan(r"(?:\. ){3,}"),
+        "url":       _scan(r"^(https?://|www\.)\S+"),
+        "immagini":  _scan(r"!\[[^\]]*\]\([^)]*\)"),
+    }
+
+    # ── Composizione report ───────────────────────────────────────────────
+    report = {
+        "stem": stem,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
+        "transforms": {
+            **t_stats,
+            "riduzione_pct": round(reduction),
+        },
+        "structure": profile,
+        "distribution": distribution,
+        "anomalie": {
+            "bare_headers":   len(bare_hdrs),
+            "short_sections": len(short_secs),
+            "long_sections":  len(long_secs),
+            "bare_headers_list":   bare_hdrs,
+            "short_sections_list": short_secs,
+            "long_sections_list":  long_secs,
+        },
+        "residui": {
+            "backtick":  len(residui["backtick"]),
+            "dotleader": len(residui["dotleader"]),
+            "url":       len(residui["url"]),
+            "immagini":  len(residui["immagini"]),
+            "backtick_esempi":  residui["backtick"],
+            "dotleader_esempi": residui["dotleader"],
+            "url_esempi":       residui["url"],
+            "immagini_esempi":  residui["immagini"],
+        },
+    }
+
+    report_path = out_dir / "report.json"
+    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    return report_path
+
+
 # ─── Pipeline principale ──────────────────────────────────────────────────────

 def run(stem: str, project_root: Path, force: bool) -> bool:
    pdf_path = project_root / "sources" / f"{stem}.pdf"
-    out_dir = project_root / "conversion" / stem
+    out_dir = project_root / "conversione" / stem
    raw_out = out_dir / "raw.md"
    clean_out = out_dir / "clean.md"
-    profile_out = out_dir / "structure_profile.json"

    print(f"\n{'─' * 52}")
    print(f"  {stem}")
@@ -623,7 +746,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    raw_out.write_text(raw_text, encoding="utf-8")
    clean_out.write_text(clean_text, encoding="utf-8")
    profile = analyze(clean_out)
-    profile_out.write_text(json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8")

    _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
    print(f"  ✅ Struttura: livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")
@@ -634,10 +756,12 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    for w in profile["avvertenze"]:
        print(f"     ⚠️  {w}")

+    build_report(stem, out_dir, clean_text, t_stats, profile, reduction)
+
    print(f"\n  Output:")
    print(f"    conversione/{stem}/raw.md               (immutabile)")
    print(f"    conversione/{stem}/clean.md")
-    print(f"    conversione/{stem}/structure_profile.json")
+    print(f"    conversione/{stem}/report.json")
    print(f"\n  clean.md pronto per la suddivisione in chunk.")
    return True

@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+conversione/validate.py — Validazione batch di tutti gli stem convertiti
+
+Legge i report.json prodotti da pipeline.py e stampa una tabella di stato
+per ogni documento, evidenziando anomalie e problemi residui.
+
+Stato per stem:
+  ✅  nessuna anomalia critica
+  ⚠️  anomalie presenti ma documento processabile
+  ❌  struttura non rilevata o problemi gravi
+
+Uso:
+    python conversione/validate.py              # tutti gli stem
+    python conversione/validate.py analisi1     # stem specifico
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+# ─── Thresholds ──────────────────────────────────────────────────────────────
+
+_CRITICO_STRUTTURA   = 0    # livello_struttura at or below this is critical (0 = flat text, no headers)
+_CRITICO_BACKTICK    = 50   # many uncorrected accents → unreadable text
+_WARNING_BARE        = 1    # even a single header without a title is suspicious
+_WARNING_BACKTICK    = 1    # any residual backtick should be checked
+_WARNING_LONG_SECS   = 80   # too many long sections suggests insufficient structure
+
+
+def _status(r: dict) -> str:
+    structure = r.get("structure", {})
+    anomalie  = r.get("anomalie", {})
+    residui   = r.get("residui", {})
+
+    livello  = structure.get("livello_struttura", -1)
+    backtick = residui.get("backtick", 0)
+
+    if livello <= _CRITICO_STRUTTURA or backtick >= _CRITICO_BACKTICK:
+        return "❌"
+    if (
+        anomalie.get("bare_headers", 0) >= _WARNING_BARE
+        or backtick >= _WARNING_BACKTICK
+        or anomalie.get("long_sections", 0) >= _WARNING_LONG_SECS
+    ):
+        return "⚠️ "
+    return "✅"
+
+
+def _fmt(value, width: int) -> str:
+    return str(value).ljust(width)
+
+
+def validate(stems: list[str], project_root: Path) -> None:
+    conv_dir = project_root / "conversione"
+
+    if stems:
+        paths = [conv_dir / s / "report.json" for s in stems]
+    else:
+        paths = sorted(conv_dir.glob("*/report.json"))
+
+    if not paths:
+        print("Nessun report.json trovato in conversione/*/")
+        sys.exit(0)
+
+    rows = []
+    for path in paths:
+        if not path.exists():
+            rows.append({"stem": path.parent.name, "_missing": True})
+            continue
+        r = json.loads(path.read_text(encoding="utf-8"))
+        rows.append(r)
+
+    # ── Intestazione ─────────────────────────────────────────────────────
+    col_stem    = max(len(r.get("stem", "stem")) for r in rows) + 2
+    header = (
+        f"{'stem':<{col_stem}}"
+        f"{'h2':>4}{'h3':>5}  "
+        f"{'strategia':<20}"
+        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
+        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
+        f"  {'status'}"
+    )
+    sep = "─" * len(header)
+    print()
+    print(header)
+    print(sep)
+
+    # ── Righe ─────────────────────────────────────────────────────────────
+    for r in rows:
+        if r.get("_missing"):
+            print(f"{r['stem']:<{col_stem}}  (report.json non trovato)")
+            continue
+
+        stem      = r.get("stem", "?")
+        structure = r.get("structure", {})
+        anomalie  = r.get("anomalie", {})
+        residui   = r.get("residui", {})
+
+        h2       = structure.get("n_h2", 0)
+        h3       = structure.get("n_h3", 0)
+        strat    = structure.get("strategia_chunking", "?")
+        bare     = anomalie.get("bare_headers", 0)
+        corte    = anomalie.get("short_sections", 0)
+        lunghe   = anomalie.get("long_sections", 0)
+        backtick = residui.get("backtick", 0)
+        dotlead  = residui.get("dotleader", 0)
+        url      = residui.get("url", 0)
+        status   = _status(r)
+
+        print(
+            f"{stem:<{col_stem}}"
+            f"{h2:>4}{h3:>5}  "
+            f"{strat:<20}"
+            f"{bare:>5}{corte:>6}{lunghe:>7}"
+            f"{backtick:>9}{dotlead:>8}{url:>4}"
+            f"  {status}"
+        )
+
+    print(sep)
+    totali = len(rows)
+    ok  = sum(1 for r in rows if not r.get("_missing") and _status(r) == "✅")
+    warn = sum(1 for r in rows if not r.get("_missing") and _status(r).startswith("⚠"))
+    err = sum(1 for r in rows if not r.get("_missing") and _status(r) == "❌")
+    print(f"Totale: {totali}  ✅ {ok}  ⚠️  {warn}  ❌ {err}")
+    print()
+    print("Legenda colonne: bare=header senza titolo  corte=sezioni<150ch  "
+          "lunghe=sezioni>1500ch  backtick=accenti residui")
+    print()
+
+
+if __name__ == "__main__":
+    # The project root is the parent of the directory containing this script.
+    project_root = Path(__file__).parent.parent
+    # Command-line arguments are stem names; an empty list is passed through as-is.
+    stems = sys.argv[1:]
+    validate(stems, project_root)