feat(pdf-to-md): sostituisci report.md con report.json + validate.py

pipeline.py produce conversione/<stem>/report.json invece di structure_profile.json + report.md. Il JSON contiene tutto: trasformazioni, struttura, distribuzione lunghezze sezioni, anomalie (bare_headers, short/long sections) e residui con esempi. Fix: bare_headers flagga solo header senza corpo < 30 chars; header numerati con corpo lungo (aforismi) non sono anomalie. Nuovo validate.py legge tutti i report.json e stampa tabella di stato per ogni stem (✅ / ⚠️ / ❌) con soglie configurabili. README aggiornato con sezione validazione batch e struttura report.json.
2026-04-16 15:53:46 +02:00
parent 6ec54c8616
commit 5b6940e479
3 changed files with 338 additions and 17 deletions
@@ -55,26 +55,49 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
 |------|-------------|
 | `raw.md` | Markdown grezzo estratto dal PDF — **non modificare** |
 | `clean.md` | Markdown pulito e strutturato — input per il chunker |
-| `structure_profile.json` | Profilo strutturale del documento |
+| `report.json` | Metriche complete di qualità della conversione |
-### structure_profile.json
+### report.json
 Contiene tutto ciò che serve per valutare la conversione: statistiche
 trasformazioni, struttura rilevata, distribuzione lunghezze sezioni,
 anomalie e problemi residui con esempi.
 ```json
 {
-  "livello_struttura": 3,
+  "stem": "dirittoprivato",
-  "n_h1": 1,
+  "timestamp": "2026-04-16 15:41",
-  "n_h2": 6,
+  "transforms": {
-  "n_h3": 163,
+    "n_accenti_corretti": 0,
-  "n_paragrafi": 213,
+    "n_dotleader_rimossi": 0,
-  "boundary_primario": "h3",
+    "toc_rimosso": false,
-  "lingua_rilevata": "it",
+    "n_sezioni_numerate": 63,
-  "lunghezza_media_sezione": 520,
+    "riduzione_pct": 1
-  "strategia_chunking": "h3_aware",
+  },
-  "avvertenze": []
+  "structure": {
    "livello_struttura": 3,
    "n_h1": 0, "n_h2": 6, "n_h3": 163,
    "lingua_rilevata": "it",
    "strategia_chunking": "h3_aware",
    "avvertenze": []
  },
  "distribution": { "min": 12, "p25": 312, "mediana": 681, "p75": 1197, "max": 6120 },
  "anomalie": {
    "bare_headers": 0,
    "short_sections": 1,
    "long_sections": 39,
    "bare_headers_list": [],
    "short_sections_list": [...],
    "long_sections_list": [...]
  },
  "residui": {
    "backtick": 0, "dotleader": 0, "url": 0, "immagini": 0,
    "backtick_esempi": []
  }
 }
 ```
-**`strategia_chunking`** indica come il chunker dovrebbe suddividere il documento:
+**`strategia_chunking`** indica come suddividere il documento in chunk:
 | Valore | Significato |
 |--------|-------------|
@@ -85,6 +108,44 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
 ---
 ## Validazione batch
 Dopo aver convertito uno o più documenti, esegui `validate.py` per ottenere
 una tabella di stato su tutti gli stem:
 ```bash
 python conversione/validate.py
 ```
 Output di esempio:
 ```
 stem              h2   h3  strategia            bare corte lunghe backtick dotlead url  status
 ──────────────────────────────────────────────────────────────────────────────────────────────
 analisi1          13  279  h3_aware                0    36    151       10       0   0  ⚠️
 dirittoprivato     6  163  h3_aware                0     1     39        0       0   0  ✅
 nietzsche          4  303  h3_aware                6   104    100        0       0   0  ⚠️
 ──────────────────────────────────────────────────────────────────────────────────────────────
 Totale: 3  ✅ 1  ⚠️  2  ❌ 0
 ```
 **Legenda colonne:**
 | Colonna | Significato | Soglia warning |
 |---------|-------------|----------------|
 | `bare` | Header solo-numero (es. `### 1.`) con corpo vuoto o < 30 chars | ≥ 1 |
 | `corte` | Sezioni con corpo < 150 chars | informativo |
 | `lunghe` | Sezioni con corpo > 1500 chars | ≥ 80 |
 | `backtick` | Backtick `` ` `` residui nel testo | ≥ 1 |
 | `dotlead` | Dot-leader residui (`. . . .`) | ≥ 1 |
 **Stato:**
 - ✅ nessuna anomalia critica
 - ⚠️ anomalie presenti, documento processabile ma da verificare
 - ❌ struttura non rilevata (`livello_struttura = 0`) o ≥ 50 backtick residui
 ---
 ## Cosa fa la pipeline
 La pipeline esegue quattro fasi in sequenza.
@@ -30,6 +30,7 @@ import re
 import subprocess
 import sys
 import tempfile
 from datetime import datetime
 from pathlib import Path
@@ -563,14 +564,136 @@ def analyze(md_path: Path) -> dict:
    }
 # ─── Report di conversione ───────────────────────────────────────────────────
 def build_report(
    stem: str,
    out_dir: Path,
    clean_text: str,
    t_stats: dict,
    profile: dict,
    reduction: float,
 ) -> Path:
    """
    Write ``out_dir / "report.json"`` with the conversion quality metrics.

    The report bundles transformation statistics, the structure profile,
    the distribution of ``###`` section body lengths, anomalies and
    residual problems (with examples).

    Args:
        stem: Document identifier, stored under the "stem" key.
        out_dir: Directory that receives ``report.json``.
        clean_text: Cleaned markdown text to analyse.
        t_stats: Transformation counters, copied into "transforms".
        profile: Structure profile, stored as-is under "structure".
        reduction: Size reduction percentage; rounded to an integer.

    Returns:
        Path of the written ``report.json``.
    """
    text_lines = clean_text.split("\n")

    # ── Collect "### " sections together with their body ─────────────────
    sections: list[tuple[str, str]] = []
    cur_hdr: str | None = None
    cur_body: list[str] = []
    for line in text_lines:
        if line.startswith("### "):
            if cur_hdr is not None:
                sections.append((cur_hdr, "\n".join(cur_body).strip()))
            cur_hdr = line
            cur_body = []
        elif cur_hdr is not None:
            # Lines before the first "### " header belong to no section.
            cur_body.append(line)
    if cur_hdr is not None:
        sections.append((cur_hdr, "\n".join(cur_body).strip()))
    lengths = [len(body) for _, body in sections]

    # ── Length distribution ──────────────────────────────────────────────
    def _pct(data: list[int], p: float) -> int:
        """Return the element at index ``int(len * p)`` of the sorted data."""
        if not data:
            return 0
        s = sorted(data)
        return s[max(0, min(len(s) - 1, int(len(s) * p)))]

    distribution = {
        "min":     min(lengths) if lengths else 0,
        "p25":     _pct(lengths, 0.25),
        "mediana": _pct(lengths, 0.50),
        "p75":     _pct(lengths, 0.75),
        "max":     max(lengths) if lengths else 0,
    }

    # ── Anomalies ────────────────────────────────────────────────────────
    # A number-only header is an anomaly only when its body is empty or
    # shorter than 30 chars. A long body is a legitimate numbered section
    # (e.g. numbered aphorisms where the number identifies the section).
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections
        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
    ]
    short_secs = [
        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
        for (hdr, body), length in zip(sections, lengths)
        if 0 < length < 150
    ]
    long_secs = [
        {"header": hdr, "chars": length}
        for (hdr, _), length in zip(sections, lengths)
        if length > 1500
    ]

    # ── Residual problems ────────────────────────────────────────────────
    header_re = re.compile(r"^#+ ")

    def _scan(pattern: str, max_n: int = 10) -> tuple[int, list[dict]]:
        """
        Count the non-header lines matching ``pattern``.

        Returns the total number of matching lines and the first ``max_n``
        of them as examples. The total is NOT capped: stopping at ``max_n``
        would make every reported count saturate at 10 and hide badly
        damaged documents from threshold checks on the counts.
        """
        regex = re.compile(pattern)
        total = 0
        examples: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            if header_re.match(line) or not regex.search(line):
                continue
            total += 1
            if len(examples) < max_n:
                examples.append({"riga": lineno, "testo": line.strip()[:120]})
        return total, examples

    residui = {
        "backtick":  _scan(r"`"),
        "dotleader": _scan(r"(?:\. ){3,}"),
        "url":       _scan(r"^(https?://|www\.)\S+"),
        "immagini":  _scan(r"!\[[^\]]*\]\([^)]*\)"),
    }
    # Counts first, then the example lists, keeping the original key order.
    residui_report: dict = {name: total for name, (total, _) in residui.items()}
    residui_report.update(
        {f"{name}_esempi": examples for name, (_, examples) in residui.items()}
    )

    # ── Report assembly ──────────────────────────────────────────────────
    report = {
        "stem": stem,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "transforms": {
            **t_stats,
            "riduzione_pct": round(reduction),
        },
        "structure": profile,
        "distribution": distribution,
        "anomalie": {
            "bare_headers":   len(bare_hdrs),
            "short_sections": len(short_secs),
            "long_sections":  len(long_secs),
            "bare_headers_list":   bare_hdrs,
            "short_sections_list": short_secs,
            "long_sections_list":  long_secs,
        },
        "residui": residui_report,
    }
    report_path = out_dir / "report.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    return report_path
 # ─── Pipeline principale ──────────────────────────────────────────────────────
 def run(stem: str, project_root: Path, force: bool) -> bool:
    pdf_path = project_root / "sources" / f"{stem}.pdf"
-    out_dir = project_root / "conversion" / stem
+    out_dir = project_root / "conversione" / stem
    raw_out = out_dir / "raw.md"
    clean_out = out_dir / "clean.md"
    profile_out = out_dir / "structure_profile.json"
    print(f"\n{'─' * 52}")
    print(f"  {stem}")
@@ -623,7 +746,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    raw_out.write_text(raw_text, encoding="utf-8")
    clean_out.write_text(clean_text, encoding="utf-8")
    profile = analyze(clean_out)
    profile_out.write_text(json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8")
    _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
    print(f"  ✅ Struttura: livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")
@@ -634,10 +756,12 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    for w in profile["avvertenze"]:
        print(f"     ⚠️  {w}")
    build_report(stem, out_dir, clean_text, t_stats, profile, reduction)
    print(f"\n  Output:")
    print(f"    conversione/{stem}/raw.md               (immutabile)")
    print(f"    conversione/{stem}/clean.md")
-    print(f"    conversione/{stem}/structure_profile.json")
+    print(f"    conversione/{stem}/report.json")
    print(f"\n  clean.md pronto per la suddivisione in chunk.")
    return True
@@ -0,0 +1,136 @@
 #!/usr/bin/env python3
 """
 conversione/validate.py — Validazione batch di tutti gli stem convertiti
 Legge i report.json prodotti da pipeline.py e stampa una tabella di stato
 per ogni documento, evidenziando anomalie e problemi residui.
 Stato per stem:
  ✅  nessuna anomalia critica
  ⚠️  anomalie presenti ma documento processabile
  ❌  struttura non rilevata o problemi gravi
 Uso:
    python conversione/validate.py              # tutti gli stem
    python conversione/validate.py analisi1     # stem specifico
 """
 import json
 import sys
 from pathlib import Path
 # ─── Soglie ──────────────────────────────────────────────────────────────────
 _CRITICO_STRUTTURA   = 0    # livello_struttura == 0 → flat text, no headers
 _CRITICO_BACKTICK    = 50   # many uncorrected accents → unreadable text
 _WARNING_BARE        = 1    # even a single bare header is suspicious
 _WARNING_BACKTICK    = 1    # any residual backtick must be checked
 _WARNING_LONG_SECS   = 80   # too many long sections hints at insufficient structure
 def _status(r: dict) -> str:
    """
    Return the status marker for one parsed report.

    "❌" when the structure level is at or below the critical level (a
    missing level counts as -1) or the backtick count reaches the critical
    threshold; "⚠️ " when any warning threshold is reached; "✅" otherwise.
    """
    level = r.get("structure", {}).get("livello_struttura", -1)
    n_backtick = r.get("residui", {}).get("backtick", 0)

    # Critical conditions take precedence over warnings.
    if level <= _CRITICO_STRUTTURA or n_backtick >= _CRITICO_BACKTICK:
        return "❌"

    anomalies = r.get("anomalie", {})
    # Warning checks, evaluated in order and stopping at the first hit.
    if anomalies.get("bare_headers", 0) >= _WARNING_BARE:
        return "⚠️ "
    if n_backtick >= _WARNING_BACKTICK:
        return "⚠️ "
    if anomalies.get("long_sections", 0) >= _WARNING_LONG_SECS:
        return "⚠️ "
    return "✅"
 def _fmt(value, width: int) -> str:
    return str(value).ljust(width)
 def validate(stems: list[str], project_root: Path) -> None:
    """
    Print a status table for the report.json files under ``conversione/``.

    Args:
        stems: Stems to check; when empty, every ``conversione/*/report.json``
            found under ``project_root`` is used.
        project_root: Directory that contains the ``conversione`` folder.

    A requested stem without a report.json, and a report.json that cannot be
    read or parsed as a JSON object, each get their own row instead of
    aborting the batch. Neither kind of row is counted in the ✅ / ⚠️ / ❌
    totals. Exits with status 0 when there is nothing to check.
    """
    conv_dir = project_root / "conversione"
    if stems:
        paths = [conv_dir / s / "report.json" for s in stems]
    else:
        paths = sorted(conv_dir.glob("*/report.json"))
    if not paths:
        print("Nessun report.json trovato in conversione/*/")
        sys.exit(0)

    rows = []
    for path in paths:
        if not path.exists():
            rows.append({"stem": path.parent.name, "_missing": True})
            continue
        try:
            r = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            r = None
        if not isinstance(r, dict):
            # One broken report must not hide the status of all the others.
            rows.append({"stem": path.parent.name, "_invalid": True})
            continue
        rows.append(r)

    # Compute each status once; placeholder rows have no status.
    statuses = [
        None if (r.get("_missing") or r.get("_invalid")) else _status(r)
        for r in rows
    ]

    # ── Header ───────────────────────────────────────────────────────────
    col_stem    = max(len(str(r.get("stem", "stem"))) for r in rows) + 2
    header = (
        f"{'stem':<{col_stem}}"
        f"{'h2':>4}{'h3':>5}  "
        f"{'strategia':<20}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
        f"  {'status'}"
    )
    sep = "─" * len(header)
    print()
    print(header)
    print(sep)

    # ── Rows ─────────────────────────────────────────────────────────────
    for r, status in zip(rows, statuses):
        if r.get("_missing"):
            print(f"{r['stem']:<{col_stem}}  (report.json non trovato)")
            continue
        if r.get("_invalid"):
            print(f"{r['stem']:<{col_stem}}  (report.json non valido)")
            continue
        stem      = str(r.get("stem", "?"))
        structure = r.get("structure", {})
        anomalie  = r.get("anomalie", {})
        residui   = r.get("residui", {})
        h2       = structure.get("n_h2", 0)
        h3       = structure.get("n_h3", 0)
        strat    = structure.get("strategia_chunking", "?")
        bare     = anomalie.get("bare_headers", 0)
        corte    = anomalie.get("short_sections", 0)
        lunghe   = anomalie.get("long_sections", 0)
        backtick = residui.get("backtick", 0)
        dotlead  = residui.get("dotleader", 0)
        url      = residui.get("url", 0)
        print(
            f"{stem:<{col_stem}}"
            f"{h2:>4}{h3:>5}  "
            f"{strat:<20}"
            f"{bare:>5}{corte:>6}{lunghe:>7}"
            f"{backtick:>9}{dotlead:>8}{url:>4}"
            f"  {status}"
        )
    print(sep)

    totali = len(rows)
    ok   = sum(1 for s in statuses if s == "✅")
    # The warning marker carries a trailing space, hence the prefix test.
    warn = sum(1 for s in statuses if s is not None and s.startswith("⚠"))
    err  = sum(1 for s in statuses if s == "❌")
    print(f"Totale: {totali}  ✅ {ok}  ⚠️  {warn}  ❌ {err}")
    print()
    print("Legenda colonne: bare=header senza titolo  corte=sezioni<150ch  "
          "lunghe=sezioni>1500ch  backtick=accenti residui")
    print()
 if __name__ == "__main__":
    # The project root is the parent of the directory holding this script;
    # any command-line arguments are the stems to validate (none = all).
    validate(sys.argv[1:], Path(__file__).parent.parent)