step-0: add check_pdf.py

Script di verifica idoneità PDF per lo step 0 della pipeline RAG. Legge automaticamente tutti i PDF in sources/, controlla criteri obbligatori e desiderabili, salva il report in step-0/.
2026-04-13 08:03:08 +02:00
parent 42c38c30f7
commit eda04dc464
1 changed files with 229 additions and 0 deletions
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""
+Step 0 — Verifica idoneità PDF
+
+Legge tutti i PDF in sources/ e salva un report per ognuno in step-0/.
+
+Uso:
+    python step-0/check_pdf.py
+
+Output:
+    step-0/<nome_pdf>_step0_report.txt
+"""
+
+import sys
+import statistics
+from datetime import datetime
+from pathlib import Path
+
+
+def _is_encryption_error(exc: BaseException) -> bool:
+    """Tell whether *exc*, or an exception it wraps, points at an encrypted PDF.
+
+    Class names are checked along with messages: pdfminer.six can raise
+    ``PDFPasswordIncorrect`` with an empty message, and pdfplumber may wrap it.
+    """
+    pending, seen = [exc], set()
+    while pending:
+        current = pending.pop()
+        if id(current) in seen:  # exception chains can loop back on themselves
+            continue
+        seen.add(id(current))
+        text = f"{type(current).__name__} {current}".lower()
+        if any(word in text for word in ("password", "encrypt", "decrypt")):
+            return True
+        pending.extend(a for a in current.args if isinstance(a, BaseException))
+        pending.extend(e for e in (current.__cause__, current.__context__) if e is not None)
+    return False
+
+
+def check_pdf(pdf_path: str, save: bool = True) -> None:
+    """Run the step-0 suitability checks on one PDF and print (optionally save) the report.
+
+    Args:
+        pdf_path: Path of the PDF to inspect.
+        save: When true, the report is also handed to ``_maybe_save``.
+
+    Exits with status 1 if pdfplumber is missing, the file is absent or not ``.pdf``.
+    """
+    try:
+        import pdfplumber
+    except ImportError:
+        print("Errore: pdfplumber non è installato.")
+        print("       pip install pdfplumber")
+        sys.exit(1)
+
+    path = Path(pdf_path)
+    if not path.exists():
+        print(f"Errore: file non trovato — {pdf_path}")
+        sys.exit(1)
+    if path.suffix.lower() != ".pdf":
+        print(f"Errore: il file non è un PDF — {pdf_path}")
+        sys.exit(1)
+
+    lines = []  # report lines, in the order they are printed
+    results = []  # (label, status, message)
+
+    def out(text=""):
+        lines.append(text)
+        print(text)
+
+    out("Step 0 — Verifica idoneità PDF")
+    out(f"File:    {path.name}")
+    out(f"Data:    {datetime.now().strftime('%Y-%m-%d %H:%M')}")
+    out("=" * 50)
+
+    # Criterion 1 — not password protected (a failure here ends the check early)
+    try:
+        with pdfplumber.open(path) as pdf:
+            n_pages = len(pdf.pages)
+        results.append(("Non protetto da password", "PASS", f"{n_pages} pagine"))
+    except Exception as e:
+        if _is_encryption_error(e):
+            results.append(("Non protetto da password", "FAIL",
+                             "Il PDF è cifrato — non può essere elaborato"))
+        else:
+            results.append(("Non protetto da password", "FAIL",
+                             f"Impossibile aprire il file: {e}"))
+        _render_results(results, out)
+        _maybe_save(lines, path, save)
+        return
+
+    # Page scan — one pass collecting per-page sizes and every non-blank line
+    char_counts = []  # stripped character count of each page
+    text_lines = []  # stripped, non-empty lines of the whole document
+    with pdfplumber.open(path) as pdf:
+        for page in pdf.pages:
+            text = page.extract_text() or ""
+            char_counts.append(len(text.strip()))
+            text_lines.extend(s for s in map(str.strip, text.splitlines()) if s)
+
+    total_pages = len(char_counts)
+    empty_pages = char_counts.count(0)
+    line_lengths = [len(s) for s in text_lines]
+    pages_with_text = sum(1 for c in char_counts if c > 50)
+    text_coverage = pages_with_text / total_pages if total_pages > 0 else 0
+
+    # Criterion 2 — extractable text (share of pages with more than 50 characters)
+    if text_coverage >= 0.7:
+        results.append(("Testo estraibile", "PASS",
+                         f"{pages_with_text}/{total_pages} pagine con testo ({text_coverage:.0%})"))
+    elif text_coverage >= 0.4:
+        results.append(("Testo estraibile", "WARN",
+                         f"Solo {pages_with_text}/{total_pages} pagine con testo — revisione estesa necessaria"))
+    else:
+        results.append(("Testo estraibile", "FAIL",
+                         f"Solo {pages_with_text}/{total_pages} pagine con testo — probabilmente scansionato"))
+
+    # Criterion 3 — digitally generated: average size of the non-empty pages
+    pages_text_only = [c for c in char_counts if c > 0]
+    avg_chars = statistics.mean(pages_text_only) if pages_text_only else 0
+    if avg_chars >= 300:
+        results.append(("Generato digitalmente (non scansionato)", "PASS",
+                         f"Media {avg_chars:.0f} char/pagina"))
+    elif avg_chars >= 100:
+        results.append(("Generato digitalmente (non scansionato)", "WARN",
+                         f"Media bassa: {avg_chars:.0f} char/pagina — alcune pagine potrebbero essere immagini"))
+    else:
+        results.append(("Generato digitalmente (non scansionato)", "FAIL",
+                         f"Media molto bassa: {avg_chars:.0f} char/pagina — il PDF sembra scansionato"))
+
+    # Criterion 4 — empty pages (never worse than a warning)
+    if empty_pages == 0:
+        results.append(("Pagine vuote", "PASS", "Nessuna pagina vuota"))
+    elif empty_pages <= total_pages * 0.05:
+        results.append(("Pagine vuote", "WARN",
+                         f"{empty_pages} pagine vuote (≤ 5%) — probabilmente copertine o separatori"))
+    else:
+        results.append(("Pagine vuote", "WARN",
+                         f"{empty_pages} pagine vuote ({empty_pages/total_pages:.0%}) — controllare"))
+
+    # Desirable — single-column layout, guessed from the share of lines far shorter than the median
+    if line_lengths:
+        median_len = statistics.median(line_lengths)
+        short_lines = sum(1 for length in line_lengths if length < median_len * 0.4)
+        short_ratio = short_lines / len(line_lengths)
+        if short_ratio < 0.15:
+            results.append(("Layout a colonne singola (desiderabile)", "PASS",
+                             f"Righe corte: {short_ratio:.0%} — struttura lineare"))
+        elif short_ratio < 0.35:
+            results.append(("Layout a colonne singola (desiderabile)", "WARN",
+                             f"Righe corte: {short_ratio:.0%} — possibile layout a colonne parziale"))
+        else:
+            results.append(("Layout a colonne singola (desiderabile)", "WARN",
+                             f"Righe corte: {short_ratio:.0%} — probabile layout a colonne multiple"))
+    else:
+        results.append(("Layout a colonne singola (desiderabile)", "WARN",
+                         "Impossibile analizzare (nessuna riga estratta)"))
+
+    # Desirable — logical structure: short capitalised lines not ending in "." or "," count as headings
+    candidate_headings = [
+        s for s in text_lines
+        if 3 <= len(s) <= 80 and s[0].isupper()
+        and not s.endswith((".", ",")) and len(s.split()) <= 10
+    ]
+    heading_density = len(candidate_headings) / total_pages if total_pages > 0 else 0
+    if heading_density >= 1.0:
+        results.append(("Struttura logica riconoscibile (desiderabile)", "PASS",
+                         f"~{len(candidate_headings)} possibili titoli rilevati ({heading_density:.1f}/pagina)"))
+    elif heading_density >= 0.3:
+        results.append(("Struttura logica riconoscibile (desiderabile)", "WARN",
+                         f"~{len(candidate_headings)} possibili titoli ({heading_density:.1f}/pagina) — struttura parziale"))
+    else:
+        results.append(("Struttura logica riconoscibile (desiderabile)", "WARN",
+                         "Pochi titoli rilevati — testo narrativo o struttura non standard"))
+
+    _render_results(results, out)
+    _maybe_save(lines, path, save)
+
+
+def _render_results(results: list, out) -> None:
+    """Emit every (label, status, message) check through *out*, then the overall verdict."""
+    icons = {"PASS": "✅", "WARN": "⚠️ ", "FAIL": "❌"}
+    out()
+    for label, status, message in results:
+        out(f"  {icons.get(status, '  ')} {label}")
+        out(f"       {message}")
+    out()
+    # A single FAIL outranks any number of WARNs when picking the verdict.
+    statuses = {entry[1] for entry in results}
+    if "FAIL" in statuses:
+        headline = "ESITO: ❌ PDF NON IDONEO"
+        advice = "       Criteri obbligatori non soddisfatti — scegli un PDF diverso."
+    elif "WARN" in statuses:
+        headline = "ESITO: ⚠️  PDF ACCETTABILE CON CAUTELA"
+        advice = "       Procedi, ma aspettati più lavoro nella revisione manuale (step 4)."
+    else:
+        headline = "ESITO: ✅ PDF IDONEO"
+        advice = "       Tutti i criteri soddisfatti — procedi con lo step 1."
+    out(headline)
+    out(advice)
+    out()
+
+
+def _maybe_save(lines: list, pdf_path: Path, save: bool) -> None:
+    """Write the report beside this script as ``<pdf stem>_step0_report.txt`` when *save* is true."""
+    if save:
+        report_path = Path(__file__).parent / f"{pdf_path.stem}_step0_report.txt"
+        # Lines are joined with "\n" and no trailing newline is appended.
+        report_path.write_text("\n".join(lines), encoding="utf-8")
+        print(f"Report salvato in: {report_path}")
+
+
+if __name__ == "__main__":
+    project_root = Path(__file__).parent.parent
+    sources_dir = project_root / "sources"
+    if not sources_dir.is_dir():
+        print(f"Errore: cartella sources/ non trovata in {project_root}")
+        sys.exit(1)
+    # Match the suffix case-insensitively, as check_pdf() does: glob("*.pdf") skips
+    # "REPORT.PDF" on case-sensitive filesystems and also returns directories.
+    pdfs = sorted(p for p in sources_dir.iterdir() if p.is_file() and p.suffix.lower() == ".pdf")
+    if not pdfs:
+        print(f"Errore: nessun PDF trovato in {sources_dir}")
+        sys.exit(1)
+
+    for pdf in pdfs:
+        check_pdf(str(pdf), save=True)
+        if len(pdfs) > 1:
+            print("-" * 50)