refactor(pdf-to-md): rimuovi riferimenti agli step interni da conversione/

pipeline.py è una pipeline autonoma e non deve menzionare la suddivisione interna del progetto (step-0 … step-4). Aggiornati di conseguenza la docstring del modulo, i commenti di sezione, i messaggi di output e la descrizione e l'epilog di argparse.
2026-04-16 15:30:59 +02:00
parent b7994100e7
commit 2545d834a9
1 changed file with 23 additions and 29 deletions
@@ -1,22 +1,23 @@
 #!/usr/bin/env python3
 """
-conversion/pipeline.py — PDF → clean Markdown (pipeline automatica)
+conversione/pipeline.py — PDF → clean Markdown (pipeline automatica)

-Sostituisce step-0 + step-1 + step-2 + step-3 + step-4 in un solo comando,
-senza operazioni manuali.
+Converte un PDF grezzo in Markdown strutturato e pulito, pronto per la
+suddivisione in chunk. Gestisce validazione, estrazione testo, pulizia
+strutturale e rilevamento automatico della struttura del documento.

 Usa opendataloader-pdf (algoritmo XY-Cut++ per ordine di lettura corretto,
-testo fluente, struttura preservata) al posto di pymupdf4llm.
+testo fluente, struttura preservata).

-Output (compatibile con step-5+):
-  conversion/<stem>/raw.md                — output grezzo opendataloader (immutabile)
-  conversion/<stem>/clean.md              — MD pulito e strutturato
-  conversion/<stem>/structure_profile.json
+Output per ciascuno stem:
+  conversione/<stem>/raw.md                — Markdown grezzo (immutabile)
+  conversione/<stem>/clean.md              — Markdown pulito e strutturato
+  conversione/<stem>/structure_profile.json

 Uso:
-    python conversion/pipeline.py --stem <nome>
-    python conversion/pipeline.py                       # tutti i PDF in sources/
-    python conversion/pipeline.py --stem <nome> --force # forza riesecuzione
+    python conversione/pipeline.py --stem <nome>
+    python conversione/pipeline.py                       # tutti i PDF in sources/
+    python conversione/pipeline.py --stem <nome> --force # forza riesecuzione

 Prerequisiti:
    pip install opendataloader-pdf
@@ -55,7 +56,7 @@ def _check_deps() -> None:
        sys.exit(1)


-# ─── [1] Validazione PDF (step-0 + step-1) ────────────────────────────────────
+# ─── [1] Validazione PDF ─────────────────────────────────────────────────────

 def check_pdf(pdf_path: Path) -> tuple[bool, str]:
    """
@@ -93,7 +94,7 @@ def check_pdf(pdf_path: Path) -> tuple[bool, str]:
        return False, f"Impossibile aprire: {e}"


-# ─── [2] Conversione PDF → Markdown (step-2) ─────────────────────────────────
+# ─── [2] Conversione PDF → Markdown ─────────────────────────────────────────

 def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
    """
@@ -131,9 +132,7 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
    return md_file


-# ─── [3] Pulizia strutturale (step-4 / revise.py) ────────────────────────────
-#
-# Logica identica a step-4/revise.py — mantenuta sincronizzata.
+# ─── [3] Pulizia strutturale ─────────────────────────────────────────────────

 _TOC_KEYWORDS = frozenset([
    "indice", "index", "contents", "table of contents",
@@ -473,9 +472,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
    return text, stats


-# ─── [4] Rilevamento struttura (step-3 / detect_structure.py) ────────────────
-#
-# Logica identica a step-3/detect_structure.py — mantenuta sincronizzata.
+# ─── [4] Rilevamento struttura ───────────────────────────────────────────────

 _IT_WORDS = frozenset([
    "il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
@@ -580,7 +577,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    print(f"{'─' * 52}")

    if clean_out.exists() and not force:
-        print(f"  ⚠️  conversion/{stem}/clean.md già presente — skip")
+        print(f"  ⚠️  conversione/{stem}/clean.md già presente — skip")
        print(f"      (usa --force per rieseguire)")
        return True

@@ -638,10 +635,10 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
        print(f"     ⚠️  {w}")

    print(f"\n  Output:")
-    print(f"    conversion/{stem}/raw.md               (immutabile)")
-    print(f"    conversion/{stem}/clean.md")
-    print(f"    conversion/{stem}/structure_profile.json")
-    print(f"\n  Prossimo passo: python step-5/chunker.py --stem {stem}")
+    print(f"    conversione/{stem}/raw.md               (immutabile)")
+    print(f"    conversione/{stem}/clean.md")
+    print(f"    conversione/{stem}/structure_profile.json")
+    print(f"\n  clean.md pronto per la suddivisione in chunk.")
    return True


@@ -651,11 +648,8 @@ if __name__ == "__main__":
    project_root = Path(__file__).parent.parent

    parser = argparse.ArgumentParser(
-        description="Pipeline PDF → clean Markdown (sostituisce step 0+1+2+3+4)",
-        epilog=(
-            "Output compatibile con step-5+.\n"
-            "Prerequisiti: pip install opendataloader-pdf  +  Java 11+ sul PATH"
-        ),
+        description="Pipeline PDF → clean Markdown strutturato, pronto per chunking",
+        epilog="Prerequisiti: pip install opendataloader-pdf  +  Java 11+ sul PATH",
    )
    parser.add_argument(
        "--stem",