From 346e336f1aedcb7ef4df77f91b9a2746ce4818a0 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Mon, 13 Apr 2026 10:00:42 +0200
Subject: [PATCH] step-2: add convert_pdf.py (pymupdf4llm, low-memory)

Converts PDFs in sources/ to Markdown using pymupdf4llm (pure C,
~30-50 MB RAM, no ML models). Output: step-2/<stem>/raw.md + clean.md.
---
 .gitignore            |  3 ++
 requirements.txt      |  3 ++
 step-2/convert_pdf.py | 80 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+)
 create mode 100644 step-2/convert_pdf.py
diff --git a/.gitignore b/.gitignore
index 369f1d6..87aef95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ Thumbs.db
 step-0/*_step0_report.txt
 step-1/*_step1_report.txt
 
+# Output step-2 — MD grezzo generato da pymupdf4llm
+step-2/*/
+
diff --git a/requirements.txt b/requirements.txt
index e5577a4..a5686af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 # Step 0-1 — Ispezione e verifica PDF
 pdfplumber==0.11.9
+
+# Step 2 — Conversione PDF → Markdown
+pymupdf4llm
diff --git a/step-2/convert_pdf.py b/step-2/convert_pdf.py
new file mode 100644
index 0000000..efc6376
--- /dev/null
+++ b/step-2/convert_pdf.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Step 2 — Conversione PDF → Markdown grezzo
+
+Usa pymupdf4llm (PyMuPDF puro C, zero modelli ML, ~30-50 MB RAM)
+per convertire ogni PDF in sources/ e organizza l'output in:
+  step-2/<stem>/raw.md    — MD grezzo, non modificare mai
+  step-2/<stem>/clean.md  — copia di lavoro per lo step 4
+
+Uso:
+    python step-2/convert_pdf.py                        # tutti i PDF in sources/
+    python step-2/convert_pdf.py --pdf sources/doc.pdf  # un solo PDF
+"""
+
+import argparse
+import shutil
+import sys
+from pathlib import Path
+
+import pymupdf4llm
+
+
+def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
+    """Convert one PDF into step-2/<stem>/raw.md plus a clean.md working copy.
+
+    Returns True if converted or already present (skipped), False on any failure.
+    """
+    out_dir = project_root / "step-2" / pdf_path.stem
+    raw_md, clean_md = out_dir / "raw.md", out_dir / "clean.md"
+    print(f"\nConversione: {pdf_path.name}")
+    print(f"  Output:    step-2/{pdf_path.stem}/")
+    if raw_md.exists():
+        print("  ⚠️  raw.md già presente — skip")
+        print(f"       (elimina {raw_md} per riconvertire)")
+        return True
+    print("  Conversione in corso...")
+    try:
+        md_text = pymupdf4llm.to_markdown(str(pdf_path))
+        out_dir.mkdir(parents=True, exist_ok=True)
+        raw_md.write_text(md_text, encoding="utf-8")
+        shutil.copy2(raw_md, clean_md)
+    except Exception as exc:  # batch boundary: one bad PDF must not abort the run
+        raw_md.unlink(missing_ok=True)  # else the next run would skip a broken output
+        print(f"  ❌ Conversione fallita: {exc}")
+        return False
+    print(f"  ✅ raw.md salvato ({raw_md.stat().st_size // 1024} KB)")
+    print("  ✅ clean.md creato (copia di lavoro per step 4)")
+    return True
+
+
+if __name__ == "__main__":
+    # Resolve first: a relative __file__ (e.g. "convert_pdf.py" when launched from
+    # inside step-2/ on Python < 3.9) would make .parent.parent point at the wrong dir.
+    project_root = Path(__file__).resolve().parent.parent
+
+    parser = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
+    parser.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
+    args = parser.parse_args()
+
+    if args.pdf:
+        pdf_path = Path(args.pdf)
+        if not pdf_path.is_file():  # also rejects a directory passed by mistake
+            print(f"Errore: file non trovato — {args.pdf}", file=sys.stderr)
+            sys.exit(1)
+        pdfs = [pdf_path]
+    else:
+        sources_dir = project_root / "sources"
+        if not sources_dir.is_dir():
+            print(f"Errore: cartella sources/ non trovata in {project_root}", file=sys.stderr)
+            sys.exit(1)
+        pdfs = sorted(sources_dir.glob("*.pdf"))
+        if not pdfs:
+            print(f"Errore: nessun PDF trovato in {sources_dir}", file=sys.stderr)
+            sys.exit(1)
+
+    # convert_pdf returns one bool per PDF; the exit code is non-zero if any is False.
+    results = [convert_pdf(p, project_root) for p in pdfs]
+    ok_count, total = sum(results), len(results)
+    print(f"\n{'✅' if all(results) else '⚠️ '} {ok_count}/{total} PDF convertiti")
+    sys.exit(0 if all(results) else 1)