From 346e336f1aedcb7ef4df77f91b9a2746ce4818a0 Mon Sep 17 00:00:00 2001
From: Davide Grilli
Date: Mon, 13 Apr 2026 10:00:42 +0200
Subject: [PATCH] step-2: add convert_pdf.py (pymupdf4llm, low-memory)

Converts PDFs in sources/ to Markdown using pymupdf4llm (pure C,
~30-50 MB RAM, no ML models).
Output: step-2/<stem>/raw.md + clean.md.
---
 .gitignore            |   3 ++
 requirements.txt      |   3 ++
 step-2/convert_pdf.py | 105 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 step-2/convert_pdf.py

diff --git a/.gitignore b/.gitignore
index 369f1d6..87aef95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ Thumbs.db
 step-0/*_step0_report.txt
 step-1/*_step1_report.txt
 
+# Step-2 output — raw Markdown generated by pymupdf4llm
+step-2/*/
+
diff --git a/requirements.txt b/requirements.txt
index e5577a4..a5686af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 # Step 0-1 — Ispezione e verifica PDF
 pdfplumber==0.11.9
+
+# Step 2 — Conversione PDF → Markdown
+pymupdf4llm
diff --git a/step-2/convert_pdf.py b/step-2/convert_pdf.py
new file mode 100644
index 0000000..efc6376
--- /dev/null
+++ b/step-2/convert_pdf.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Step 2 — Raw PDF → Markdown conversion.
+
+Uses pymupdf4llm (PyMuPDF, plain C, no ML models, ~30-50 MB RAM) to convert
+every PDF in sources/ and lays the output out as:
+    step-2/<stem>/raw.md    — raw Markdown, never edit it
+    step-2/<stem>/clean.md  — working copy for step 4
+
+Usage:
+    python step-2/convert_pdf.py                        # every PDF in sources/
+    python step-2/convert_pdf.py --pdf sources/doc.pdf  # a single PDF
+"""
+
+import argparse
+import shutil
+import sys
+from pathlib import Path
+
+import pymupdf4llm
+
+
+def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
+    """Convert one PDF to step-2/<stem>/raw.md plus a clean.md working copy.
+
+    Args:
+        pdf_path: PDF file to convert.
+        project_root: Repository root; output goes under its step-2/ folder.
+
+    Returns:
+        True when raw.md and clean.md were written, or when raw.md already
+        existed and the conversion was skipped; False when it failed.
+    """
+    stem = pdf_path.stem
+    out_dir = project_root / "step-2" / stem
+    raw_md = out_dir / "raw.md"
+    clean_md = out_dir / "clean.md"
+
+    print(f"\nConversione: {pdf_path.name}")
+    print(f" Output: step-2/{stem}/")
+
+    if raw_md.exists():
+        print(" ⚠️ raw.md già presente — skip")
+        print(f" (elimina {raw_md} per riconvertire)")
+        return True
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    print(" Conversione in corso...")
+    tmp_md = raw_md.with_name(raw_md.name + ".tmp")
+    try:
+        md_text = pymupdf4llm.to_markdown(str(pdf_path))
+        # raw.md is published last, via rename: an interrupted run must never
+        # leave a truncated raw.md that later runs would skip as done.
+        tmp_md.write_text(md_text, encoding="utf-8")
+        shutil.copy2(tmp_md, clean_md)
+        tmp_md.replace(raw_md)
+    except Exception as exc:  # one bad PDF must not abort the whole batch
+        tmp_md.unlink(missing_ok=True)
+        print(f" ❌ Conversione fallita: {exc}")
+        return False
+
+    size_kb = raw_md.stat().st_size // 1024
+    print(f" ✅ raw.md salvato ({size_kb} KB)")
+    print(" ✅ clean.md creato (copia di lavoro per step 4)")
+    return True
+
+
+def main() -> int:
+    """Parse the CLI, convert the selected PDFs and return the exit code."""
+    # resolve() keeps the root correct when __file__ is a bare relative name
+    # (e.g. the script is launched from inside step-2/).
+    project_root = Path(__file__).resolve().parent.parent
+
+    parser = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
+    parser.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
+    args = parser.parse_args()
+
+    if args.pdf:
+        pdf_path = Path(args.pdf)
+        if not pdf_path.is_file():
+            print(f"Errore: file non trovato — {args.pdf}")
+            return 1
+        pdfs = [pdf_path]
+    else:
+        sources_dir = project_root / "sources"
+        if not sources_dir.is_dir():
+            print(f"Errore: cartella sources/ non trovata in {project_root}")
+            return 1
+        pdfs = sorted(sources_dir.glob("*.pdf"))
+        if not pdfs:
+            print(f"Errore: nessun PDF trovato in {sources_dir}")
+            return 1
+
+    results = [convert_pdf(p, project_root) for p in pdfs]
+
+    ok_count = sum(results)
+    all_ok = all(results)
+    print(f"\n{'✅' if all_ok else '⚠️ '} {ok_count}/{len(results)} PDF convertiti")
+
+    return 0 if all_ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())