346e336f1a
Converts PDFs in sources/ to Markdown using pymupdf4llm (pure C, ~30-50 MB RAM, no ML models). Output: step-2/<stem>/raw.md + clean.md.
81 lines
2.4 KiB
Python
81 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
Step 2 — Raw PDF → Markdown conversion

Uses pymupdf4llm (pure-C PyMuPDF, no ML models, ~30-50 MB RAM)
to convert every PDF in sources/ and organizes the output as:
    step-2/<stem>/raw.md   — raw Markdown, never edit it
    step-2/<stem>/clean.md — working copy for step 4

Usage:
    python step-2/convert_pdf.py                        # all PDFs in sources/
    python step-2/convert_pdf.py --pdf sources/doc.pdf  # a single PDF
"""
|
|
|
|
import argparse
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pymupdf4llm
|
|
|
|
|
|
def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
    """Convert one PDF to Markdown under ``<project_root>/step-2/<stem>/``.

    Two files are produced: ``raw.md`` (the untouched converter output) and
    ``clean.md`` (a copy of it, meant to be edited in a later step).

    Args:
        pdf_path: Path of the PDF to convert; its stem names the output folder.
        project_root: Directory that contains the ``step-2`` folder.

    Returns:
        True when ``raw.md`` already exists (the PDF is skipped) or when the
        conversion succeeded; False when the conversion or the file writes
        failed, so a batch caller can carry on and report the failure.
    """
    stem = pdf_path.stem
    out_dir = project_root / "step-2" / stem
    raw_md = out_dir / "raw.md"
    clean_md = out_dir / "clean.md"

    print(f"\nConversione: {pdf_path.name}")
    print(f" Output: step-2/{stem}/")

    # raw.md doubles as the "already converted" marker: never overwrite it.
    if raw_md.exists():
        print(" ⚠️ raw.md già presente — skip")
        print(f" (elimina {raw_md} per riconvertire)")
        return True

    print(" Conversione in corso...")
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        md_text = pymupdf4llm.to_markdown(str(pdf_path))

        # Write to a temporary file first so an interrupted run cannot leave
        # a truncated raw.md that later runs would skip as "already done".
        tmp_md = out_dir / "raw.md.tmp"
        tmp_md.write_text(md_text, encoding="utf-8")
        shutil.copy2(tmp_md, clean_md)
        # raw.md is put in place last: once it exists, clean.md exists too.
        tmp_md.replace(raw_md)
    except Exception as exc:
        # Per-file boundary: one unreadable PDF must not abort the batch.
        print(f" ❌ Conversione fallita: {type(exc).__name__}: {exc}")
        return False

    size_kb = raw_md.stat().st_size // 1024
    print(f" ✅ raw.md salvato ({size_kb} KB)")
    print(" ✅ clean.md creato (copia di lavoro per step 4)")
    return True
|
|
|
|
|
|
if __name__ == "__main__":
    # The project root is the parent of the directory holding this script.
    project_root = Path(__file__).parent.parent

    cli = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
    cli.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
    options = cli.parse_args()

    if not options.pdf:
        # Batch mode: every *.pdf directly inside sources/, in sorted order.
        sources_dir = project_root / "sources"
        if not sources_dir.exists():
            print(f"Errore: cartella sources/ non trovata in {project_root}")
            sys.exit(1)
        targets = sorted(sources_dir.glob("*.pdf"))
        if len(targets) == 0:
            print(f"Errore: nessun PDF trovato in {sources_dir}")
            sys.exit(1)
    else:
        # Single-file mode: the path given on the command line must exist.
        single_pdf = Path(options.pdf)
        if not single_pdf.exists():
            print(f"Errore: file non trovato — {options.pdf}")
            sys.exit(1)
        targets = [single_pdf]

    outcomes = []
    for target in targets:
        outcomes.append(convert_pdf(target, project_root))

    # Summary line, then an exit status that is non-zero if any outcome is falsy.
    everything_ok = all(outcomes)
    converted = sum(outcomes)
    attempted = len(outcomes)
    print(f"\n{'✅' if everything_ok else '⚠️ '} {converted}/{attempted} PDF convertiti")

    sys.exit(0 if everything_ok else 1)
|