#!/usr/bin/env python3
"""Step 2 — Convert PDF to raw Markdown.

Uses pymupdf4llm (pure-C PyMuPDF, no ML models, ~30-50 MB RAM) to convert
every PDF in sources/ and lays the output out as:

    step-2/<pdf-stem>/raw.md    — raw Markdown, never edit by hand
    step-2/<pdf-stem>/clean.md  — working copy for step 4

Usage:
    python step-2/convert_pdf.py                        # every PDF in sources/
    python step-2/convert_pdf.py --pdf sources/doc.pdf  # a single PDF
"""

import argparse
import shutil
import sys
from pathlib import Path

import pymupdf4llm


def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
    """Convert one PDF into ``step-2/<stem>/raw.md`` plus a ``clean.md`` copy.

    If ``raw.md`` already exists the PDF is skipped and treated as a success,
    so re-running the script never overwrites an existing conversion.

    Args:
        pdf_path: Path of the PDF to convert.
        project_root: Directory that contains the ``step-2`` output folder.

    Returns:
        True when the output is in place (freshly converted or skipped),
        False when the conversion or the file writes failed. Failures are
        reported on stdout instead of being raised, so one broken PDF does
        not abort a batch run.
    """
    stem = pdf_path.stem
    out_dir = project_root / "step-2" / stem
    raw_md = out_dir / "raw.md"
    clean_md = out_dir / "clean.md"
    tmp_md = out_dir / "raw.md.tmp"

    print(f"\nConversione: {pdf_path.name}")
    print(f" Output: step-2/{stem}/")

    if raw_md.exists():
        print(" ⚠️ raw.md già presente — skip")
        print(f" (elimina {raw_md} per riconvertire)")
        return True

    print(" Conversione in corso...")
    try:
        md_text = pymupdf4llm.to_markdown(str(pdf_path))
        out_dir.mkdir(parents=True, exist_ok=True)
        # Write to a temporary file and rename it last: raw.md doubles as the
        # "already converted" marker, so it must only appear once both output
        # files are complete. A partial raw.md would be skipped forever.
        tmp_md.write_text(md_text, encoding="utf-8")
        shutil.copy2(tmp_md, clean_md)
        tmp_md.replace(raw_md)
        size_kb = raw_md.stat().st_size // 1024
    except Exception as exc:
        # Batch boundary: the converter and the filesystem can raise many
        # unrelated exception types; report the failure and let the caller
        # carry on with the remaining PDFs.
        print(f" ❌ Conversione fallita — {type(exc).__name__}: {exc}")
        return False
    finally:
        # Only present if something failed (or was interrupted) before the
        # final rename.
        if tmp_md.exists():
            tmp_md.unlink()

    print(f" ✅ raw.md salvato ({size_kb} KB)")
    print(" ✅ clean.md creato (copia di lavoro per step 4)")
    return True


def main(argv=None) -> int:
    """Parse the command line, convert the selected PDFs, return an exit code.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 when every PDF was converted or skipped, 1 when an input could not
        be found or at least one conversion failed.
    """
    # resolve() first: a relative __file__ such as "convert_pdf.py" would
    # otherwise make .parent.parent collapse to "." (the script's own folder).
    project_root = Path(__file__).resolve().parent.parent

    parser = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
    parser.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
    args = parser.parse_args(argv)

    if args.pdf:
        pdf_path = Path(args.pdf)
        # is_file() rather than exists(): a directory is not a valid input.
        if not pdf_path.is_file():
            print(f"Errore: file non trovato — {args.pdf}")
            return 1
        pdfs = [pdf_path]
    else:
        sources_dir = project_root / "sources"
        if not sources_dir.exists():
            print(f"Errore: cartella sources/ non trovata in {project_root}")
            return 1
        pdfs = sorted(sources_dir.glob("*.pdf"))
        if not pdfs:
            print(f"Errore: nessun PDF trovato in {sources_dir}")
            return 1

    results = [convert_pdf(p, project_root) for p in pdfs]
    ok_count = sum(results)
    total = len(results)
    print(f"\n{'✅' if all(results) else '⚠️ '} {ok_count}/{total} PDF convertiti")
    return 0 if all(results) else 1


if __name__ == "__main__":
    sys.exit(main())