step-2: add convert_pdf.py (pymupdf4llm, low-memory)
Converts PDFs in sources/ to Markdown using pymupdf4llm (pure C, ~30-50 MB RAM, no ML models). Output: step-2/<stem>/raw.md + clean.md.
This commit is contained in:
@@ -27,3 +27,6 @@ Thumbs.db
|
||||
step-0/*_step0_report.txt
|
||||
step-1/*_step1_report.txt
|
||||
|
||||
# Output step-2 — MD grezzo generato da pymupdf4llm
|
||||
step-2/*/
|
||||
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
# Step 0-1 — Ispezione e verifica PDF
|
||||
pdfplumber==0.11.9
|
||||
|
||||
# Step 2 — Conversione PDF → Markdown
|
||||
pymupdf4llm
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 2 — Conversione PDF → Markdown grezzo
|
||||
|
||||
Usa pymupdf4llm (PyMuPDF puro C, zero modelli ML, ~30-50 MB RAM)
|
||||
per convertire ogni PDF in sources/ e organizza l'output in:
|
||||
step-2/<stem>/raw.md — MD grezzo, non modificare mai
|
||||
step-2/<stem>/clean.md — copia di lavoro per lo step 4
|
||||
|
||||
Uso:
|
||||
python step-2/convert_pdf.py # tutti i PDF in sources/
|
||||
python step-2/convert_pdf.py --pdf sources/doc.pdf # un solo PDF
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pymupdf4llm
|
||||
|
||||
|
||||
def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
    """Convert one PDF to Markdown under ``<project_root>/step-2/<stem>/``.

    Two files are produced: ``raw.md`` (the untouched converter output) and
    ``clean.md`` (a working copy of it for later editing).

    If ``raw.md`` already exists the conversion is skipped; in that case a
    missing ``clean.md`` is recreated from ``raw.md``, but an existing
    ``clean.md`` is never overwritten.

    Args:
        pdf_path: Path of the PDF to convert. Its stem names the output folder.
        project_root: Directory that contains the ``step-2`` folder.

    Returns:
        True if the outputs are in place (freshly converted or already
        present), False if the conversion or the file writes failed.
    """
    stem = pdf_path.stem
    out_dir = project_root / "step-2" / stem
    raw_md = out_dir / "raw.md"
    clean_md = out_dir / "clean.md"

    print(f"\nConversione: {pdf_path.name}")
    print(f" Output: step-2/{stem}/")

    if raw_md.exists():
        print(" ⚠️ raw.md già presente — skip")
        print(f" (elimina {raw_md} per riconvertire)")
        # A previous run may have died between writing raw.md and copying it;
        # restore the working copy, but never clobber one that already exists.
        if not clean_md.exists():
            try:
                shutil.copy2(raw_md, clean_md)
            except OSError as exc:
                print(f" ❌ Impossibile creare clean.md: {exc}")
                return False
            print(" ✅ clean.md creato (copia di lavoro per step 4)")
        return True

    print(" Conversione in corso...")
    # Write to a temporary name first so that an interrupted write can never
    # leave a truncated raw.md that later runs would mistake for a finished one.
    tmp_md = out_dir / "raw.md.tmp"
    try:
        md_text = pymupdf4llm.to_markdown(str(pdf_path))
        out_dir.mkdir(parents=True, exist_ok=True)
        tmp_md.write_text(md_text, encoding="utf-8")
        tmp_md.replace(raw_md)
        shutil.copy2(raw_md, clean_md)
    except Exception as exc:
        # Per-document boundary: one broken PDF must not abort the whole
        # batch, so report the failure and let the caller count it.
        print(f" ❌ Conversione fallita: {exc}")
        try:
            tmp_md.unlink()
        except OSError:
            pass  # nothing to clean up (temp file was never created)
        return False

    size_kb = raw_md.stat().st_size // 1024
    print(f" ✅ raw.md salvato ({size_kb} KB)")
    print(" ✅ clean.md creato (copia di lavoro per step 4)")
    return True
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: convert one PDF or every PDF in ``sources/``.

    With ``--pdf PATH`` only that file is converted; otherwise every file in
    ``<project_root>/sources`` whose extension is ``.pdf`` (case-insensitive)
    is converted in sorted order.

    Returns:
        Process exit code: 0 if every conversion succeeded, 1 otherwise
        (including missing input file, missing folder, or no PDFs found).
    """
    # resolve() first: with a bare relative __file__ (e.g. "convert_pdf.py"),
    # .parent.parent would collapse to "." instead of the real project root.
    project_root = Path(__file__).resolve().parent.parent

    parser = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
    parser.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
    args = parser.parse_args()

    if args.pdf:
        pdf_path = Path(args.pdf)
        # is_file() rather than exists(): a directory is not a convertible PDF.
        if not pdf_path.is_file():
            print(f"Errore: file non trovato — {args.pdf}", file=sys.stderr)
            return 1
        pdfs = [pdf_path]
    else:
        sources_dir = project_root / "sources"
        if not sources_dir.is_dir():
            print(
                f"Errore: cartella sources/ non trovata in {project_root}",
                file=sys.stderr,
            )
            return 1
        # Match the extension case-insensitively so "*.PDF" files are not
        # silently skipped on case-sensitive filesystems.
        pdfs = sorted(
            p
            for p in sources_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        if not pdfs:
            print(f"Errore: nessun PDF trovato in {sources_dir}", file=sys.stderr)
            return 1

    results = [convert_pdf(p, project_root) for p in pdfs]

    ok_count = sum(results)
    total = len(results)
    all_ok = all(results)
    print(f"\n{'✅' if all_ok else '⚠️ '} {ok_count}/{total} PDF convertiti")

    return 0 if all_ok else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user