"""Orchestrazione della pipeline PDF → Markdown a 9 stadi.""" import json import sys import threading import time from pathlib import Path from .extract import validate_pdf, extract_metadata from .stage1_metadata import extract_raw_data_with_pdfplumber_fallback as extract_raw_data from .stage2_layout import analyze_layout from .stage3_font import build_font_profile from .stage4_headers import classify_blocks from .stage5_hierarchy import infer_hierarchy from .stage6_tree import build_tree from .stage7_markdown import serialize_tree from .stage8_normalize import normalize_hierarchy from .stage9_validate import validate_markdown from .structure import analyze from .report import build_report from .validator import _score, _grade _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"} _SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏" def _build_frontmatter(meta: dict) -> str: lines = ["---", f"source: {meta['source']}"] if meta.get("title"): lines.append(f'title: "{meta["title"]}"') if meta.get("author"): lines.append(f'author: "{meta["author"]}"') if meta.get("year"): lines.append(f"year: {meta['year']}") if meta.get("pages"): lines.append(f"pages: {meta['pages']}") lines += ["---", ""] return "\n".join(lines) + "\n" class _Spinner: def __init__(self, prefix: str): self._prefix = prefix self._stop = threading.Event() self._thread = threading.Thread(target=self._run, daemon=True) self._t0 = 0.0 def __enter__(self): self._t0 = time.perf_counter() self._thread.start() return self def __exit__(self, *_): self._stop.set() self._thread.join() sys.stdout.write("\r" + " " * 72 + "\r") sys.stdout.flush() def _run(self): i = 0 while not self._stop.wait(0.1): elapsed = time.perf_counter() - self._t0 frame = _SPIN_FRAMES[i % len(_SPIN_FRAMES)] sys.stdout.write(f"\r {frame} {self._prefix} {elapsed:.0f}s") sys.stdout.flush() i += 1 def run(stem: str, project_root: Path, force: bool) -> bool: pdf_path = project_root / "sources" / f"{stem}.pdf" out_dir = project_root / "conversione" / stem raw_out = out_dir / "raw.md" clean_out = out_dir / "clean.md" print(f"\n{'─' * 52}") print(f" {stem}") print(f"{'─' * 52}") if clean_out.exists() and not force: print(f" ⚠️ conversione/{stem}/clean.md già presente — skip") print(f" (usa --force per rieseguire)") return True # ── [1] Validazione PDF ─────────────────────────────────────────────────── print(" [1/9] Validazione PDF...") pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0 print(f" File: {pdf_path.name} ({pdf_mb:.1f} MB)") ok, msg = validate_pdf(pdf_path) if not ok: print(f" ✗ {msg}") return False print(f" ✅ {msg}") meta = extract_metadata(pdf_path) meta["source"] = pdf_path.name if meta.get("title"): print(f" Titolo: {meta['title']}") if meta.get("author"): print(f" Autore: {meta['author']}") # ── [2] Stage 1: estrazione span ────────────────────────────────────────── print(" [2/9] Stage 1: Estrazione span PyMuPDF...") with _Spinner("Lettura PDF con PyMuPDF..."): try: raw_blocks, doc_meta = extract_raw_data(pdf_path) except Exception as e: print(f" ✗ Estrazione fallita: {e}") return False print(f" ✅ {len(raw_blocks)} span estratti da {doc_meta['page_count']} pagine") toc_entries = len(doc_meta.get("toc", [])) if toc_entries: print(f" TOC: {toc_entries} voci") # ── [3] Stage 2: layout ─────────────────────────────────────────────────── print(" [3/9] Stage 2: Analisi layout e reading order...") with _Spinner("Analisi layout..."): blocks = analyze_layout(raw_blocks, doc_meta) print(f" ✅ {len(blocks)} blocchi dopo layout analysis") # ── [4] Stage 3: font analysis ──────────────────────────────────────────── print(" [4/9] Stage 3: Font analysis...") profile = build_font_profile(blocks) print(f" ✅ Body size: {profile.body_size}pt " f"Header sizes: {profile.header_sizes}") # ── [5] Stage 4: header detection ───────────────────────────────────────── print(" [5/9] Stage 4: Header detection...") blocks = classify_blocks(blocks, profile) n_candidates = sum(1 for b in blocks if b.block_type == "header_candidate") print(f" ✅ {n_candidates} header candidate rilevati") # ── [6] Stage 5: hierarchy inference ───────────────────────────────────── print(" [6/9] Stage 5: Hierarchy inference...") blocks = infer_hierarchy(blocks, profile, doc_meta.get("toc", [])) from collections import Counter level_dist = Counter(b.level for b in blocks if b.block_type == "header_candidate") print(f" ✅ H1={level_dist.get(1,0)} H2={level_dist.get(2,0)} H3={level_dist.get(3,0)}") # ── [7] Stage 6: document tree ──────────────────────────────────────────── print(" [7/9] Stage 6: Document tree reconstruction...") tree = build_tree(blocks) print(f" ✅ {len(tree)} sezioni radice") # ── [8] Stage 7: markdown generation ───────────────────────────────────── print(" [8/9] Stage 7: Markdown generation...") with _Spinner("Serializzazione albero..."): raw_md = serialize_tree(tree, meta, pdf_path=pdf_path) size_kb = len(raw_md.encode()) // 1024 n_lines = raw_md.count("\n") print(f" ✅ raw.md: {size_kb} KB, {n_lines} righe") # Scrittura raw.md (IMMUTABILE) try: out_dir.mkdir(parents=True, exist_ok=True) if not raw_out.exists() or force: raw_out.write_text(raw_md, encoding="utf-8") except PermissionError as e: print(f" ✗ Permesso negato durante la scrittura: {e}") return False # ── [9] Stage 8+9: normalizzazione + validazione ────────────────────────── print(" [9/9] Stage 8-9: Normalize + validate...") clean_md, norm_stats = normalize_hierarchy(raw_md) validation = validate_markdown(clean_md, meta.get("pages", 0)) if norm_stats["n_level_jumps_repaired"]: print(f" Salti livello riparati: {norm_stats['n_level_jumps_repaired']}") if norm_stats["n_empty_headers_removed"]: print(f" Header vuoti rimossi: {norm_stats['n_empty_headers_removed']}") if norm_stats["n_duplicate_headers_removed"]: print(f" Header duplicati rimossi: {norm_stats['n_duplicate_headers_removed']}") for w in validation.warnings: print(f" ⚠️ {w}") for e in validation.errors: print(f" ✗ {e}") # Aggiungi frontmatter a clean.md frontmatter = _build_frontmatter(meta) full_clean = frontmatter + clean_md try: clean_out.write_text(full_clean, encoding="utf-8") except PermissionError as e: print(f" ✗ Permesso negato durante la scrittura di clean.md: {e}") return False print(f" ✅ clean.md scritto") # ── Analisi struttura + report + score ──────────────────────────────────── profile_struct = analyze(clean_out) (out_dir / "structure_profile.json").write_text( json.dumps(profile_struct, ensure_ascii=False, indent=2), encoding="utf-8" ) print(f" Struttura: livello {profile_struct['livello_struttura']} — " f"{_LIVELLO_DESC[profile_struct['livello_struttura']]}") print(f" h1={profile_struct['n_h1']} h2={profile_struct['n_h2']} " f"h3={profile_struct['n_h3']} paragrafi={profile_struct['n_paragrafi']}") print(f" Strategia chunking: {profile_struct['strategia_chunking']}") print(f" Lingua rilevata: {profile_struct['lingua_rilevata']}") for w in profile_struct["avvertenze"]: print(f" ⚠️ {w}") t_stats = { **norm_stats, "validation": validation.to_dict(), } reduction = 100.0 * (1 - len(clean_md) / len(raw_md)) if raw_md else 0.0 report_path = build_report(stem, out_dir, full_clean, t_stats, profile_struct, reduction) report_data = json.loads(report_path.read_text(encoding="utf-8")) score, _ = _score(report_data) print(f"\n Output → conversione/{stem}/") print(f" raw.md (immutabile) clean.md report.json") print(f" Punteggio qualità: {score}/100 {_grade(score)}") return True