# conversione/_pipeline/runner.py

"""Orchestrazione della pipeline PDF → Markdown a 9 stadi."""
import json
import sys
import threading
import time
from pathlib import Path

from .extract     import validate_pdf, extract_metadata
from .stage1_metadata import extract_raw_data_with_pdfplumber_fallback as extract_raw_data
from .stage2_layout   import analyze_layout
from .stage3_font     import build_font_profile
from .stage4_headers  import classify_blocks
from .stage5_hierarchy import infer_hierarchy
from .stage6_tree     import build_tree
from .stage7_markdown import serialize_tree
from .stage8_normalize import normalize_hierarchy
from .stage9_validate  import validate_markdown
from .structure   import analyze
from .report      import build_report
from .validator   import _score, _grade


# Human-readable label for each structure level; looked up with the
# "livello_struttura" value of the structure profile when printing the summary.
_LIVELLO_DESC = {
    3: "ricca (h3)",
    2: "parziale (h2)",
    1: "paragrafi",
    0: "testo piatto",
}

# Animation frames of the console spinner, cycled one character per tick.
_SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"


def _build_frontmatter(meta: dict) -> str:
    """Build the YAML front matter block that is prepended to clean.md.

    Args:
        meta: Document metadata. ``source`` is required; ``title``,
            ``author``, ``year`` and ``pages`` are emitted, in that order,
            only when present and truthy.

    Returns:
        The front matter delimited by ``---`` lines, followed by one blank
        line.

    Raises:
        KeyError: If ``meta`` has no ``source`` entry.
    """
    lines = ["---", f"source: {meta['source']}"]

    for key in ("title", "author"):
        value = meta.get(key)
        if value:
            # json.dumps emits a double-quoted scalar with embedded quotes,
            # backslashes and control characters escaped, so such a value
            # cannot terminate the quoted string early. Ordinary strings come
            # out exactly as plain "..." wrapping would produce.
            quoted = json.dumps(str(value), ensure_ascii=False)
            lines.append(f"{key}: {quoted}")

    for key in ("year", "pages"):
        value = meta.get(key)
        if value:
            lines.append(f"{key}: {value}")

    lines += ["---", ""]
    return "\n".join(lines) + "\n"


class _Spinner:
    """Context manager that animates a one-line spinner on stdout.

    While the ``with`` body runs, a daemon thread redraws the current line
    every 0.1 s with an animation frame, the given prefix and the elapsed
    seconds. On exit the thread is stopped and the line is blanked.
    """

    def __init__(self, prefix: str):
        self._t0 = 0.0
        self._prefix = prefix
        self._stop = threading.Event()
        # The thread object is created once here, so an instance can only be
        # entered a single time.
        self._thread = threading.Thread(target=self._run, daemon=True)

    def __enter__(self):
        """Record the start time, launch the animation thread, return self."""
        self._t0 = time.perf_counter()
        self._thread.start()
        return self

    def __exit__(self, *_exc_info):
        """Stop the animation thread and wipe the spinner line."""
        self._stop.set()
        self._thread.join()
        blank = " " * 72
        # Carriage return, overwrite with spaces, carriage return again so
        # the next output starts at column 0.
        sys.stdout.write(f"\r{blank}\r")
        sys.stdout.flush()

    def _run(self):
        """Redraw the spinner until the stop event is set."""
        tick = 0
        while True:
            # wait() doubles as the frame delay and the stop check.
            if self._stop.wait(0.1):
                return
            seconds = time.perf_counter() - self._t0
            glyph = _SPIN_FRAMES[tick % len(_SPIN_FRAMES)]
            line = f"\r  {glyph} {self._prefix}  {seconds:.0f}s"
            sys.stdout.write(line)
            sys.stdout.flush()
            tick += 1


def run(stem: str, project_root: Path, force: bool) -> bool:
    """Convert ``sources/<stem>.pdf`` to Markdown through the 9-step pipeline.

    Progress is printed to stdout. Output goes to ``conversione/<stem>/``:
    ``raw.md``, ``clean.md`` (with front matter), ``structure_profile.json``
    and the report produced by ``build_report``.

    Args:
        stem: PDF file name without extension; also the output folder name.
        project_root: Directory containing ``sources/`` and ``conversione/``.
        force: Re-run even if ``clean.md`` already exists, and overwrite an
            existing ``raw.md``.

    Returns:
        True when the conversion completes or is skipped because ``clean.md``
        already exists; False when PDF validation fails, span extraction
        raises, or writing ``raw.md``/``clean.md`` raises ``PermissionError``.

    Raises:
        Exceptions from the other stages, and from writing
        ``structure_profile.json`` or the report, are not caught here.
    """
    from collections import Counter

    pdf_path  = project_root / "sources" / f"{stem}.pdf"
    out_dir   = project_root / "conversione" / stem
    raw_out   = out_dir / "raw.md"
    clean_out = out_dir / "clean.md"

    print(f"\n{'─' * 52}")
    print(f"  {stem}")
    print(f"{'─' * 52}")

    if clean_out.exists() and not force:
        print(f"  ⚠️  conversione/{stem}/clean.md già presente — skip")
        print("      (usa --force per rieseguire)")
        return True

    # -- [1] PDF validation ---------------------------------------------------
    print("  [1/9] Validazione PDF...")
    # The size is informational only; a missing file is reported as 0 MB and
    # left for validate_pdf to reject.
    pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0
    print(f"     File: {pdf_path.name}  ({pdf_mb:.1f} MB)")

    ok, msg = validate_pdf(pdf_path)
    if not ok:
        print(f"  ✗ {msg}")
        return False
    print(f"  ✅ {msg}")

    meta = extract_metadata(pdf_path)
    meta["source"] = pdf_path.name
    if meta.get("title"):
        print(f"     Titolo:  {meta['title']}")
    if meta.get("author"):
        print(f"     Autore:  {meta['author']}")

    # -- [2] Stage 1: span extraction -----------------------------------------
    print("  [2/9] Stage 1: Estrazione span PyMuPDF...")
    extract_error = None
    with _Spinner("Lettura PDF con PyMuPDF..."):
        try:
            raw_blocks, doc_meta = extract_raw_data(pdf_path)
        except Exception as e:
            # Only record the failure here: printing while the spinner thread
            # is still drawing would put the message on the spinner line.
            extract_error = e
    if extract_error is not None:
        print(f"  ✗ Estrazione fallita: {extract_error}")
        return False

    print(f"  ✅ {len(raw_blocks)} span estratti da {doc_meta['page_count']} pagine")
    toc_entries = len(doc_meta.get("toc", []))
    if toc_entries:
        print(f"     TOC: {toc_entries} voci")

    # -- [3] Stage 2: layout --------------------------------------------------
    print("  [3/9] Stage 2: Analisi layout e reading order...")
    with _Spinner("Analisi layout..."):
        blocks = analyze_layout(raw_blocks, doc_meta)
    print(f"  ✅ {len(blocks)} blocchi dopo layout analysis")

    # -- [4] Stage 3: font analysis -------------------------------------------
    print("  [4/9] Stage 3: Font analysis...")
    profile = build_font_profile(blocks)
    print(f"  ✅ Body size: {profile.body_size}pt  "
          f"Header sizes: {profile.header_sizes}")

    # -- [5] Stage 4: header detection ----------------------------------------
    print("  [5/9] Stage 4: Header detection...")
    blocks = classify_blocks(blocks, profile)
    n_candidates = sum(1 for b in blocks if b.block_type == "header_candidate")
    print(f"  ✅ {n_candidates} header candidate rilevati")

    # -- [6] Stage 5: hierarchy inference -------------------------------------
    print("  [6/9] Stage 5: Hierarchy inference...")
    blocks = infer_hierarchy(blocks, profile, doc_meta.get("toc", []))
    level_dist = Counter(b.level for b in blocks if b.block_type == "header_candidate")
    print(f"  ✅ H1={level_dist.get(1,0)}  H2={level_dist.get(2,0)}  H3={level_dist.get(3,0)}")

    # -- [7] Stage 6: document tree -------------------------------------------
    print("  [7/9] Stage 6: Document tree reconstruction...")
    tree = build_tree(blocks)
    print(f"  ✅ {len(tree)} sezioni radice")

    # -- [8] Stage 7: markdown generation -------------------------------------
    print("  [8/9] Stage 7: Markdown generation...")
    with _Spinner("Serializzazione albero..."):
        raw_md = serialize_tree(tree, meta, pdf_path=pdf_path)

    size_kb = len(raw_md.encode()) // 1024
    n_lines = raw_md.count("\n")
    print(f"  ✅ raw.md: {size_kb} KB, {n_lines} righe")

    # raw.md is treated as immutable: an existing file is only overwritten
    # when force is set.
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        if not raw_out.exists() or force:
            raw_out.write_text(raw_md, encoding="utf-8")
    except PermissionError as e:
        print(f"  ✗ Permesso negato durante la scrittura: {e}")
        return False

    # -- [9] Stage 8+9: normalization + validation ----------------------------
    print("  [9/9] Stage 8-9: Normalize + validate...")
    clean_md, norm_stats = normalize_hierarchy(raw_md)
    validation = validate_markdown(clean_md, meta.get("pages", 0))

    if norm_stats["n_level_jumps_repaired"]:
        print(f"     Salti livello riparati:    {norm_stats['n_level_jumps_repaired']}")
    if norm_stats["n_empty_headers_removed"]:
        print(f"     Header vuoti rimossi:      {norm_stats['n_empty_headers_removed']}")
    if norm_stats["n_duplicate_headers_removed"]:
        print(f"     Header duplicati rimossi:  {norm_stats['n_duplicate_headers_removed']}")

    # Validation findings are reported but do not abort the run.
    for w in validation.warnings:
        print(f"     ⚠️  {w}")
    for e in validation.errors:
        print(f"     ✗  {e}")

    # Prepend the front matter to clean.md
    frontmatter = _build_frontmatter(meta)
    full_clean  = frontmatter + clean_md

    try:
        clean_out.write_text(full_clean, encoding="utf-8")
    except PermissionError as e:
        print(f"  ✗ Permesso negato durante la scrittura di clean.md: {e}")
        return False

    print("  ✅ clean.md scritto")

    # -- Structure analysis + report + score ----------------------------------
    profile_struct = analyze(clean_out)
    (out_dir / "structure_profile.json").write_text(
        json.dumps(profile_struct, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    print(f"     Struttura: livello {profile_struct['livello_struttura']} — "
          f"{_LIVELLO_DESC[profile_struct['livello_struttura']]}")
    print(f"     h1={profile_struct['n_h1']}  h2={profile_struct['n_h2']}  "
          f"h3={profile_struct['n_h3']}  paragrafi={profile_struct['n_paragrafi']}")
    print(f"     Strategia chunking: {profile_struct['strategia_chunking']}")
    print(f"     Lingua rilevata:    {profile_struct['lingua_rilevata']}")
    for w in profile_struct["avvertenze"]:
        print(f"     ⚠️  {w}")

    t_stats = {
        **norm_stats,
        "validation": validation.to_dict(),
    }
    # Percentage by which normalization shrank the text (characters, front
    # matter excluded); 0 when raw_md is empty to avoid dividing by zero.
    reduction = 100.0 * (1 - len(clean_md) / len(raw_md)) if raw_md else 0.0
    report_path = build_report(stem, out_dir, full_clean, t_stats, profile_struct, reduction)
    # The score is computed from the report as written to disk.
    report_data = json.loads(report_path.read_text(encoding="utf-8"))
    score, _    = _score(report_data)

    print(f"\n  Output → conversione/{stem}/")
    print("    raw.md   (immutabile)  clean.md   report.json")
    print(f"  Punteggio qualità: {score}/100 {_grade(score)}")
    return True
feat: integra pipeline PDF→Markdown a 9 stadi e test suite 2026-05-11 14:46:16 +02:00			`"""Orchestrazione della pipeline PDF → Markdown a 9 stadi."""`
			`import json`
			`import sys`
			`import threading`
			`import time`
			`from pathlib import Path`

			`from .extract import validate_pdf, extract_metadata`
			`from .stage1_metadata import extract_raw_data_with_pdfplumber_fallback as extract_raw_data`
			`from .stage2_layout import analyze_layout`
			`from .stage3_font import build_font_profile`
			`from .stage4_headers import classify_blocks`
			`from .stage5_hierarchy import infer_hierarchy`
			`from .stage6_tree import build_tree`
			`from .stage7_markdown import serialize_tree`
			`from .stage8_normalize import normalize_hierarchy`
			`from .stage9_validate import validate_markdown`
			`from .structure import analyze`
			`from .report import build_report`
			`from .validator import _score, _grade`


			`_LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}`
			`_SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"`


			`def _build_frontmatter(meta: dict) -> str:`
			`lines = ["---", f"source: {meta['source']}"]`
			`if meta.get("title"):`
			`lines.append(f'title: "{meta["title"]}"')`
			`if meta.get("author"):`
			`lines.append(f'author: "{meta["author"]}"')`
			`if meta.get("year"):`
			`lines.append(f"year: {meta['year']}")`
			`if meta.get("pages"):`
			`lines.append(f"pages: {meta['pages']}")`
			`lines += ["---", ""]`
			`return "\n".join(lines) + "\n"`


			`class _Spinner:`
			`def __init__(self, prefix: str):`
			`self._prefix = prefix`
			`self._stop = threading.Event()`
			`self._thread = threading.Thread(target=self._run, daemon=True)`
			`self._t0 = 0.0`

			`def __enter__(self):`
			`self._t0 = time.perf_counter()`
			`self._thread.start()`
			`return self`

			`def __exit__(self, *_):`
			`self._stop.set()`
			`self._thread.join()`
			`sys.stdout.write("\r" + " " * 72 + "\r")`
			`sys.stdout.flush()`

			`def _run(self):`
			`i = 0`
			`while not self._stop.wait(0.1):`
			`elapsed = time.perf_counter() - self._t0`
			`frame = _SPIN_FRAMES[i % len(_SPIN_FRAMES)]`
			`sys.stdout.write(f"\r {frame} {self._prefix} {elapsed:.0f}s")`
			`sys.stdout.flush()`
			`i += 1`


			`def run(stem: str, project_root: Path, force: bool) -> bool:`
			`pdf_path = project_root / "sources" / f"{stem}.pdf"`
			`out_dir = project_root / "conversione" / stem`
			`raw_out = out_dir / "raw.md"`
			`clean_out = out_dir / "clean.md"`

			`print(f"\n{'─' * 52}")`
			`print(f" {stem}")`
			`print(f"{'─' * 52}")`

			`if clean_out.exists() and not force:`
			`print(f" ⚠️ conversione/{stem}/clean.md già presente — skip")`
			`print(f" (usa --force per rieseguire)")`
			`return True`

			`# ── [1] Validazione PDF ───────────────────────────────────────────────────`
			`print(" [1/9] Validazione PDF...")`
			`pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0`
			`print(f" File: {pdf_path.name} ({pdf_mb:.1f} MB)")`

			`ok, msg = validate_pdf(pdf_path)`
			`if not ok:`
			`print(f" ✗ {msg}")`
			`return False`
			`print(f" ✅ {msg}")`

			`meta = extract_metadata(pdf_path)`
			`meta["source"] = pdf_path.name`
			`if meta.get("title"):`
			`print(f" Titolo: {meta['title']}")`
			`if meta.get("author"):`
			`print(f" Autore: {meta['author']}")`

			`# ── [2] Stage 1: estrazione span ──────────────────────────────────────────`
			`print(" [2/9] Stage 1: Estrazione span PyMuPDF...")`
			`with _Spinner("Lettura PDF con PyMuPDF..."):`
			`try:`
			`raw_blocks, doc_meta = extract_raw_data(pdf_path)`
			`except Exception as e:`
			`print(f" ✗ Estrazione fallita: {e}")`
			`return False`

			`print(f" ✅ {len(raw_blocks)} span estratti da {doc_meta['page_count']} pagine")`
			`toc_entries = len(doc_meta.get("toc", []))`
			`if toc_entries:`
			`print(f" TOC: {toc_entries} voci")`

			`# ── [3] Stage 2: layout ───────────────────────────────────────────────────`
			`print(" [3/9] Stage 2: Analisi layout e reading order...")`
			`with _Spinner("Analisi layout..."):`
			`blocks = analyze_layout(raw_blocks, doc_meta)`
			`print(f" ✅ {len(blocks)} blocchi dopo layout analysis")`

			`# ── [4] Stage 3: font analysis ────────────────────────────────────────────`
			`print(" [4/9] Stage 3: Font analysis...")`
			`profile = build_font_profile(blocks)`
			`print(f" ✅ Body size: {profile.body_size}pt "`
			`f"Header sizes: {profile.header_sizes}")`

			`# ── [5] Stage 4: header detection ─────────────────────────────────────────`
			`print(" [5/9] Stage 4: Header detection...")`
			`blocks = classify_blocks(blocks, profile)`
			`n_candidates = sum(1 for b in blocks if b.block_type == "header_candidate")`
			`print(f" ✅ {n_candidates} header candidate rilevati")`

			`# ── [6] Stage 5: hierarchy inference ─────────────────────────────────────`
			`print(" [6/9] Stage 5: Hierarchy inference...")`
			`blocks = infer_hierarchy(blocks, profile, doc_meta.get("toc", []))`
			`from collections import Counter`
			`level_dist = Counter(b.level for b in blocks if b.block_type == "header_candidate")`
			`print(f" ✅ H1={level_dist.get(1,0)} H2={level_dist.get(2,0)} H3={level_dist.get(3,0)}")`

			`# ── [7] Stage 6: document tree ────────────────────────────────────────────`
			`print(" [7/9] Stage 6: Document tree reconstruction...")`
			`tree = build_tree(blocks)`
			`print(f" ✅ {len(tree)} sezioni radice")`

			`# ── [8] Stage 7: markdown generation ─────────────────────────────────────`
			`print(" [8/9] Stage 7: Markdown generation...")`
			`with _Spinner("Serializzazione albero..."):`
			`raw_md = serialize_tree(tree, meta, pdf_path=pdf_path)`

			`size_kb = len(raw_md.encode()) // 1024`
			`n_lines = raw_md.count("\n")`
			`print(f" ✅ raw.md: {size_kb} KB, {n_lines} righe")`

			`# Scrittura raw.md (IMMUTABILE)`
			`try:`
			`out_dir.mkdir(parents=True, exist_ok=True)`
			`if not raw_out.exists() or force:`
			`raw_out.write_text(raw_md, encoding="utf-8")`
			`except PermissionError as e:`
			`print(f" ✗ Permesso negato durante la scrittura: {e}")`
			`return False`

			`# ── [9] Stage 8+9: normalizzazione + validazione ──────────────────────────`
			`print(" [9/9] Stage 8-9: Normalize + validate...")`
			`clean_md, norm_stats = normalize_hierarchy(raw_md)`
			`validation = validate_markdown(clean_md, meta.get("pages", 0))`

			`if norm_stats["n_level_jumps_repaired"]:`
			`print(f" Salti livello riparati: {norm_stats['n_level_jumps_repaired']}")`
			`if norm_stats["n_empty_headers_removed"]:`
			`print(f" Header vuoti rimossi: {norm_stats['n_empty_headers_removed']}")`
			`if norm_stats["n_duplicate_headers_removed"]:`
			`print(f" Header duplicati rimossi: {norm_stats['n_duplicate_headers_removed']}")`

			`for w in validation.warnings:`
			`print(f" ⚠️ {w}")`
			`for e in validation.errors:`
			`print(f" ✗ {e}")`

			`# Aggiungi frontmatter a clean.md`
			`frontmatter = _build_frontmatter(meta)`
			`full_clean = frontmatter + clean_md`

			`try:`
			`clean_out.write_text(full_clean, encoding="utf-8")`
			`except PermissionError as e:`
			`print(f" ✗ Permesso negato durante la scrittura di clean.md: {e}")`
			`return False`

			`print(f" ✅ clean.md scritto")`

			`# ── Analisi struttura + report + score ────────────────────────────────────`
			`profile_struct = analyze(clean_out)`
			`(out_dir / "structure_profile.json").write_text(`
			`json.dumps(profile_struct, ensure_ascii=False, indent=2), encoding="utf-8"`
			`)`

			`print(f" Struttura: livello {profile_struct['livello_struttura']} — "`
			`f"{_LIVELLO_DESC[profile_struct['livello_struttura']]}")`
			`print(f" h1={profile_struct['n_h1']} h2={profile_struct['n_h2']} "`
			`f"h3={profile_struct['n_h3']} paragrafi={profile_struct['n_paragrafi']}")`
			`print(f" Strategia chunking: {profile_struct['strategia_chunking']}")`
			`print(f" Lingua rilevata: {profile_struct['lingua_rilevata']}")`
			`for w in profile_struct["avvertenze"]:`
			`print(f" ⚠️ {w}")`

			`t_stats = {`
			`**norm_stats,`
			`"validation": validation.to_dict(),`
			`}`
			`reduction = 100.0 * (1 - len(clean_md) / len(raw_md)) if raw_md else 0.0`
			`report_path = build_report(stem, out_dir, full_clean, t_stats, profile_struct, reduction)`
			`report_data = json.loads(report_path.read_text(encoding="utf-8"))`
			`score, _ = _score(report_data)`

			`print(f"\n Output → conversione/{stem}/")`
			`print(f" raw.md (immutabile) clean.md report.json")`
			`print(f" Punteggio qualità: {score}/100 {_grade(score)}")`
			`return True`