e1b5298b20
Porta da branch marker la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare il resto del progetto RAG (ollama/, step-5/, step-6/, step-8/, rag.py, retrieve.py, config.py). requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
221 lines
9.3 KiB
Python
221 lines
9.3 KiB
Python
"""Orchestrazione della pipeline PDF → Markdown a 9 stadi."""
|
|
import json
|
|
import sys
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from .extract import validate_pdf, extract_metadata
|
|
from .stage1_metadata import extract_raw_data_with_pdfplumber_fallback as extract_raw_data
|
|
from .stage2_layout import analyze_layout
|
|
from .stage3_font import build_font_profile
|
|
from .stage4_headers import classify_blocks
|
|
from .stage5_hierarchy import infer_hierarchy
|
|
from .stage6_tree import build_tree
|
|
from .stage7_markdown import serialize_tree
|
|
from .stage8_normalize import normalize_hierarchy
|
|
from .stage9_validate import validate_markdown
|
|
from .structure import analyze
|
|
from .report import build_report
|
|
from .validator import _score, _grade
|
|
|
|
|
|
# Human-readable label for each structure level; run() looks it up with the
# "livello_struttura" value of the structure profile when printing the summary.
_LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}

# Animation frames used by _Spinner; one character is drawn per refresh.
_SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
|
|
|
|
|
|
def _build_frontmatter(meta: dict) -> str:
    """Build the YAML front matter block that is prepended to ``clean.md``.

    Args:
        meta: Document metadata. ``source`` is required; ``title``,
            ``author``, ``year`` and ``pages`` are emitted only when present
            and truthy.

    Returns:
        The front matter, delimited by ``---`` lines and followed by one
        empty line.

    Raises:
        KeyError: If ``meta`` has no ``source`` key.
    """
    lines = ["---", f"source: {meta['source']}"]

    # Free-text fields may contain double quotes, backslashes or line breaks
    # that would break a naively quoted YAML scalar. json.dumps emits a
    # double-quoted string with those characters escaped; for plain text the
    # result is identical to simply wrapping the value in quotes.
    for key in ("title", "author"):
        if meta.get(key):
            quoted = json.dumps(str(meta[key]), ensure_ascii=False)
            lines.append(f"{key}: {quoted}")

    # Numeric fields are written bare.
    for key in ("year", "pages"):
        if meta.get(key):
            lines.append(f"{key}: {meta[key]}")

    lines += ["---", ""]
    return "\n".join(lines) + "\n"
|
|
|
|
|
|
class _Spinner:
    """Context manager that animates a progress spinner on stdout.

    While the ``with`` body runs, a daemon thread redraws one line roughly
    every 0.1 s showing the current frame, the prefix text and the elapsed
    seconds. On exit the thread is stopped and joined, and the line is
    blanked out.
    """

    def __init__(self, prefix: str):
        self._prefix = prefix
        self._t0 = 0.0
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def __enter__(self):
        # Take the timestamp before starting the thread so the first frame
        # already has a valid reference point.
        self._t0 = time.perf_counter()
        self._thread.start()
        return self

    def __exit__(self, *_):
        self._stop.set()
        self._thread.join()
        # Overwrite whatever the spinner drew and return to column 0.
        blank_line = "\r" + " " * 72 + "\r"
        sys.stdout.write(blank_line)
        sys.stdout.flush()

    def _run(self):
        tick = 0
        while True:
            # wait() doubles as the frame delay and returns True once the
            # stop event has been set.
            if self._stop.wait(0.1):
                return
            seconds = time.perf_counter() - self._t0
            glyph = _SPIN_FRAMES[tick % len(_SPIN_FRAMES)]
            sys.stdout.write(f"\r {glyph} {self._prefix} {seconds:.0f}s")
            sys.stdout.flush()
            tick += 1
|
|
|
|
|
|
def run(stem: str, project_root: Path, force: bool) -> bool:
    """Convert ``sources/<stem>.pdf`` to Markdown and write the outputs.

    The run validates the PDF, extracts spans, analyses layout and fonts,
    classifies headers, infers the hierarchy, builds the document tree,
    serializes it to Markdown, then normalizes and validates the result.
    Progress is printed to stdout. Outputs are written under
    ``<project_root>/conversione/<stem>/``: ``raw.md``, ``clean.md`` (with
    front matter), ``structure_profile.json`` and the report produced by
    ``build_report``.

    Args:
        stem: File name of the PDF without extension.
        project_root: Directory containing ``sources/`` and ``conversione/``.
        force: When true, re-run even if ``clean.md`` already exists and
            overwrite an existing ``raw.md``.

    Returns:
        ``True`` when the conversion completed or was skipped because
        ``clean.md`` already exists; ``False`` when PDF validation fails,
        span extraction raises, or writing ``raw.md`` / ``clean.md`` is
        denied. Exceptions raised by the other stages are not caught here.
    """
    # Kept function-local: Counter is not imported at module level.
    from collections import Counter

    pdf_path = project_root / "sources" / f"{stem}.pdf"
    out_dir = project_root / "conversione" / stem
    raw_out = out_dir / "raw.md"
    clean_out = out_dir / "clean.md"

    print(f"\n{'─' * 52}")
    print(f" {stem}")
    print(f"{'─' * 52}")

    if clean_out.exists() and not force:
        print(f" ⚠️ conversione/{stem}/clean.md già presente — skip")
        print(" (usa --force per rieseguire)")
        return True

    # ── [1] PDF validation ────────────────────────────────────────────────────
    print(" [1/9] Validazione PDF...")
    # Size is informational only; a missing file is reported by validate_pdf.
    pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0
    print(f" File: {pdf_path.name} ({pdf_mb:.1f} MB)")

    ok, msg = validate_pdf(pdf_path)
    if not ok:
        print(f" ✗ {msg}")
        return False
    print(f" ✅ {msg}")

    meta = extract_metadata(pdf_path)
    meta["source"] = pdf_path.name
    if meta.get("title"):
        print(f" Titolo: {meta['title']}")
    if meta.get("author"):
        print(f" Autore: {meta['author']}")

    # ── [2] Stage 1: span extraction ──────────────────────────────────────────
    print(" [2/9] Stage 1: Estrazione span PyMuPDF...")
    # The try wraps the whole `with` so the spinner thread is stopped and its
    # line cleared before the failure message is printed; printing from inside
    # the `with` would interleave with the animation.
    try:
        with _Spinner("Lettura PDF con PyMuPDF..."):
            raw_blocks, doc_meta = extract_raw_data(pdf_path)
    except Exception as e:
        print(f" ✗ Estrazione fallita: {e}")
        return False

    print(f" ✅ {len(raw_blocks)} span estratti da {doc_meta['page_count']} pagine")
    toc_entries = len(doc_meta.get("toc", []))
    if toc_entries:
        print(f" TOC: {toc_entries} voci")

    # ── [3] Stage 2: layout ───────────────────────────────────────────────────
    print(" [3/9] Stage 2: Analisi layout e reading order...")
    with _Spinner("Analisi layout..."):
        blocks = analyze_layout(raw_blocks, doc_meta)
    print(f" ✅ {len(blocks)} blocchi dopo layout analysis")

    # ── [4] Stage 3: font analysis ────────────────────────────────────────────
    print(" [4/9] Stage 3: Font analysis...")
    profile = build_font_profile(blocks)
    print(f" ✅ Body size: {profile.body_size}pt "
          f"Header sizes: {profile.header_sizes}")

    # ── [5] Stage 4: header detection ─────────────────────────────────────────
    print(" [5/9] Stage 4: Header detection...")
    blocks = classify_blocks(blocks, profile)
    n_candidates = sum(1 for b in blocks if b.block_type == "header_candidate")
    print(f" ✅ {n_candidates} header candidate rilevati")

    # ── [6] Stage 5: hierarchy inference ──────────────────────────────────────
    print(" [6/9] Stage 5: Hierarchy inference...")
    blocks = infer_hierarchy(blocks, profile, doc_meta.get("toc", []))
    level_dist = Counter(b.level for b in blocks if b.block_type == "header_candidate")
    print(f" ✅ H1={level_dist.get(1,0)} H2={level_dist.get(2,0)} H3={level_dist.get(3,0)}")

    # ── [7] Stage 6: document tree ────────────────────────────────────────────
    print(" [7/9] Stage 6: Document tree reconstruction...")
    tree = build_tree(blocks)
    print(f" ✅ {len(tree)} sezioni radice")

    # ── [8] Stage 7: markdown generation ──────────────────────────────────────
    print(" [8/9] Stage 7: Markdown generation...")
    with _Spinner("Serializzazione albero..."):
        raw_md = serialize_tree(tree, meta, pdf_path=pdf_path)

    size_kb = len(raw_md.encode()) // 1024
    n_lines = raw_md.count("\n")
    print(f" ✅ raw.md: {size_kb} KB, {n_lines} righe")

    # Write raw.md (immutable: an existing file is only replaced with --force).
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        if not raw_out.exists() or force:
            raw_out.write_text(raw_md, encoding="utf-8")
    except PermissionError as e:
        print(f" ✗ Permesso negato durante la scrittura: {e}")
        return False

    # ── [9] Stage 8+9: normalization + validation ─────────────────────────────
    print(" [9/9] Stage 8-9: Normalize + validate...")
    clean_md, norm_stats = normalize_hierarchy(raw_md)
    validation = validate_markdown(clean_md, meta.get("pages", 0))

    if norm_stats["n_level_jumps_repaired"]:
        print(f" Salti livello riparati: {norm_stats['n_level_jumps_repaired']}")
    if norm_stats["n_empty_headers_removed"]:
        print(f" Header vuoti rimossi: {norm_stats['n_empty_headers_removed']}")
    if norm_stats["n_duplicate_headers_removed"]:
        print(f" Header duplicati rimossi: {norm_stats['n_duplicate_headers_removed']}")

    # Validation findings are reported but do not abort the run.
    for w in validation.warnings:
        print(f" ⚠️ {w}")
    for e in validation.errors:
        print(f" ✗ {e}")

    # Prepend the front matter to clean.md.
    frontmatter = _build_frontmatter(meta)
    full_clean = frontmatter + clean_md

    try:
        clean_out.write_text(full_clean, encoding="utf-8")
    except PermissionError as e:
        print(f" ✗ Permesso negato durante la scrittura di clean.md: {e}")
        return False

    print(" ✅ clean.md scritto")

    # ── Structure analysis + report + score ───────────────────────────────────
    profile_struct = analyze(clean_out)
    (out_dir / "structure_profile.json").write_text(
        json.dumps(profile_struct, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    print(f" Struttura: livello {profile_struct['livello_struttura']} — "
          f"{_LIVELLO_DESC[profile_struct['livello_struttura']]}")
    print(f" h1={profile_struct['n_h1']} h2={profile_struct['n_h2']} "
          f"h3={profile_struct['n_h3']} paragrafi={profile_struct['n_paragrafi']}")
    print(f" Strategia chunking: {profile_struct['strategia_chunking']}")
    print(f" Lingua rilevata: {profile_struct['lingua_rilevata']}")
    for w in profile_struct["avvertenze"]:
        print(f" ⚠️ {w}")

    t_stats = {
        **norm_stats,
        "validation": validation.to_dict(),
    }
    # Percentage by which normalization shrank the text (front matter excluded).
    reduction = 100.0 * (1 - len(clean_md) / len(raw_md)) if raw_md else 0.0
    report_path = build_report(stem, out_dir, full_clean, t_stats, profile_struct, reduction)
    report_data = json.loads(report_path.read_text(encoding="utf-8"))
    score, _ = _score(report_data)

    print(f"\n Output → conversione/{stem}/")
    print(" raw.md (immutabile) clean.md report.json")
    print(f" Punteggio qualità: {score}/100 {_grade(score)}")
    return True
|