Files
davide e1b5298b20 feat: integra pipeline PDF→Markdown a 9 stadi e test suite
Porta da branch marker la riscrittura completa di conversione/_pipeline/
(9 stadi PyMuPDF) e la suite tests/ senza modificare il resto del progetto
RAG (ollama/, step-5/, step-6/, step-8/, rag.py, retrieve.py, config.py).

requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb,
rimuove opendataloader-pdf e pymupdf4llm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 14:44:16 +02:00

153 lines
4.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import sys
from pathlib import Path
_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]
def _score(r: dict) -> tuple[int, list[str]]:
"""
Voto 0-100 sulla qualità del clean.md per vettorizzazione.
Penalità struttura:
livello 0 (assente) → 40
livello 1 (piatto) → 15
Penalità residui (degradano il retrieval):
backtick → 2/cad (max 20)
dot-leader → 5/cad (max 10)
URL/watermark → 5/cad (max 15)
immagini → 5/cad (max 10)
<br> inline → 2/cad (max 15)
simboli encoding → 1/cad (max 10)
formule inline [N.M] → 1/cad (max 8)
footnote residui → 1/cad (max 8)
caratteri PUA → 2/cad (max 20)
Penalità anomalie:
bare headers → 3/cad (max 15)
"""
score = 100
detail = []
structure = r.get("structure", {})
anomalie = r.get("anomalie", {})
residui = r.get("residui", {})
livello = structure.get("livello_struttura", 0)
if livello == 0:
score -= 40
detail.append("struttura assente 40")
elif livello == 1:
score -= 15
detail.append("struttura piatta 15")
def _pen(key: str, per_item: int, cap: int, label: str) -> None:
n = residui.get(key, 0)
if n:
p = min(cap, n * per_item)
nonlocal score
score -= p
detail.append(f"{label} ×{n} {p}")
_pen("backtick", 2, 20, "backtick")
_pen("dotleader", 5, 10, "dot-leader")
_pen("url", 5, 15, "url")
_pen("immagini", 5, 10, "immagini")
_pen("br_inline", 2, 15, "<br> inline")
_pen("simboli_encoding", 1, 10, "simboli encoding")
_pen("formule_inline", 1, 8, "formule inline")
_pen("footnote_markers", 1, 8, "footnote residui")
_pen("pua_markers", 2, 20, "caratteri PUA font Symbol")
_pen("formula_headers", 3, 15, "formula/esercizio come header")
n_bare = anomalie.get("bare_headers", 0)
if n_bare:
p = min(15, n_bare * 3)
score -= p
detail.append(f"bare headers ×{n_bare} {p}")
return max(0, score), detail
def _grade(score: int) -> str:
return next(g for threshold, g in _GRADES if score >= threshold)
def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
conv_dir = project_root / "conversione"
paths = (
[conv_dir / s / "report.json" for s in stems]
if stems
else sorted(conv_dir.glob("*/report.json"))
)
if not paths:
print("Nessun report.json trovato in conversione/*/")
sys.exit(0)
rows = [
json.loads(p.read_text(encoding="utf-8")) if p.exists()
else {"stem": p.parent.name, "_missing": True}
for p in paths
]
col = max(len(r.get("stem", "stem")) for r in rows) + 2
header = (
f"{'stem':<{col}}"
f"{'h2':>4}{'h3':>5} "
f"{'strategia':<18}"
f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}"
f"{'med':>6}"
f" {'voto':>4} grade"
)
sep = "" * len(header)
print(f"\n{header}\n{sep}")
scores = []
for r in rows:
if r.get("_missing"):
print(f"{r['stem']:<{col}} (report.json non trovato)")
continue
st = r.get("structure", {})
an = r.get("anomalie", {})
res = r.get("residui", {})
dist = r.get("distribution", {})
s, pen = _score(r)
scores.append(s)
print(
f"{r['stem']:<{col}}"
f"{st.get('n_h2', 0):>4}"
f"{st.get('n_h3', 0):>5} "
f"{st.get('strategia_chunking','?'):<18}"
f"{an.get('bare_headers', 0):>5}"
f"{an.get('short_sections', 0):>6}"
f"{an.get('long_sections', 0):>7}"
f"{res.get('backtick', 0):>5}"
f"{res.get('br_inline', 0):>4}"
f"{res.get('simboli_encoding', 0):>4}"
f"{res.get('url', 0):>4}"
f"{res.get('formula_headers', 0):>5}"
f"{dist.get('mediana', 0):>6}"
f" {s:>4} {_grade(s)}"
)
if detail and pen:
for p in pen:
print(f" {'':>{col}}{p}")
print(sep)
if scores:
media = sum(scores) / len(scores)
print(
f"Documenti: {len(scores)} "
f"Media: {media:.0f}/100 {_grade(int(media))} "
f"(A≥90 B≥75 C≥60 D≥40 F<40)"
)
print(
"\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch "
"btk=backtick br=<br>inline enc=simboli encoding fhdr=formula-header med=mediana chars\n"
)