2026-04-16 15:53:46 +02:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
2026-04-17 07:47:56 +02:00
|
|
|
|
conversione/validate.py — Validazione qualità Markdown
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
Legge i report.json prodotti da pipeline.py, stampa una tabella di stato
|
2026-04-17 07:47:56 +02:00
|
|
|
|
e assegna un voto (0-100) a ogni documento.
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
90-100 A — ottimo, pronto per il chunker
|
|
|
|
|
|
75-89 B — buono, qualche sezione lunga ma accettabile
|
|
|
|
|
|
60-74 C — accettabile, anomalie minori da verificare
|
|
|
|
|
|
40-59 D — da rivedere, problemi strutturali o residui evidenti
|
|
|
|
|
|
0-39 F — da riprocessare, struttura assente o testo corrotto
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
|
|
|
|
|
Uso:
|
|
|
|
|
|
python conversione/validate.py # tutti gli stem
|
|
|
|
|
|
python conversione/validate.py analisi1 # stem specifico
|
2026-04-17 07:47:56 +02:00
|
|
|
|
python conversione/validate.py a b c # stem multipli
|
2026-04-16 15:53:46 +02:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
import argparse
|
2026-04-17 07:47:56 +02:00
|
|
|
|
import json
|
2026-04-16 15:53:46 +02:00
|
|
|
|
import sys
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
# ─── Punteggio ───────────────────────────────────────────────────────────────
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-17 07:47:56 +02:00
|
|
|
|
# Grade bands as (minimum score, letter), listed from the highest minimum to
# the lowest: _grade() returns the letter of the first band the score reaches.
_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
def _score(r: dict) -> int:
|
|
|
|
|
|
"""
|
|
|
|
|
|
Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
Penalità:
|
2026-04-17 07:47:56 +02:00
|
|
|
|
struttura assente / piatta → −40 / −15
|
|
|
|
|
|
backtick residui → −2/cad (max −30)
|
|
|
|
|
|
URL / watermark → −5/cad (max −15)
|
|
|
|
|
|
immagini residue → −5/cad (max −10)
|
|
|
|
|
|
dot-leader residui → −5/cad (max −10)
|
|
|
|
|
|
bare headers → −3/cad (max −15)
|
|
|
|
|
|
sezioni >1500ch >35/60% → −5 / −10
|
2026-04-16 16:05:03 +02:00
|
|
|
|
"""
|
2026-04-17 07:47:56 +02:00
|
|
|
|
score = 100
|
2026-04-16 15:53:46 +02:00
|
|
|
|
structure = r.get("structure", {})
|
2026-04-17 07:47:56 +02:00
|
|
|
|
anomalie = r.get("anomalie", {})
|
|
|
|
|
|
residui = r.get("residui", {})
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-17 07:47:56 +02:00
|
|
|
|
livello = structure.get("livello_struttura", 0)
|
|
|
|
|
|
n_h3 = max(structure.get("n_h3", 0), 1)
|
2026-04-16 16:05:03 +02:00
|
|
|
|
|
|
|
|
|
|
if livello == 0:
|
|
|
|
|
|
score -= 40
|
|
|
|
|
|
elif livello == 1:
|
|
|
|
|
|
score -= 15
|
|
|
|
|
|
|
|
|
|
|
|
score -= min(30, residui.get("backtick", 0) * 2)
|
|
|
|
|
|
score -= min(15, residui.get("url", 0) * 5)
|
|
|
|
|
|
score -= min(10, residui.get("immagini", 0) * 5)
|
|
|
|
|
|
score -= min(10, residui.get("dotleader", 0) * 5)
|
|
|
|
|
|
score -= min(15, anomalie.get("bare_headers", 0) * 3)
|
|
|
|
|
|
|
|
|
|
|
|
long_ratio = anomalie.get("long_sections", 0) / n_h3
|
2026-04-17 07:47:56 +02:00
|
|
|
|
if long_ratio > 0.60:
|
2026-04-16 16:05:03 +02:00
|
|
|
|
score -= 10
|
|
|
|
|
|
elif long_ratio > 0.35:
|
|
|
|
|
|
score -= 5
|
|
|
|
|
|
|
|
|
|
|
|
return max(0, score)
|
|
|
|
|
|
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-16 16:05:03 +02:00
|
|
|
|
def _grade(score: int) -> str:
    """Return the letter grade for a numeric score.

    Walks ``_GRADES`` in order and returns the letter of the first band
    whose minimum the score reaches. A score below every band's minimum
    (for example a negative number) gets the letter of the last band
    instead of letting ``StopIteration`` escape to the caller.
    """
    for minimum, letter in _GRADES:
        if score >= minimum:
            return letter
    # Nothing matched: fall back to the final band rather than raising.
    return _GRADES[-1][1]
|
2026-04-16 16:05:03 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ─── Validazione ─────────────────────────────────────────────────────────────
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
|
|
|
|
|
def _load_report(path: Path) -> tuple:
    """Load one report.json and return ``(stem, report, problem)``.

    ``report`` is the decoded JSON object, or ``None`` when the file is
    missing, unreadable, not valid JSON, or not a JSON object; in that case
    ``problem`` is the note to print in place of the metrics. ``stem`` is the
    report's own ``stem`` value when present, otherwise the name of the
    directory that contains the file.
    """
    fallback_stem = path.parent.name
    if not path.exists():
        return fallback_stem, None, "(report.json non trovato)"
    try:
        report = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return fallback_stem, None, "(report.json non valido)"
    if not isinstance(report, dict):
        return fallback_stem, None, "(report.json non valido)"
    return str(report.get("stem", fallback_stem)), report, ""


def validate(stems: list[str], project_root: Path) -> None:
    """Print a status table and a score for each conversion report.

    Args:
        stems: Names of the sub-directories of ``conversione/`` to check.
            When empty, every ``conversione/*/report.json`` is used.
        project_root: Directory that contains the ``conversione`` folder.

    Each report is printed as one row with its structure, anomaly and
    leftover counters plus the score from ``_score`` and the letter from
    ``_grade``. Reports that are missing or cannot be parsed get a one-line
    note and are left out of the average. When no report path is found at
    all, a message is printed and the process exits with status 0.
    """
    conv_dir = project_root / "conversione"

    paths = (
        [conv_dir / s / "report.json" for s in stems]
        if stems
        else sorted(conv_dir.glob("*/report.json"))
    )

    if not paths:
        print("Nessun report.json trovato in conversione/*/")
        sys.exit(0)

    rows = [_load_report(p) for p in paths]

    # ── Header ────────────────────────────────────────────────────────────
    # Never make the first column narrower than its own heading, otherwise
    # the header and the data rows drift out of alignment for short stems.
    col = max([len("stem"), *(len(stem) for stem, _, _ in rows)]) + 2
    header = (
        f"{'stem':<{col}}"
        f"{'h2':>4}{'h3':>5} "
        f"{'strategia':<20}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
        f" {'voto':>4} grade"
    )
    sep = "─" * len(header)
    print(f"\n{header}\n{sep}")

    scores = []

    # ── Rows ──────────────────────────────────────────────────────────────
    for stem, report, problem in rows:
        if report is None:
            print(f"{stem:<{col}} {problem}")
            continue

        st = report.get("structure", {})
        an = report.get("anomalie", {})
        res = report.get("residui", {})
        s = _score(report)
        scores.append(s)

        print(
            f"{stem:<{col}}"
            f"{st.get('n_h2', 0):>4}"
            f"{st.get('n_h3', 0):>5} "
            f"{st.get('strategia_chunking','?'):<20}"
            f"{an.get('bare_headers', 0):>5}"
            f"{an.get('short_sections', 0):>6}"
            f"{an.get('long_sections', 0):>7}"
            f"{res.get('backtick', 0):>9}"
            f"{res.get('dotleader', 0):>8}"
            f"{res.get('url', 0):>4}"
            f" {s:>4} {_grade(s)}"
        )

    # ── Summary ───────────────────────────────────────────────────────────
    print(sep)
    if scores:
        # Round once and reuse the value, so the printed mean and its grade
        # always agree (truncating for the grade showed e.g. "90/100 B").
        media = round(sum(scores) / len(scores))
        print(
            f"Documenti: {len(scores)} "
            f"Media: {media}/100 {_grade(media)} "
            "(A≥90 B≥75 C≥60 D≥40 F<40)"
        )
        print(
            "\nPenalità: struttura assente −40, backtick −2/cad, "
            "bare headers −3/cad, sezioni >1500ch >35% −5\n"
        )
|
|
|
|
|
|
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
2026-04-17 07:47:56 +02:00
|
|
|
|
# ─── Entry point ─────────────────────────────────────────────────────────────
|
2026-04-16 15:53:46 +02:00
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: zero or more stems may be given; an empty
    # list is passed straight to validate().
    cli = argparse.ArgumentParser(
        description="Valida i report Markdown prodotti da pipeline.py",
        epilog="Senza argomenti valida tutti gli stem in conversione/*/",
    )
    cli.add_argument(
        "stems",
        nargs="*",
        metavar="STEM",
        help="stem da validare (es: analisi1). Ometti per tutti.",
    )
    requested_stems = cli.parse_args().stems

    # The root handed to validate() is the directory two levels above this file.
    root_dir = Path(__file__).parent.parent
    validate(requested_stems, root_dir)
|