From faa8acae84732297566419c7d93649c9b5941d00 Mon Sep 17 00:00:00 2001 From: Davide Grilli Date: Thu, 30 Apr 2026 14:58:15 +0200 Subject: [PATCH] =?UTF-8?q?feat(pipeline):=20ottimizzazione=20completa=20P?= =?UTF-8?q?DF=E2=86=92Markdown=20senza=20revisione=20manuale?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - converter: parametri adattivi (use_struct_tree per PDF taggati, table_method=cluster, content_safety_off) - transforms: +20 PUA bracket TeX U+F8EB-F8FE (290 simboli corretti su analisi1) - transforms: _t_math_header_demotion — demota header ##/### che sono enunciati esercizi o formule - report: metrica formula_headers_residui con esempi - validator: penalità formula_headers (−3/cad, cap −15), colonna fhdr nel report tabellare Risultato su analisi1: voto 92/A, PUA residui 0, formula-hdr residui 0 Co-Authored-By: Claude Sonnet 4.6 --- conversione/_pipeline/converter.py | 62 ++ conversione/_pipeline/report.py | 144 ++++ conversione/_pipeline/runner.py | 110 ++++ conversione/_pipeline/transforms.py | 974 ++++++++++++++++++++++++++++ conversione/_pipeline/validator.py | 152 +++++ 5 files changed, 1442 insertions(+) create mode 100644 conversione/_pipeline/converter.py create mode 100644 conversione/_pipeline/report.py create mode 100644 conversione/_pipeline/runner.py create mode 100644 conversione/_pipeline/transforms.py create mode 100644 conversione/_pipeline/validator.py diff --git a/conversione/_pipeline/converter.py b/conversione/_pipeline/converter.py new file mode 100644 index 0000000..38f028d --- /dev/null +++ b/conversione/_pipeline/converter.py @@ -0,0 +1,62 @@ +from pathlib import Path + + +def _is_tagged_pdf(pdf_path: Path) -> bool: + try: + import fitz + doc = fitz.open(str(pdf_path)) + tagged = "StructTreeRoot" in doc.pdf_catalog() + doc.close() + return tagged + except Exception: + return False + + +def convert_pdf(pdf_path: Path, out_dir: Path) -> Path: + """ + Converte il PDF in Markdown tramite 
opendataloader-pdf. + Scrive il file nella out_dir e restituisce il percorso. + + Parametri scelti per output RAG-ottimale: + - keep_line_breaks=False → testo fluente, no hard-wrap PDF + - reading_order="xycut" → corregge ordine multi-colonna (XY-Cut++) + - sanitize=False → preserva il testo originale + - image_output="off" → nessuna immagine estratta né referenziata + - table_method="cluster" → rileva tabelle senza bordi visibili + - content_safety_off → evita filtraggio di footnote (tiny) e layer OCG + - use_struct_tree → attivo solo se il PDF è taggato (Word/InDesign) + """ + import opendataloader_pdf + + out_dir.mkdir(parents=True, exist_ok=True) + tagged = _is_tagged_pdf(pdf_path) + + opendataloader_pdf.convert( + input_path=str(pdf_path), + output_dir=str(out_dir), + format="markdown", + keep_line_breaks=False, + reading_order="xycut", + sanitize=False, + image_output="off", + table_method="cluster", + content_safety_off=["tiny", "hidden-ocg"], + use_struct_tree=tagged, + quiet=True, + ) + + md_file = out_dir / f"{pdf_path.stem}.md" + if not md_file.exists(): + candidates = list(out_dir.glob("*.md")) + if not candidates: + raise RuntimeError(f"Nessun file .md prodotto in {out_dir}") + md_file = candidates[0] + + content = md_file.read_text(encoding="utf-8", errors="replace").strip() + if len(content) < 100: + raise RuntimeError( + f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) " + f"— il PDF potrebbe essere corrotto o non supportato" + ) + + return md_file diff --git a/conversione/_pipeline/report.py b/conversione/_pipeline/report.py new file mode 100644 index 0000000..093bc86 --- /dev/null +++ b/conversione/_pipeline/report.py @@ -0,0 +1,144 @@ +import json +import re +from collections import Counter +from datetime import datetime +from pathlib import Path + +from .structure import _parse_sections_with_body + + +def build_report( + stem: str, + out_dir: Path, + clean_text: str, + t_stats: dict, + profile: dict, + reduction: float, 
+) -> Path: + text_lines = clean_text.split("\n") + + sections = _parse_sections_with_body(clean_text, 3) + lengths = [len(body) for _, body in sections] + + def _pct(data: list[int], p: float) -> int: + if not data: + return 0 + s = sorted(data) + return s[max(0, min(len(s) - 1, int(len(s) * p)))] + + distribution = { + "min": min(lengths) if lengths else 0, + "p25": _pct(lengths, 0.25), + "mediana": _pct(lengths, 0.50), + "p75": _pct(lengths, 0.75), + "max": max(lengths) if lengths else 0, + } + + bare_hdrs = [ + {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")} + for hdr, body in sections + if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30 + ] + short_secs = [ + {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")} + for (hdr, body), length in zip(sections, lengths) + if 0 < length < 150 + ] + long_secs = [ + {"header": hdr, "chars": length} + for (hdr, _), length in zip(sections, lengths) + if length > 1500 + ] + + def _scan(pattern: str, max_n: int = 10) -> list[dict]: + hits = [] + for i, line in enumerate(text_lines): + if re.search(pattern, line) and not re.match(r"^#+ ", line): + hits.append({"riga": i + 1, "testo": line.strip()[:120]}) + if len(hits) >= max_n: + break + return hits + + _math_sym_scan = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" + ) + _ex_trigger_scan = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, + ) + + def _scan_formula_headers(max_n: int = 10) -> list[dict]: + hits = [] + for i, line in enumerate(text_lines): + m = re.match(r"^(#{2,3})\s+(.+)$", line) + if not m: + continue + body = m.group(2) + if len(body) <= 100: + continue + has_math = len(_math_sym_scan.findall(body)) >= 3 + has_ex = bool(_ex_trigger_scan.search(body)) + if has_math or has_ex: + hits.append({"riga": i + 1, "testo": line.strip()[:120]}) + if len(hits) >= max_n: + break + 
return hits + + residui = { + "backtick": _scan(r"`"), + "dotleader": _scan(r"(?:\. ){3,}"), + "url": _scan(r"^(https?://|www\.)\S+"), + "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"), + "br_inline": _scan(r"
"), + "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'), + "formule_inline": _scan(r"\[\d+\.\d+\]"), + "footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'), + "pua_markers": _scan(r'[-]'), + "formula_headers": _scan_formula_headers(), + } + + report = { + "stem": stem, + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"), + "transforms": { + **t_stats, + "riduzione_pct": round(reduction), + }, + "structure": profile, + "distribution": distribution, + "anomalie": { + "bare_headers": len(bare_hdrs), + "short_sections": len(short_secs), + "long_sections": len(long_secs), + "bare_headers_list": bare_hdrs, + "short_sections_list": short_secs, + "long_sections_list": long_secs, + }, + "residui": { + "backtick": len(residui["backtick"]), + "dotleader": len(residui["dotleader"]), + "url": len(residui["url"]), + "immagini": len(residui["immagini"]), + "br_inline": len(residui["br_inline"]), + "simboli_encoding": len(residui["simboli_encoding"]), + "formule_inline": len(residui["formule_inline"]), + "footnote_markers": len(residui["footnote_markers"]), + "pua_markers": len(residui["pua_markers"]), + "backtick_esempi": residui["backtick"], + "dotleader_esempi": residui["dotleader"], + "url_esempi": residui["url"], + "immagini_esempi": residui["immagini"], + "br_inline_esempi": residui["br_inline"], + "simboli_encoding_esempi": residui["simboli_encoding"], + "formule_inline_esempi": residui["formule_inline"], + "footnote_markers_esempi": residui["footnote_markers"], + "pua_markers_esempi": residui["pua_markers"], + "formula_headers": len(residui["formula_headers"]), + "formula_headers_esempi": residui["formula_headers"], + }, + } + + report_path = out_dir / "report.json" + report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + return report_path diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py new file mode 100644 index 0000000..7eb02dc --- /dev/null +++ b/conversione/_pipeline/runner.py @@ -0,0 
+1,110 @@ +import json +import tempfile +from pathlib import Path + +from .checker import check_pdf +from .converter import convert_pdf +from .transforms import apply_transforms +from .structure import analyze +from .report import build_report + + +_LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"} + + +def run(stem: str, project_root: Path, force: bool) -> bool: + pdf_path = project_root / "sources" / f"{stem}.pdf" + out_dir = project_root / "conversione" / stem + raw_out = out_dir / "raw.md" + clean_out = out_dir / "clean.md" + + print(f"\n{'─' * 52}") + print(f" {stem}") + print(f"{'─' * 52}") + + if clean_out.exists() and not force: + print(f" ⚠️ conversione/{stem}/clean.md già presente — skip") + print(f" (usa --force per rieseguire)") + return True + + # [1] Validazione + print(" [1/4] Validazione PDF...") + ok, msg = check_pdf(pdf_path) + if not ok: + print(f" ✗ {msg}") + return False + print(f" ✅ {msg}") + + # [2] Conversione + print(" [2/4] Conversione PDF → Markdown (opendataloader-pdf)...") + with tempfile.TemporaryDirectory() as tmp: + try: + md_file = convert_pdf(pdf_path, Path(tmp)) + except MemoryError: + print(" ✗ Memoria esaurita durante la conversione") + return False + except Exception as e: + print(f" ✗ Conversione fallita: {e}") + return False + try: + raw_text = md_file.read_text(encoding="utf-8") + except UnicodeDecodeError as e: + print(f" ✗ Errore encoding nel file prodotto: {e}") + return False + + size_kb = len(raw_text.encode()) // 1024 + n_lines = raw_text.count("\n") + print(f" ✅ Markdown grezzo: {size_kb} KB, {n_lines} righe") + + # [3] Pulizia strutturale + print(" [3/4] Pulizia strutturale...") + clean_text, t = apply_transforms(raw_text) + reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0 + print(f" ✅ Simboli PUA corretti: {t['n_simboli_pua_corretti']}") + print(f" Immagini rimosse: {t['n_immagini_rimosse']}") + print(f" Note rimosse: {t['n_note_rimosse']}") + print(f" 
Accenti corretti: {t['n_accenti_corretti']}") + print(f" Dot-leader rimossi: {t['n_dotleader_rimossi']}") + print(f" Header concat fixati: {t['n_header_concat_fixati']}") + print(f" Header num. normaliz.: {t['n_header_numerati_normalizzati']}") + print(f" Articoli → ###: {t['n_articoli_estratti']}") + print(f" Ambienti matematici: {t['n_ambienti_matematici']}") + print(f" Titoli header uniti: {t['n_titoli_uniti']}") + print(f" TOC rimosso: {'sì' if t['toc_rimosso'] else 'no'}") + print(f" Versi poesia riprist.: {t['n_versi_ripristinati']}") + print(f" Header verso demotati: {t['n_header_verso_demotati']}") + print(f" ALL-CAPS → ##: {t['n_header_allcaps']}") + print(f" Sezioni → ###: {t['n_sezioni_numerate']}") + print(f" Paragrafi uniti: {t['n_paragrafi_uniti']}") + print(f" Formula-hdr demotati: {t['n_formula_headers_demotati']}") + print(f" Riduzione testo: {reduction:.0f}%") + + # [4] Profilo strutturale + print(" [4/4] Analisi struttura...") + try: + out_dir.mkdir(parents=True, exist_ok=True) + raw_out.write_text(raw_text, encoding="utf-8") + clean_out.write_text(clean_text, encoding="utf-8") + except PermissionError as e: + print(f" ✗ Permesso negato durante la scrittura: {e}") + return False + + profile = analyze(clean_out) + (out_dir / "structure_profile.json").write_text( + json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8" + ) + + print(f" ✅ Struttura: livello {profile['livello_struttura']} — " + f"{_LIVELLO_DESC[profile['livello_struttura']]}") + print(f" h1={profile['n_h1']} h2={profile['n_h2']} h3={profile['n_h3']} " + f"paragrafi={profile['n_paragrafi']}") + print(f" Strategia chunking: {profile['strategia_chunking']}") + print(f" Lingua rilevata: {profile['lingua_rilevata']}") + for w in profile["avvertenze"]: + print(f" ⚠️ {w}") + + build_report(stem, out_dir, clean_text, t, profile, reduction) + + print(f"\n Output → conversione/{stem}/") + print(f" raw.md (immutabile) clean.md report.json") + return True diff --git 
a/conversione/_pipeline/transforms.py b/conversione/_pipeline/transforms.py new file mode 100644 index 0000000..1c6a7cd --- /dev/null +++ b/conversione/_pipeline/transforms.py @@ -0,0 +1,974 @@ +import re +from collections import Counter +from functools import partial + +# ─── Costanti ──────────────────────────────────────────────────────────────── + +_TOC_KEYWORDS = frozenset([ + "indice", "index", "contents", "table of contents", + "sommario", "inhaltsverzeichnis", "inhalt", + "indice generale", "indice analitico", "indice dei contenuti", + "elenco dei capitoli", "argomenti", "table des matières", + "tabla de contenidos", "содержание", +]) + +_ORDINALS_IT = { + "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV", + "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII", + "NONO": "IX", "DECIMO": "X", +} +_ORDINALS_EN = { + "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5", + "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10", +} + +# Mapping PUA Unicode (U+F020-U+F0FF) → simboli Unicode standard. +# Font Symbol di Windows codifica lettere greche e operatori matematici +# nel range Private Use Area invece dei codepoint Unicode standard. 
+_SYMBOL_PUA_MAP: dict[str, str] = { + "": " ", + "": "(", + "": ")", + "": "+", + "": "−", # minus + "": ".", + "": "/", + "": "0", "": "1", "": "2", "": "3", "": "4", + "": "5", "": "6", "": "7", "": "8", "": "9", + "": ":", "": ";", "": "<", "": "=", "": ">", + "": "≅", # congruent + "": "Α", # Alpha + "": "Β", # Beta + "": "Χ", # Chi + "": "Δ", # Delta + "": "Ε", # Epsilon + "": "Φ", # Phi + "": "Γ", # Gamma + "": "Η", # Eta + "": "Ι", # Iota + "": "ϑ", # theta variant + "": "Κ", # Kappa + "": "Λ", # Lambda + "": "Μ", # Mu + "": "Ν", # Nu + "": "Ο", # Omicron + "": "Π", # Pi + "": "Θ", # Theta + "": "Ρ", # Rho + "": "Σ", # Sigma + "": "Τ", # Tau + "": "Υ", # Upsilon + "": "ς", # sigma final + "": "Ω", # Omega + "": "Ξ", # Xi + "": "Ψ", # Psi + "": "Ζ", # Zeta + "": "[", + "": "∴", # therefore + "": "]", + "": "⊥", # perpendicular + "": "α", # alpha + "": "β", # beta + "": "χ", # chi + "": "δ", # delta + "": "ε", # epsilon + "": "φ", # phi + "": "γ", # gamma + "": "η", # eta + "": "ι", # iota + "": "ϕ", # phi variant + "": "κ", # kappa + "": "λ", # lambda + "": "μ", # mu + "": "ν", # nu + "": "ο", # omicron + "": "π", # pi + "": "θ", # theta + "": "ρ", # rho + "": "σ", # sigma + "": "τ", # tau + "": "υ", # upsilon + "": "ϖ", # pi symbol + "": "ω", # omega + "": "ξ", # xi + "": "ψ", # psi + "": "ζ", # zeta + "": "{", + "": "|", + "": "}", + "": "~", + "": "±", # plus-minus + "": "•", # bullet + "": "√", # square root + "": "≤", # less or equal + "": "≥", # greater or equal + "": "∝", # proportional + "": "×", # multiplication + "": "÷", # division + "": "×", # alternate multiply + "": "≠", # not equal + "": "≠", # not equal alternate + "": "≥", # greater or equal alternate + "": "′", # prime + "": "*", + "": ",", + "": "≤", # less or equal (Symbol 0xA3) + "": "•", # bullet (Wingdings 0xA7) + "": "•", # bullet variant + "": "→", # right arrow (Symbol 0xAE) + "": 
"÷", # division / range separator + "": "", # Wingdings decorative icon (rimosso) + "": "→", # right arrow variant + "": "", # bracket extension piece (non ricostruibile) + "": "", + "": "", + "": "", + "": "", + "": "", # TeX large paren left U+F8EB + "": "", # TeX large paren extension U+F8EC + "": "", # TeX large paren right U+F8ED + "": "", # TeX large paren right ext U+F8EE + "": "", # TeX large bracket left U+F8EF + "": "", # TeX large bracket ext U+F8F0 + "": "", # TeX brace top-left U+F8F1 + "": "", # TeX brace mid U+F8F2 + "": "", # TeX brace mid-right U+F8F3 + "": "", # TeX brace extension U+F8F4 + "": "", # TeX brace right U+F8F5 + "": "", # TeX bracket right large U+F8F6 + "": "", # TeX bracket right ext U+F8F7 + "": "", # TeX bracket right close U+F8F8 + "": "", # TeX integral large U+F8F9 + "": "", # TeX integral extension U+F8FA + "": "", # TeX integral top U+F8FB + "": "", # TeX radical top U+F8FC + "": "", # TeX radical extension U+F8FD + "": "", # TeX arrowhead U+F8FE +} + +_SYMBOL_PUA_RE = re.compile( + "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" +) + +_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+') +_FOOTNOTE_BODY_RE = re.compile( + r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)' +) +_NUMBERED_HDR_RE = re.compile( + r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$", + re.MULTILINE, +) +_BIB_MARKERS_RE = re.compile( + r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' + r'|\b(19|20)\d{2}\b', + re.IGNORECASE, +) +_WATERMARK_RE = re.compile( + r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN" + r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$", + re.IGNORECASE | re.MULTILINE, +) + +_MATH_SYMBOLS_RE = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" +) +_EXERCISE_TRIGGER_RE = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, +) +_MATH_HDR_RE = 
re.compile(r"^(#{2,3})\s+(.+)$") +_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) + +# Erano compilati dentro le funzioni a ogni chiamata — ora costanti di modulo +_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$") +_FM_RE = re.compile( + r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|" + r"\bCopyright\b|\bLicenza\b|\bEdizione\b|" + r"protetto da|tutti i diritti", + re.IGNORECASE, +) +_VERSE_NUM_RE = re.compile( + r'([.!?\xbb\'\"’]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab"“‟])' +) + + +# ─── Helper puri ───────────────────────────────────────────────────────────── + +def _sentence_case(s: str) -> str: + if not s: + return s + lower = s.lower() + return lower[0].upper() + lower[1:] + + +def _is_allcaps_line(line: str) -> bool: + stripped = line.strip() + letters = [c for c in stripped if c.isalpha()] + return ( + len(letters) >= 3 + and all(c.isupper() for c in letters) + and not stripped.startswith("#") + and not stripped.startswith("|") + ) + + +def _allcaps_to_header(raw_line: str) -> str: + text = re.sub(r"^[-*+]\s+", "", raw_line.strip()) + text = text.rstrip(".").rstrip("?").strip() + + _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys()) + m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text) + if m: + roman = _ORDINALS_IT[m.group(1)] + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Capitolo {roman} — {_sentence_case(titolo)}" + + _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys()) + m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text) + if m: + n = _ORDINALS_EN.get(m.group(1), m.group(1)) + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Chapter {n} — {_sentence_case(titolo)}" + + m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text) + if m: + return f"## {m.group(1)}. 
{_sentence_case(m.group(2).rstrip('.').strip())}" + + return f"## {_sentence_case(text)}" + + +def _extract_math_environments(text: str) -> tuple[str, int]: + _ENVS = ( + r"Definizione|Definition|Teorema|Theorem|Lemma|" + r"Proposizione|Proposition|Corollario|Corollary|" + r"Osservazione|Remark|Nota|Note|Esempio|Example" + ) + count = 0 + blocks = text.split("\n\n") + result = [] + + for block in blocks: + stripped = block.strip() + if not stripped or stripped.startswith("#"): + result.append(block) + continue + + m = re.match( + rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)", + stripped, + re.DOTALL, + ) + if not m: + result.append(block) + continue + + env = m.group(1) + num = m.group(2).rstrip(".") + rest = m.group(3).strip() + + title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL) + if title_m: + header = f"### {env} {num} {title_m.group(1)}" + body = title_m.group(2).strip() + else: + header = f"### {env} {num}." + body = rest + + result.append(f"{header}\n\n{body}" if body else header) + count += 1 + + return "\n\n".join(result), count + + +def _merge_title_headers(text: str) -> tuple[str, int]: + count = 0 + blocks = re.split(r"\n{2,}", text) + result = [] + i = 0 + while i < len(blocks): + block = blocks[i] + stripped = block.strip() + if ( + re.match(r"^#{2,3} \d+\.\s*$", stripped) + and i + 1 < len(blocks) + ): + nxt = blocks[i + 1].strip() + if ( + nxt + and "\n" not in nxt + and len(nxt) <= 80 + and not nxt.startswith("#") + and not re.match(r"^\d+[\.\)]\s", nxt) + ): + result.append(stripped.rstrip() + " " + nxt) + count += 1 + i += 2 + continue + result.append(block) + i += 1 + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count + + +def _extract_article_headers(text: str) -> tuple[str, int]: + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + rest = m.group(2).strip() + + title_m = re.match( + r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+" + 
r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})", + rest, + ) + if title_m: + count += 1 + return ( + f"### Art. {num}. {title_m.group(1)}.\n\n" + f"{title_m.group(2).strip()}" + ) + if rest: + count += 1 + return f"### Art. {num}.\n\n{rest}" + count += 1 + return f"### Art. {num}." + + text = re.sub( + r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)", + _repl, + text, + flags=re.MULTILINE, + ) + return text, count + + +# ─── Trasformazioni atomiche ────────────────────────────────────────────────── + +def _t_fix_symbol_font(text: str) -> tuple[str, int]: + count = [0] + + def _repl(m: re.Match) -> str: + count[0] += 1 + return _SYMBOL_PUA_MAP[m.group(0)] + + result = _SYMBOL_PUA_RE.sub(_repl, text) + return result, count[0] + + +def _t_remove_images(text: str) -> tuple[str, int]: + n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) + text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) + return text, n + + +def _t_remove_footnotes(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + stripped = line.strip() + if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300: + count += 1 + continue + cleaned = _SUPERSCRIPT_RE.sub("", line) + if cleaned != line: + count += 1 + result.append(cleaned) + return "\n".join(result), count + + +def _t_fix_br(text: str) -> tuple[str, int]: + n = len(re.findall(r"
", text, re.IGNORECASE)) + text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE) + return text, n + + +def _t_fix_tabsep(text: str) -> tuple[str, int]: + n = len(_TABSEP_RE.findall(text)) + text = _TABSEP_RE.sub("", text) + return text, n + + +def _t_fix_accents(text: str) -> tuple[str, int]: + _ACCENT_MAP = { + "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0", + "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc", + "o": "\xf2", "O": "\xd2", + } + n_bt_before = text.count("`") + text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text) + text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text) + n_accenti = n_bt_before - text.count("`") + n_bt_orfani = text.count("`") + if n_bt_orfani: + text = re.sub(r"`", "", text) + n_accenti += n_bt_orfani + return text, n_accenti + + +def _t_fix_multiplication(text: str) -> tuple[str, int]: + n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text)) + text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text) + return text, n + + +def _t_fix_micro(text: str) -> tuple[str, int]: + _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]' + n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text)) + text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text) + return text, n + + +def _t_remove_formula_labels(text: str) -> tuple[str, int]: + n = len(re.findall(r"\[\d+\.\d+\]", text)) + text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text) + return text, n + + +def _t_remove_dotleaders(text: str) -> tuple[str, int]: + _DOTLEADER_RE = r"^[^\n]*(?:(?:\. 
){3,}|\.{4,})[^\n]*$" + n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE)) + text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE) + text = re.sub( + r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$", + "", + text, + flags=re.IGNORECASE, + ) + return text, n + + +def _t_fix_header_concat(text: str) -> tuple[str, int]: + count = 0 + + def _fix(m: re.Match) -> str: + nonlocal count + hashes = m.group(1) + full = m.group(2).strip() + if len(full) < 60: + return m.group(0) + skip = min(10, len(full) // 3) + split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", full[skip:]) + if split: + pos = skip + split.start() + title = full[:pos].strip() + body = full[pos:].strip() + if len(title) >= 5 and len(body) >= 15: + count += 1 + return f"{hashes} {title}\n\n{body}" + return m.group(0) + + text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE) + return text, count + + +def _t_extract_capitolo(text: str) -> tuple[str, int]: + def _repl(m: re.Match) -> str: + num = m.group(1) + titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip()) + return f"\n\n## Capitolo {num}: {titolo}\n\n" + + text = re.sub( + r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]" + r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)" + r"(?=\s*[-–]\s*\d|\s*\n|\s*$)", + _repl, + text, + ) + return text, 0 + + +def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: + all_matches = list(_NUMBERED_HDR_RE.finditer(text)) + if not all_matches: + return text, 0 + + pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches] + depths = [d for d, _ in pairs] + min_depth = min(depths) + max_depth = max(depths) + if max_depth == min_depth: + return text, 0 + + base_level = min(lv for d, lv in pairs if d == min_depth) + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + hashes, num, title = m.group(1), m.group(2), m.group(3) + depth = num.count(".") + 
1 + new_level = min(base_level + (depth - min_depth), 6) + if new_level == len(hashes): + return m.group(0) + count += 1 + return f"{'#' * new_level} {num}. {title}" + + return _NUMBERED_HDR_RE.sub(_repl, text), count + + +def _t_normalize_header_levels(text: str) -> tuple[str, int]: + text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) + text = re.sub( + r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", + lambda m: f"### {m.group(2)}. {m.group(3)}", + text, + flags=re.MULTILINE, + ) + text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE) + return text, 0 + + +def _t_extract_articles(text: str) -> tuple[str, int]: + return _extract_article_headers(text) + + +def _t_remove_header_bold(text: str) -> tuple[str, int]: + text = re.sub( + r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", + r"\1 \2", + text, flags=re.MULTILINE, + ) + return text, 0 + + +def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]: + def _norm(m: re.Match) -> str: + hashes, content = m.group(1), m.group(2).strip() + letters = [c for c in content if c.isalpha()] + if letters and all(c.isupper() for c in letters): + return f"{hashes} {_sentence_case(content)}" + return m.group(0) + + text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE) + return text, 0 + + +def _t_remove_toc(text: str) -> tuple[str, int]: + lines = text.split("\n") + new_lines = [] + _in_toc = False + removed = False + for line in lines: + bare = re.sub(r"^#+\s*", "", line.strip()) + first_word = bare.split(".")[0].strip().lower() + if first_word in _TOC_KEYWORDS: + removed = True + _in_toc = True + continue + if _in_toc: + if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): + continue + if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): + continue + if len(line.strip()) > 200: + _in_toc = False + new_lines.append(line) + continue + _in_toc = False + new_lines.append(line) + return "\n".join(new_lines), 1 if removed else 0 + + +def _t_allcaps_to_headers(text: str) -> tuple[str, int]: + count = 0 
+ blocks = text.split("\n\n") + new_blocks = [] + for block in blocks: + stripped = block.strip() + if "\n" not in stripped and _is_allcaps_line(stripped): + new_blocks.append(_allcaps_to_header(stripped)) + count += 1 + else: + sub_lines = block.split("\n") + converted = [] + for ln in sub_lines: + if _is_allcaps_line(ln) and len(ln.strip()) > 3: + converted.append(_allcaps_to_header(ln)) + count += 1 + else: + converted.append(ln) + new_blocks.append("\n".join(converted)) + return "\n\n".join(new_blocks), count + + +def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: + count = 0 + + def _num_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if content.endswith(".") and len(content) > 40: + return m.group(0) + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"### {m.group(1)}.\n\n{content}" + + text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) + + def _num_letter_repl(m: re.Match) -> str: + nonlocal count + count += 1 + return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" + + text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) + + if not has_exercises: + def _aphorism_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"\n\n### {m.group(1)}.\n\n{content}" + + text = re.sub( + r"^-\s+(\d{1,3})\.\s+(.{10,})$", + _aphorism_repl, + text, + flags=re.MULTILINE, + ) + + def _list_section_repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content) + if split and split.start() >= 3: + title = content[: split.start()].strip() + body = content[split.end():].strip() + if len(body) >= 20: + 
return f"\n\n### {num}. {title}\n\n{body}" + return f"\n\n### {num}. {content}" + + text = re.sub( + r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", + _list_section_repl, + text, + flags=re.MULTILINE, + ) + return text, count + + +def _t_extract_math(text: str) -> tuple[str, int]: + return _extract_math_environments(text) + + +def _t_merge_paragraphs(text: str) -> tuple[str, int]: + _SENTENCE_END = set(".?!\xbb)\"'") + blocks = text.split("\n\n") + merged = [] + count = 0 + i = 0 + while i < len(blocks): + b = blocks[i] + stripped = b.strip() + while ( + i + 1 < len(blocks) + and stripped + and not stripped.startswith("#") + and not stripped.startswith("|") + and stripped[-1] not in _SENTENCE_END + ): + nxt = blocks[i + 1].strip() + if ( + not nxt + or nxt.startswith("#") + or nxt.startswith("|") + or re.match(r"^\d+\.", nxt) + or re.match(r"^[-*+]\s", nxt) + ): + break + b = stripped + " " + nxt + stripped = b.strip() + count += 1 + i += 1 + merged.append(b) + i += 1 + text = "\n\n".join(merged) + text = re.sub(r"(?m)^\|---\|\s*", "", text) + return text, count + + +def _t_normalize_whitespace(text: str) -> tuple[str, int]: + lines = text.split("\n") + text = "\n".join( + re.sub(r" +", " ", line) if line.strip() else line + for line in lines + ) + return text, 0 + + +def _t_collapse_blank_lines(text: str) -> tuple[str, int]: + return re.sub(r"\n{3,}", "\n\n", text), 0 + + +def _t_demote_verse_headers(text: str) -> tuple[str, int]: + count = 0 + + def _demote(m: re.Match) -> str: + nonlocal count + hashes, content = m.group(1), m.group(2).strip() + if not re.search(r"\s\d{1,4}\s*$", content): + return m.group(0) + inner = re.sub(r"\s\d{1,4}\s*$", "", content) + if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab"“]', inner): + return m.group(0) + count += 1 + clean = re.sub(r"\s\d{1,4}\s*$", "", content) + return clean + + text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE) + return text, count + + +def 
def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
    """Re-break verse that was flattened into a single paragraph.

    Each blank-line-separated block (headers and empty blocks excepted) is
    scanned with ``_VERSE_NUM_RE``. When at least two matches are found,
    their numbers (group 2) differ by at most two distinct gaps, and the
    first gap is between 1 and 5, every match is replaced by its group 1
    (right-stripped) followed by a line break. A blank line is used
    instead when the number is a multiple of three times the first gap.

    Returns:
        ``(new_text, count)`` where ``count`` adds the number of matches of
        every block that actually changed.
    """
    restored = 0
    rebuilt: list[str] = []

    for chunk in text.split("\n\n"):
        body = chunk.strip()
        if not body or body.startswith("#"):
            rebuilt.append(chunk)
            continue

        hits = list(_VERSE_NUM_RE.finditer(body))
        if len(hits) < 2:
            rebuilt.append(chunk)
            continue

        numbers = [int(hit.group(2)) for hit in hits]
        gaps = [later - earlier for earlier, later in zip(numbers, numbers[1:])]
        first_gap = gaps[0]
        if len(set(gaps)) > 2 or first_gap < 1 or first_gap > 5:
            rebuilt.append(chunk)
            continue

        stanza_period = first_gap * 3

        def _break_line(match: re.Match, period: int = stanza_period) -> str:
            number = int(match.group(2))
            joiner = "\n\n" if number % period == 0 else "\n"
            return match.group(1).rstrip() + joiner

        reflowed = _VERSE_NUM_RE.sub(_break_line, body)
        if reflowed != body:
            restored += len(hits)
        rebuilt.append(reflowed)

    return "\n\n".join(rebuilt), restored


def _t_remove_urls(text: str) -> tuple[str, int]:
    """Blank out lines that consist of a bare ``http(s)://`` or ``www.`` URL.

    The count is always 0.
    """
    without_urls = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
    return without_urls, 0


def _t_remove_empty_headers(text: str) -> tuple[str, int]:
    """Drop single-line headers that have no body of their own.

    A header block is removed when nothing follows it, or when the next
    block is itself a header of at most 80 characters. A following header
    longer than 80 characters counts as content and keeps the header
    alive. The count is always 0.
    """
    header_re = re.compile(r"^#{1,6} ")
    chunks = re.split(r"\n{2,}", text)
    kept: list[str] = []

    for idx, chunk in enumerate(chunks):
        current = chunk.strip()
        if header_re.match(current) and "\n" not in current:
            follower = chunks[idx + 1].strip() if idx + 1 < len(chunks) else ""
            short_header_follows = (
                header_re.match(follower) is not None and len(follower) <= 80
            )
            if not follower or short_header_follows:
                continue
        kept.append(chunk)

    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), 0


def _t_merge_title_headers(text: str) -> tuple[str, int]:
    """Delegate title-header merging to ``_merge_title_headers``."""
    outcome = _merge_title_headers(text)
    return outcome


def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
    """Delete header lines whose title is extraction noise.

    A title is treated as noise when any of these holds: it starts with
    ``...``; it has no run of two letters; it is just 1-4 ASCII letters
    (optionally parenthesised); it is longer than 60 characters and
    contains ``!``/``%``/``#`` glued to a word or a hyphenated split; its
    first letter is lowercase and it is longer than 40 characters; it
    starts like a short assignment/comparison (``x =``, ``ab <``); or it
    is a figure/table caption (``Figura 3``, ``Tab. 2``).

    Returns:
        ``(new_text, number_of_header_lines_removed)``; runs of three or
        more newlines left behind are collapsed to two.
    """

    def _looks_like_noise(title: str) -> bool:
        trimmed = title.strip()
        initial = next((ch for ch in title if ch.isalpha()), None)
        return bool(
            title.lstrip().startswith("...")
            or not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", title)
            or re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", trimmed)
            or (
                len(title) > 60
                and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", title)
            )
            or (initial and initial.islower() and len(title) > 40)
            or re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", trimmed)
            or re.match(
                r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d",
                trimmed,
                re.IGNORECASE,
            )
        )

    survivors: list[str] = []
    removed = 0
    for row in text.split("\n"):
        header = re.match(r"^#{1,6} (.+)$", row)
        if header is not None and _looks_like_noise(header.group(1)):
            removed += 1
        else:
            survivors.append(row)

    return re.sub(r"\n{3,}", "\n\n", "\n".join(survivors)), removed


def _t_remove_frontmatter(text: str) -> tuple[str, int]:
    """Remove front-matter ``###`` headers near the start of the document.

    Only the first blocks are examined: 20% of the block count, clamped to
    the range 5-15. Within that window an unnumbered ``### `` header block
    is dropped when ``_FM_RE`` matches the header itself, or matches the
    following block and that block is shorter than 250 characters. Only
    the header block is removed; the following block is kept.

    Returns:
        ``(new_text, number_of_headers_removed)``.
    """
    chunks = re.split(r"\n{2,}", text)
    scan_limit = max(5, min(15, int(len(chunks) * 0.20)))
    kept: list[str] = []
    dropped = 0

    for idx, chunk in enumerate(chunks):
        head = chunk.strip()
        is_candidate = (
            idx < scan_limit
            and re.match(r"^### ", head) is not None
            and re.match(r"^### \d", head) is None
        )
        if is_candidate:
            follower = chunks[idx + 1].strip() if idx + 1 < len(chunks) else ""
            body_hit = len(follower) < 250 and _FM_RE.search(follower)
            if body_hit or _FM_RE.search(head):
                dropped += 1
                continue
        kept.append(chunk)

    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), dropped


def _t_remove_watermarks(text: str) -> tuple[str, int]:
    """Drop every line that ``_WATERMARK_RE`` matches from its start.

    Returns:
        ``(new_text, number_of_lines_removed)``.
    """
    rows = text.split("\n")
    clean = [row for row in rows if not _WATERMARK_RE.match(row)]
    return "\n".join(clean), len(rows) - len(clean)


def _t_fix_math_symbols(text: str) -> tuple[str, int]:
    """Drop non-blank lines made only of box/bullet glyphs and whitespace.

    Blank lines are kept.

    Returns:
        ``(new_text, number_of_lines_removed)``.
    """
    glyph_only = re.compile(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$")
    rows = text.split("\n")
    kept = [row for row in rows if not (row.strip() and glyph_only.match(row))]
    return "\n".join(kept), len(rows) - len(kept)
def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
    """Remove short lines that repeat throughout the document.

    Lines whose stripped form is 4-79 characters long and does not start
    with ``#`` or ``|`` are tallied. Any stripped form seen five or more
    times is treated as recurring, and every line equal to it (after
    stripping) is dropped.

    Returns:
        ``(new_text, number_of_lines_removed)``; the input is returned
        untouched when nothing recurs.
    """
    rows = text.split("\n")
    tally: Counter = Counter()
    for row in rows:
        key = row.strip()
        if 3 < len(key) < 80 and not key.startswith(("#", "|")):
            tally[key] += 1

    boilerplate = {key for key, seen in tally.items() if seen >= 5}
    if not boilerplate:
        return text, 0

    kept = [row for row in rows if row.strip() not in boilerplate]
    return "\n".join(kept), len(rows) - len(kept)


def _t_math_header_demotion(text: str) -> tuple[str, int]:
    """Demote long headers that are really formulas or exercise statements.

    A line matched by ``_MATH_HDR_RE`` is rewritten when its group 2 is
    longer than 100 characters and either contains at least three
    ``_MATH_SYMBOLS_RE`` matches or triggers ``_EXERCISE_TRIGGER_RE``.
    The header is replaced by that group alone, or by ``**<g1>** <g2>``
    when ``_NUMBERED_PREFIX_RE`` matches it. All other lines pass through
    unchanged.

    Returns:
        ``(new_text, number_of_headers_demoted)``.
    """
    output: list[str] = []
    demoted = 0

    for row in text.split("\n"):
        header = _MATH_HDR_RE.match(row)
        if header is None:
            output.append(row)
            continue

        title = header.group(2)
        if len(title) <= 100:
            output.append(row)
            continue

        math_heavy = len(_MATH_SYMBOLS_RE.findall(title)) >= 3
        exercise_like = bool(_EXERCISE_TRIGGER_RE.search(title))
        if not math_heavy and not exercise_like:
            output.append(row)
            continue

        numbered = _NUMBERED_PREFIX_RE.match(title)
        if numbered:
            output.append(f"**{numbered.group(1)}** {numbered.group(2)}")
        else:
            output.append(title)
        demoted += 1

    return "\n".join(output), demoted


# ─── Orchestrator ────────────────────────────────────────────────────────────

def apply_transforms(text: str) -> tuple[str, dict]:
    """Apply the structural transforms to the raw Markdown, in order.

    Each step is a callable ``text -> (text, n)``. Steps listed with a
    statistics key add ``n`` to that key; steps listed with ``None`` are
    applied without being counted. The order is semantic: encoding →
    header structure → structure building → text → finishing touches.

    Returns:
        ``(transformed_text, stats)``. ``stats["toc_rimosso"]`` is coerced
        to ``bool``; every other entry is the accumulated count.
    """
    # Detected once on the untouched input and handed to the
    # numbered-sections step.
    has_exercises = bool(
        re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)
    )

    def _strip_header_page_refs(t: str) -> tuple[str, int]:
        # Removes a trailing "| <1-3 digits>" from header lines.
        return re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0

    pipeline: tuple[tuple[str | None, object], ...] = (
        ("n_simboli_pua_corretti", _t_fix_symbol_font),
        ("n_immagini_rimosse", _t_remove_images),
        ("n_br_rimossi", _t_fix_br),
        ("n_tabsep_rimossi", _t_fix_tabsep),
        ("n_note_rimosse", _t_remove_footnotes),
        ("n_accenti_corretti", _t_fix_accents),
        ("n_moltiplicazioni_corrette", _t_fix_multiplication),
        ("n_micro_corretti", _t_fix_micro),
        ("n_simboli_math_rimossi", _t_fix_math_symbols),
        ("n_formule_rimossi", _t_remove_formula_labels),
        ("n_dotleader_rimossi", _t_remove_dotleaders),
        ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines),
        ("n_header_concat_fixati", _t_fix_header_concat),
        (None, _t_extract_capitolo),
        ("n_header_numerati_normalizzati", _t_normalize_numbered_headings),
        (None, _t_normalize_header_levels),
        ("n_articoli_estratti", _t_extract_articles),
        (None, _t_remove_header_bold),
        (None, _t_normalize_allcaps_headers),
        ("toc_rimosso", _t_remove_toc),
        ("n_header_allcaps", _t_allcaps_to_headers),
        ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=has_exercises)),
        ("n_ambienti_matematici", _t_extract_math),
        ("n_paragrafi_uniti", _t_merge_paragraphs),
        (None, _t_normalize_whitespace),
        (None, _t_collapse_blank_lines),
        ("n_versi_ripristinati", _t_restore_poetry_lines),
        ("n_header_verso_demotati", _t_demote_verse_headers),
        (None, _t_remove_urls),
        (None, _t_remove_empty_headers),
        ("n_titoli_uniti", _t_merge_title_headers),
        (None, _strip_header_page_refs),
        ("n_garbage_headers_rimossi", _t_remove_garbage_headers),
        ("n_formula_headers_demotati", _t_math_header_demotion),
        ("n_frontmatter_rimossi", _t_remove_frontmatter),
        ("n_watermark_rimossi", _t_remove_watermarks),
    )

    totals: dict = {}
    for key, step in pipeline:
        text, delta = step(text)
        if key:
            totals[key] = totals.get(key, 0) + delta

    totals["toc_rimosso"] = bool(totals.get("toc_rimosso", 0))
    return text, totals
import json
import sys
from pathlib import Path

# (minimum score, letter) pairs, ordered from the highest threshold down.
_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]


def _score(r: dict) -> tuple[int, list[str]]:
    """Rate a conversion report from 0 to 100.

    The score starts at 100 and loses points as follows.

    Structure penalties (``structure.livello_struttura``):
        level 0 (absent)   → −40
        level 1 (flat)     → −15

    Residue penalties (counts read from ``residui``):
        backtick           → −2 each (max −20)
        dot-leader         → −5 each (max −10)
        URL/watermark      → −5 each (max −15)
        images             → −5 each (max −10)
        <br> inline        → −2 each (max −15)
        encoding symbols   → −1 each (max −10)
        inline formulas    → −1 each (max −8)
        footnote residues  → −1 each (max −8)
        PUA characters     → −2 each (max −20)
        formula headers    → −3 each (max −15)

    Anomaly penalties (counts read from ``anomalie``):
        bare headers       → −3 each (max −15)

    Args:
        r: Parsed report; missing sections and counters count as zero.

    Returns:
        ``(score, detail)`` where ``score`` is clamped at 0 and ``detail``
        holds one human-readable line per penalty applied.
    """
    score = 100
    detail: list[str] = []
    structure = r.get("structure", {})
    anomalie = r.get("anomalie", {})
    residui = r.get("residui", {})

    livello = structure.get("livello_struttura", 0)
    if livello == 0:
        score -= 40
        detail.append("struttura assente −40")
    elif livello == 1:
        score -= 15
        detail.append("struttura piatta −15")

    def _pen(key: str, per_item: int, cap: int, label: str) -> None:
        # Subtract per_item points for each residue of this kind, up to cap.
        nonlocal score
        n = residui.get(key, 0)
        if n:
            p = min(cap, n * per_item)
            score -= p
            detail.append(f"{label} ×{n} −{p}")

    _pen("backtick", 2, 20, "backtick")
    _pen("dotleader", 5, 10, "dot-leader")
    _pen("url", 5, 15, "url")
    _pen("immagini", 5, 10, "immagini")
    _pen("br_inline", 2, 15, "<br> inline")
    _pen("simboli_encoding", 1, 10, "simboli encoding")
    _pen("formule_inline", 1, 8, "formule inline")
    _pen("footnote_markers", 1, 8, "footnote residui")
    _pen("pua_markers", 2, 20, "caratteri PUA font Symbol")
    _pen("formula_headers", 3, 15, "formula/esercizio come header")

    n_bare = anomalie.get("bare_headers", 0)
    if n_bare:
        p = min(15, n_bare * 3)
        score -= p
        detail.append(f"bare headers ×{n_bare} −{p}")

    return max(0, score), detail


def _grade(score: int) -> str:
    """Map a numeric score to its letter grade.

    Returns the letter of the first ``_GRADES`` threshold the score
    reaches; a score below every threshold gets ``"F"`` instead of
    raising ``StopIteration``.
    """
    return next((g for threshold, g in _GRADES if score >= threshold), "F")


def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
    """Print a quality table for the reports under ``conversione/``.

    Args:
        stems: Document folders to check; when empty, every
            ``conversione/*/report.json`` is used.
        project_root: Directory that contains ``conversione/``.
        detail: When true, list each penalty under its document row.

    Exits with status 0 after printing a notice when no report path is
    found. Reports that do not exist on disk get a placeholder row and
    are left out of the average.
    """
    conv_dir = project_root / "conversione"

    paths = (
        [conv_dir / s / "report.json" for s in stems]
        if stems
        else sorted(conv_dir.glob("*/report.json"))
    )

    if not paths:
        print("Nessun report.json trovato in conversione/*/")
        sys.exit(0)

    rows = []
    for p in paths:
        if not p.exists():
            rows.append({"stem": p.parent.name, "_missing": True})
            continue
        report = json.loads(p.read_text(encoding="utf-8"))
        # A report saved without "stem" used to crash the row print with
        # KeyError; its folder name identifies it just as well.
        report.setdefault("stem", p.parent.name)
        rows.append(report)

    col = max(len(r["stem"]) for r in rows) + 2
    header = (
        f"{'stem':<{col}}"
        f"{'h2':>4}{'h3':>5} "
        f"{'strategia':<18}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}"
        f"{'med':>6}"
        f" {'voto':>4} grade"
    )
    sep = "─" * len(header)
    print(f"\n{header}\n{sep}")

    scores = []
    for r in rows:
        if r.get("_missing"):
            print(f"{r['stem']:<{col}} (report.json non trovato)")
            continue

        st = r.get("structure", {})
        an = r.get("anomalie", {})
        res = r.get("residui", {})
        dist = r.get("distribution", {})
        s, pen = _score(r)
        scores.append(s)

        print(
            f"{r['stem']:<{col}}"
            f"{st.get('n_h2', 0):>4}"
            f"{st.get('n_h3', 0):>5} "
            f"{st.get('strategia_chunking','?'):<18}"
            f"{an.get('bare_headers', 0):>5}"
            f"{an.get('short_sections', 0):>6}"
            f"{an.get('long_sections', 0):>7}"
            f"{res.get('backtick', 0):>5}"
            f"{res.get('br_inline', 0):>4}"
            f"{res.get('simboli_encoding', 0):>4}"
            f"{res.get('url', 0):>4}"
            f"{res.get('formula_headers', 0):>5}"
            f"{dist.get('mediana', 0):>6}"
            f" {s:>4} {_grade(s)}"
        )
        if detail and pen:
            for p in pen:
                print(f" {'':>{col}} ↳ {p}")

    print(sep)
    if scores:
        # Round once and grade the rounded value, so the letter always
        # agrees with the number printed next to it (89.7 used to show as
        # "90/100" with grade B because the grade truncated instead).
        media = round(sum(scores) / len(scores))
        print(
            f"Documenti: {len(scores)} "
            f"Media: {media}/100 {_grade(media)} "
            f"(A≥90 B≥75 C≥60 D≥40 F<40)"
        )
    print(
        "\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch "
        "btk=backtick br=<br> inline enc=simboli encoding fhdr=formula-header med=mediana chars\n"
    )