From faa8acae84732297566419c7d93649c9b5941d00 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 30 Apr 2026 14:58:15 +0200
Subject: [PATCH] =?UTF-8?q?feat(pipeline):=20ottimizzazione=20completa=20P?=
 =?UTF-8?q?DF=E2=86=92Markdown=20senza=20revisione=20manuale?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- converter: parametri adattivi (use_struct_tree per PDF taggati, table_method=cluster, content_safety_off)
- transforms: +20 PUA bracket TeX U+F8EB-F8FE (290 simboli corretti su analisi1)
- transforms: _t_math_header_demotion — demota header ##/### che sono enunciati esercizi o formule
- report: metrica formula_headers_residui con esempi
- validator: penalità formula_headers (−3/cad, cap −15), colonna fhdr nel report tabellare

Risultato su analisi1: voto 92/A, PUA residui 0, formula-hdr residui 0

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 conversione/_pipeline/converter.py  |  62 ++
 conversione/_pipeline/report.py     | 144 ++++
 conversione/_pipeline/runner.py     | 110 ++++
 conversione/_pipeline/transforms.py | 974 ++++++++++++++++++++++++++++
 conversione/_pipeline/validator.py  | 152 +++++
 5 files changed, 1442 insertions(+)
 create mode 100644 conversione/_pipeline/converter.py
 create mode 100644 conversione/_pipeline/report.py
 create mode 100644 conversione/_pipeline/runner.py
 create mode 100644 conversione/_pipeline/transforms.py
 create mode 100644 conversione/_pipeline/validator.py

diff --git a/conversione/_pipeline/converter.py b/conversione/_pipeline/converter.py
new file mode 100644
index 0000000..38f028d
--- /dev/null
+++ b/conversione/_pipeline/converter.py
@@ -0,0 +1,62 @@
+from pathlib import Path
+
+
+def _is_tagged_pdf(pdf_path: Path) -> bool:
+    """Return True if the PDF catalog has a /StructTreeRoot key; False on any failure."""
+    try:
+        import fitz
+        with fitz.open(str(pdf_path)) as doc:  # context manager closes doc on error too
+            # pdf_catalog() yields the catalog's xref (an int), so list its keys explicitly.
+            return "StructTreeRoot" in doc.xref_get_keys(doc.pdf_catalog())
+    except Exception:
+        return False
+
+
+def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
+    """
+    Convert the PDF to Markdown through opendataloader-pdf.
+    Write the file into out_dir and return its path.
+
+    Parameters chosen for RAG-friendly output:
+      - keep_line_breaks=False   → flowing text, no PDF hard wraps
+      - reading_order="xycut"    → fixes multi-column reading order (XY-Cut++)
+      - sanitize=False           → keeps the original text
+      - image_output="off"       → no image is extracted or referenced
+      - table_method="cluster"   → detects tables without visible borders
+      - content_safety_off       → avoids filtering footnotes (tiny) and OCG layers
+      - use_struct_tree          → enabled only when the PDF is tagged (Word/InDesign)
+    """
+    import opendataloader_pdf
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+    tagged = _is_tagged_pdf(pdf_path)
+
+    opendataloader_pdf.convert(
+        input_path=str(pdf_path),
+        output_dir=str(out_dir),
+        format="markdown",
+        keep_line_breaks=False,
+        reading_order="xycut",
+        sanitize=False,
+        image_output="off",
+        table_method="cluster",
+        content_safety_off=["tiny", "hidden-ocg"],
+        use_struct_tree=tagged,
+        quiet=True,
+    )
+
+    md_file = out_dir / f"{pdf_path.stem}.md"
+    if not md_file.exists():
+        candidates = list(out_dir.glob("*.md"))
+        if not candidates:
+            raise RuntimeError(f"Nessun file .md prodotto in {out_dir}")
+        md_file = candidates[0]
+
+    content = md_file.read_text(encoding="utf-8", errors="replace").strip()
+    if len(content) < 100:
+        raise RuntimeError(
+            f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) "
+            f"— il PDF potrebbe essere corrotto o non supportato"
+        )
+
+    return md_file
diff --git a/conversione/_pipeline/report.py b/conversione/_pipeline/report.py
new file mode 100644
index 0000000..093bc86
--- /dev/null
+++ b/conversione/_pipeline/report.py
@@ -0,0 +1,144 @@
+import json
+import re
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+
+from .structure import _parse_sections_with_body
+
+
+def build_report(
+    stem: str,
+    out_dir: Path,
+    clean_text: str,
+    t_stats: dict,
+    profile: dict,
+    reduction: float,
+) -> Path:
+    text_lines = clean_text.split("\n")
+
+    sections = _parse_sections_with_body(clean_text, 3)
+    lengths  = [len(body) for _, body in sections]
+
+    def _pct(data: list[int], p: float) -> int:
+        if not data:
+            return 0
+        s = sorted(data)
+        return s[max(0, min(len(s) - 1, int(len(s) * p)))]
+
+    distribution = {
+        "min":     min(lengths) if lengths else 0,
+        "p25":     _pct(lengths, 0.25),
+        "mediana": _pct(lengths, 0.50),
+        "p75":     _pct(lengths, 0.75),
+        "max":     max(lengths) if lengths else 0,
+    }
+
+    bare_hdrs = [
+        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
+        for hdr, body in sections
+        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
+    ]
+    short_secs = [
+        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
+        for (hdr, body), length in zip(sections, lengths)
+        if 0 < length < 150
+    ]
+    long_secs = [
+        {"header": hdr, "chars": length}
+        for (hdr, _), length in zip(sections, lengths)
+        if length > 1500
+    ]
+
+    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
+        hits = []
+        for i, line in enumerate(text_lines):
+            if re.search(pattern, line) and not re.match(r"^#+ ", line):
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    _math_sym_scan = re.compile(
+        r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]"
+    )
+    _ex_trigger_scan = re.compile(
+        r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that"
+        r"|Compute|Calculate|Dimostrare|Verificare)\b",
+        re.IGNORECASE,
+    )
+
+    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
+        hits = []
+        for i, line in enumerate(text_lines):
+            m = re.match(r"^(#{2,3})\s+(.+)$", line)
+            if not m:
+                continue
+            body = m.group(2)
+            if len(body) <= 100:
+                continue
+            has_math = len(_math_sym_scan.findall(body)) >= 3
+            has_ex = bool(_ex_trigger_scan.search(body))
+            if has_math or has_ex:
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    residui = {
+        "backtick":         _scan(r"`"),
+        "dotleader":        _scan(r"(?:\. ){3,}"),
+        "url":              _scan(r"^(https?://|www\.)\S+"),
+        "immagini":         _scan(r"!\[[^\]]*\]\([^)]*\)"),
+        "br_inline":        _scan(r"<br>"),
+        "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
+        "formule_inline":   _scan(r"\[\d+\.\d+\]"),
+        "footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'),
+        "pua_markers":      _scan(r'[-]'),
+        "formula_headers":  _scan_formula_headers(),
+    }
+
+    report = {
+        "stem":      stem,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
+        "transforms": {
+            **t_stats,
+            "riduzione_pct": round(reduction),
+        },
+        "structure":    profile,
+        "distribution": distribution,
+        "anomalie": {
+            "bare_headers":        len(bare_hdrs),
+            "short_sections":      len(short_secs),
+            "long_sections":       len(long_secs),
+            "bare_headers_list":   bare_hdrs,
+            "short_sections_list": short_secs,
+            "long_sections_list":  long_secs,
+        },
+        "residui": {
+            "backtick":                   len(residui["backtick"]),
+            "dotleader":                  len(residui["dotleader"]),
+            "url":                        len(residui["url"]),
+            "immagini":                   len(residui["immagini"]),
+            "br_inline":                  len(residui["br_inline"]),
+            "simboli_encoding":           len(residui["simboli_encoding"]),
+            "formule_inline":             len(residui["formule_inline"]),
+            "footnote_markers":           len(residui["footnote_markers"]),
+            "pua_markers":                len(residui["pua_markers"]),
+            "backtick_esempi":            residui["backtick"],
+            "dotleader_esempi":           residui["dotleader"],
+            "url_esempi":                 residui["url"],
+            "immagini_esempi":            residui["immagini"],
+            "br_inline_esempi":           residui["br_inline"],
+            "simboli_encoding_esempi":    residui["simboli_encoding"],
+            "formule_inline_esempi":      residui["formule_inline"],
+            "footnote_markers_esempi":    residui["footnote_markers"],
+            "pua_markers_esempi":         residui["pua_markers"],
+            "formula_headers":            len(residui["formula_headers"]),
+            "formula_headers_esempi":     residui["formula_headers"],
+        },
+    }
+
+    report_path = out_dir / "report.json"
+    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    return report_path
diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py
new file mode 100644
index 0000000..7eb02dc
--- /dev/null
+++ b/conversione/_pipeline/runner.py
@@ -0,0 +1,110 @@
+import json
+import tempfile
+from pathlib import Path
+
+from .checker   import check_pdf
+from .converter import convert_pdf
+from .transforms import apply_transforms
+from .structure  import analyze
+from .report     import build_report
+
+
+_LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
+
+
+def run(stem: str, project_root: Path, force: bool) -> bool:
+    pdf_path  = project_root / "sources" / f"{stem}.pdf"
+    out_dir   = project_root / "conversione" / stem
+    raw_out   = out_dir / "raw.md"
+    clean_out = out_dir / "clean.md"
+
+    print(f"\n{'─' * 52}")
+    print(f"  {stem}")
+    print(f"{'─' * 52}")
+
+    if clean_out.exists() and not force:
+        print(f"  ⚠️  conversione/{stem}/clean.md già presente — skip")
+        print(f"      (usa --force per rieseguire)")
+        return True
+
+    # [1] Validazione
+    print("  [1/4] Validazione PDF...")
+    ok, msg = check_pdf(pdf_path)
+    if not ok:
+        print(f"  ✗ {msg}")
+        return False
+    print(f"  ✅ {msg}")
+
+    # [2] Conversione
+    print("  [2/4] Conversione PDF → Markdown (opendataloader-pdf)...")
+    with tempfile.TemporaryDirectory() as tmp:
+        try:
+            md_file = convert_pdf(pdf_path, Path(tmp))
+        except MemoryError:
+            print("  ✗ Memoria esaurita durante la conversione")
+            return False
+        except Exception as e:
+            print(f"  ✗ Conversione fallita: {e}")
+            return False
+        try:
+            raw_text = md_file.read_text(encoding="utf-8")
+        except UnicodeDecodeError as e:
+            print(f"  ✗ Errore encoding nel file prodotto: {e}")
+            return False
+
+    size_kb = len(raw_text.encode()) // 1024
+    n_lines = raw_text.count("\n")
+    print(f"  ✅ Markdown grezzo: {size_kb} KB, {n_lines} righe")
+
+    # [3] Pulizia strutturale
+    print("  [3/4] Pulizia strutturale...")
+    clean_text, t = apply_transforms(raw_text)
+    reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
+    print(f"  ✅ Simboli PUA corretti:  {t['n_simboli_pua_corretti']}")
+    print(f"     Immagini rimosse:      {t['n_immagini_rimosse']}")
+    print(f"     Note rimosse:          {t['n_note_rimosse']}")
+    print(f"     Accenti corretti:      {t['n_accenti_corretti']}")
+    print(f"     Dot-leader rimossi:    {t['n_dotleader_rimossi']}")
+    print(f"     Header concat fixati:  {t['n_header_concat_fixati']}")
+    print(f"     Header num. normaliz.: {t['n_header_numerati_normalizzati']}")
+    print(f"     Articoli → ###:        {t['n_articoli_estratti']}")
+    print(f"     Ambienti matematici:   {t['n_ambienti_matematici']}")
+    print(f"     Titoli header uniti:   {t['n_titoli_uniti']}")
+    print(f"     TOC rimosso:           {'sì' if t['toc_rimosso'] else 'no'}")
+    print(f"     Versi poesia riprist.: {t['n_versi_ripristinati']}")
+    print(f"     Header verso demotati: {t['n_header_verso_demotati']}")
+    print(f"     ALL-CAPS → ##:         {t['n_header_allcaps']}")
+    print(f"     Sezioni → ###:         {t['n_sezioni_numerate']}")
+    print(f"     Paragrafi uniti:       {t['n_paragrafi_uniti']}")
+    print(f"     Formula-hdr demotati:  {t['n_formula_headers_demotati']}")
+    print(f"     Riduzione testo:       {reduction:.0f}%")
+
+    # [4] Profilo strutturale
+    print("  [4/4] Analisi struttura...")
+    try:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        raw_out.write_text(raw_text, encoding="utf-8")
+        clean_out.write_text(clean_text, encoding="utf-8")
+    except PermissionError as e:
+        print(f"  ✗ Permesso negato durante la scrittura: {e}")
+        return False
+
+    profile = analyze(clean_out)
+    (out_dir / "structure_profile.json").write_text(
+        json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    print(f"  ✅ Struttura: livello {profile['livello_struttura']} — "
+          f"{_LIVELLO_DESC[profile['livello_struttura']]}")
+    print(f"     h1={profile['n_h1']}  h2={profile['n_h2']}  h3={profile['n_h3']}  "
+          f"paragrafi={profile['n_paragrafi']}")
+    print(f"     Strategia chunking: {profile['strategia_chunking']}")
+    print(f"     Lingua rilevata:    {profile['lingua_rilevata']}")
+    for w in profile["avvertenze"]:
+        print(f"     ⚠️  {w}")
+
+    build_report(stem, out_dir, clean_text, t, profile, reduction)
+
+    print(f"\n  Output → conversione/{stem}/")
+    print(f"    raw.md   (immutabile)  clean.md   report.json")
+    return True
diff --git a/conversione/_pipeline/transforms.py b/conversione/_pipeline/transforms.py
new file mode 100644
index 0000000..1c6a7cd
--- /dev/null
+++ b/conversione/_pipeline/transforms.py
@@ -0,0 +1,974 @@
+import re
+from collections import Counter
+from functools import partial
+
+# ─── Costanti ────────────────────────────────────────────────────────────────
+
+_TOC_KEYWORDS = frozenset([
+    "indice", "index", "contents", "table of contents",
+    "sommario", "inhaltsverzeichnis", "inhalt",
+    "indice generale", "indice analitico", "indice dei contenuti",
+    "elenco dei capitoli", "argomenti", "table des matières",
+    "tabla de contenidos", "содержание",
+])
+
+_ORDINALS_IT = {
+    "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",
+    "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",
+    "NONO": "IX", "DECIMO": "X",
+}
+_ORDINALS_EN = {
+    "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",
+    "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",
+}
+
+# Maps Private Use Area (PUA) codepoints to standard Unicode symbols.
+# The Windows Symbol font encodes Greek letters and math operators in U+F020-U+F0FF;
+# the table also covers the TeX large-delimiter pieces U+F8EB-U+F8FE.
+_SYMBOL_PUA_MAP: dict[str, str] = {
+    "": " ",
+    "": "(",
+    "": ")",
+    "": "+",
+    "": "−",  # minus
+    "": ".",
+    "": "/",
+    "": "0", "": "1", "": "2", "": "3", "": "4",
+    "": "5", "": "6", "": "7", "": "8", "": "9",
+    "": ":", "": ";", "": "<", "": "=", "": ">",
+    "": "≅",  # congruent
+    "": "Α",  # Alpha
+    "": "Β",  # Beta
+    "": "Χ",  # Chi
+    "": "Δ",  # Delta
+    "": "Ε",  # Epsilon
+    "": "Φ",  # Phi
+    "": "Γ",  # Gamma
+    "": "Η",  # Eta
+    "": "Ι",  # Iota
+    "": "ϑ",  # theta variant
+    "": "Κ",  # Kappa
+    "": "Λ",  # Lambda
+    "": "Μ",  # Mu
+    "": "Ν",  # Nu
+    "": "Ο",  # Omicron
+    "": "Π",  # Pi
+    "": "Θ",  # Theta
+    "": "Ρ",  # Rho
+    "": "Σ",  # Sigma
+    "": "Τ",  # Tau
+    "": "Υ",  # Upsilon
+    "": "ς",  # sigma final
+    "": "Ω",  # Omega
+    "": "Ξ",  # Xi
+    "": "Ψ",  # Psi
+    "": "Ζ",  # Zeta
+    "": "[",
+    "": "∴",  # therefore
+    "": "]",
+    "": "⊥",  # perpendicular
+    "": "α",  # alpha
+    "": "β",  # beta
+    "": "χ",  # chi
+    "": "δ",  # delta
+    "": "ε",  # epsilon
+    "": "φ",  # phi
+    "": "γ",  # gamma
+    "": "η",  # eta
+    "": "ι",  # iota
+    "": "ϕ",  # phi variant
+    "": "κ",  # kappa
+    "": "λ",  # lambda
+    "": "μ",  # mu
+    "": "ν",  # nu
+    "": "ο",  # omicron
+    "": "π",  # pi
+    "": "θ",  # theta
+    "": "ρ",  # rho
+    "": "σ",  # sigma
+    "": "τ",  # tau
+    "": "υ",  # upsilon
+    "": "ϖ",  # pi symbol
+    "": "ω",  # omega
+    "": "ξ",  # xi
+    "": "ψ",  # psi
+    "": "ζ",  # zeta
+    "": "{",
+    "": "|",
+    "": "}",
+    "": "~",
+    "": "±",  # plus-minus
+    "": "•",  # bullet
+    "": "√",  # square root
+    "": "≤",  # less or equal
+    "": "≥",  # greater or equal
+    "": "∝",  # proportional
+    "": "×",  # multiplication
+    "": "÷",  # division
+    "": "×",  # alternate multiply
+    "": "≠",  # not equal
+    "": "≠",  # not equal alternate
+    "": "≥",  # greater or equal alternate
+    "": "′",  # prime
+    "": "*",
+    "": ",",
+    "": "≤",  # less or equal (Symbol 0xA3)
+    "": "•",  # bullet (Wingdings 0xA7)
+    "": "•",  # bullet variant
+    "": "→",  # right arrow (Symbol 0xAE)
+    "": "÷",  # division / range separator
+    "": "",        # Wingdings decorative icon (rimosso)
+    "": "→",  # right arrow variant
+    "": "",        # bracket extension piece (non ricostruibile)
+    "": "",
+    "": "",
+    "": "",
+    "": "",
+    "": "",  # TeX large paren left U+F8EB
+    "": "",  # TeX large paren extension U+F8EC
+    "": "",  # TeX large paren right U+F8ED
+    "": "",  # TeX large paren right ext U+F8EE
+    "": "",  # TeX large bracket left U+F8EF
+    "": "",  # TeX large bracket ext U+F8F0
+    "": "",  # TeX brace top-left U+F8F1
+    "": "",  # TeX brace mid U+F8F2
+    "": "",  # TeX brace mid-right U+F8F3
+    "": "",  # TeX brace extension U+F8F4
+    "": "",  # TeX brace right U+F8F5
+    "": "",  # TeX bracket right large U+F8F6
+    "": "",  # TeX bracket right ext U+F8F7
+    "": "",  # TeX bracket right close U+F8F8
+    "": "",  # TeX integral large U+F8F9
+    "": "",  # TeX integral extension U+F8FA
+    "": "",  # TeX integral top U+F8FB
+    "": "",  # TeX radical top U+F8FC
+    "": "",  # TeX radical extension U+F8FD
+    "": "",  # TeX arrowhead U+F8FE
+}
+
+_SYMBOL_PUA_RE = re.compile(
+    "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]"
+)
+
+_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+')
+_FOOTNOTE_BODY_RE = re.compile(
+    r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)'
+)
+_NUMBERED_HDR_RE = re.compile(
+    r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$",
+    re.MULTILINE,
+)
+_BIB_MARKERS_RE = re.compile(
+    r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
+    r'|\b(19|20)\d{2}\b',
+    re.IGNORECASE,
+)
+_WATERMARK_RE = re.compile(
+    r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN"
+    r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+_MATH_SYMBOLS_RE = re.compile(
+    r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]"
+)
+_EXERCISE_TRIGGER_RE = re.compile(
+    r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that"
+    r"|Compute|Calculate|Dimostrare|Verificare)\b",
+    re.IGNORECASE,
+)
+_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$")
+_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL)
+
+# These patterns used to be compiled inside the functions on every call; they are now module constants.
+_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
+_FM_RE = re.compile(
+    r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|"
+    r"\bCopyright\b|\bLicenza\b|\bEdizione\b|"
+    r"protetto da|tutti i diritti",
+    re.IGNORECASE,
+)
+_VERSE_NUM_RE = re.compile(
+    r'([.!?\xbb\'\"’]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab"“‟])'
+)
+
+
+# ─── Helper puri ─────────────────────────────────────────────────────────────
+
+def _sentence_case(s: str) -> str:
+    """Lowercase *s* and capitalize its first character; "" is returned unchanged."""
+    lowered = s.lower()
+    # The emptiness guard runs first, so lowered[0] is never evaluated for "".
+    return lowered[0].upper() + lowered[1:] if s else s
+
+
+def _is_allcaps_line(line: str) -> bool:
+    """Tell whether *line* is an all-uppercase text line that is not a header or table row."""
+    stripped = line.strip()
+    # Lines starting with "#" (headers) or "|" (table rows) are excluded up front.
+    if stripped.startswith(("#", "|")):
+        return False
+    alpha = [ch for ch in stripped if ch.isalpha()]
+    # A line with fewer than three letters never qualifies.
+    return len(alpha) >= 3 and all(ch.isupper() for ch in alpha)
+
+
+def _allcaps_to_header(raw_line: str) -> str:
+    text = re.sub(r"^[-*+]\s+", "", raw_line.strip())
+    text = text.rstrip(".").rstrip("?").strip()
+
+    _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys())
+    m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text)
+    if m:
+        roman  = _ORDINALS_IT[m.group(1)]
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Capitolo {roman} — {_sentence_case(titolo)}"
+
+    _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys())
+    m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text)
+    if m:
+        n      = _ORDINALS_EN.get(m.group(1), m.group(1))
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Chapter {n} — {_sentence_case(titolo)}"
+
+    m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text)
+    if m:
+        return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}"
+
+    return f"## {_sentence_case(text)}"
+
+
+def _extract_math_environments(text: str) -> tuple[str, int]:
+    _ENVS = (
+        r"Definizione|Definition|Teorema|Theorem|Lemma|"
+        r"Proposizione|Proposition|Corollario|Corollary|"
+        r"Osservazione|Remark|Nota|Note|Esempio|Example"
+    )
+    count  = 0
+    blocks = text.split("\n\n")
+    result = []
+
+    for block in blocks:
+        stripped = block.strip()
+        if not stripped or stripped.startswith("#"):
+            result.append(block)
+            continue
+
+        m = re.match(
+            rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)",
+            stripped,
+            re.DOTALL,
+        )
+        if not m:
+            result.append(block)
+            continue
+
+        env  = m.group(1)
+        num  = m.group(2).rstrip(".")
+        rest = m.group(3).strip()
+
+        title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)
+        if title_m:
+            header = f"### {env} {num} {title_m.group(1)}"
+            body   = title_m.group(2).strip()
+        else:
+            header = f"### {env} {num}."
+            body   = rest
+
+        result.append(f"{header}\n\n{body}" if body else header)
+        count += 1
+
+    return "\n\n".join(result), count
+
+
+def _merge_title_headers(text: str) -> tuple[str, int]:
+    count  = 0
+    blocks = re.split(r"\n{2,}", text)
+    result = []
+    i = 0
+    while i < len(blocks):
+        block    = blocks[i]
+        stripped = block.strip()
+        if (
+            re.match(r"^#{2,3} \d+\.\s*$", stripped)
+            and i + 1 < len(blocks)
+        ):
+            nxt = blocks[i + 1].strip()
+            if (
+                nxt
+                and "\n" not in nxt
+                and len(nxt) <= 80
+                and not nxt.startswith("#")
+                and not re.match(r"^\d+[\.\)]\s", nxt)
+            ):
+                result.append(stripped.rstrip() + " " + nxt)
+                count += 1
+                i += 2
+                continue
+        result.append(block)
+        i += 1
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count
+
+
+def _extract_article_headers(text: str) -> tuple[str, int]:
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        num  = m.group(1)
+        rest = m.group(2).strip()
+
+        title_m = re.match(
+            r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+"
+            r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})",
+            rest,
+        )
+        if title_m:
+            count += 1
+            return (
+                f"### Art. {num}. {title_m.group(1)}.\n\n"
+                f"{title_m.group(2).strip()}"
+            )
+        if rest:
+            count += 1
+            return f"### Art. {num}.\n\n{rest}"
+        count += 1
+        return f"### Art. {num}."
+
+    text = re.sub(
+        r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)",
+        _repl,
+        text,
+        flags=re.MULTILINE,
+    )
+    return text, count
+
+
+# ─── Trasformazioni atomiche ──────────────────────────────────────────────────
+
+def _t_fix_symbol_font(text: str) -> tuple[str, int]:
+    """Replace every mapped Private Use Area character with its table entry.
+
+    Returns the converted text together with the number of characters replaced.
+    """
+    # subn reports the substitution count directly, so no mutable counter is needed.
+    return _SYMBOL_PUA_RE.subn(
+        lambda match: _SYMBOL_PUA_MAP[match.group(0)], text
+    )
+
+
+def _t_remove_images(text: str) -> tuple[str, int]:
+    """Strip Markdown image references (with trailing whitespace); return text and count."""
+    # subn yields the substitution count, so a separate findall pass is unnecessary.
+    return re.subn(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
+
+
+def _t_remove_footnotes(text: str) -> tuple[str, int]:
+    lines  = text.split("\n")
+    result, count = [], 0
+    for line in lines:
+        stripped = line.strip()
+        if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300:
+            count += 1
+            continue
+        cleaned = _SUPERSCRIPT_RE.sub("", line)
+        if cleaned != line:
+            count += 1
+        result.append(cleaned)
+    return "\n".join(result), count
+
+
+def _t_fix_br(text: str) -> tuple[str, int]:
+    """Replace each <br>, <br/> or <br /> (any case) plus trailing whitespace with a space."""
+    # The self-closing forms are matched too; subn also returns the replacement count.
+    return re.subn(r"<br\s*/?>\s*", " ", text, flags=re.IGNORECASE)
+
+
+def _t_fix_tabsep(text: str) -> tuple[str, int]:
+    """Blank out every match of the table-separator pattern; return text and match count."""
+    # A single subn pass replaces the former findall + sub pair.
+    return _TABSEP_RE.subn("", text)
+
+
+def _t_fix_accents(text: str) -> tuple[str, int]:
+    """Turn backtick-encoded grave accents into accented vowels and drop leftover backticks.
+
+    Returns the cleaned text and the number of backticks consumed (paired or orphaned).
+    """
+    grave = {
+        "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0",
+        "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc",
+        "o": "\xf2", "O": "\xd2",
+    }
+    total_backticks = text.count("`")
+    # Handle a backtick placed before the vowel, then one placed after it.
+    text = re.sub(r"`([eEaAuUiIoO])", lambda m: grave[m.group(1)], text)
+    text = re.sub(r"([eEaAuUiIoO])`", lambda m: grave[m.group(1)], text)
+    return text.replace("`", ""), total_backticks
+
+
+def _t_fix_multiplication(text: str) -> tuple[str, int]:
+    """Rewrite a '"' sitting between a digit and a digit or "(" as the "×" sign."""
+    # Lookarounds keep the neighbours out of the match; subn supplies the count.
+    return re.subn(r'(?<=[0-9])"(?=[0-9(])', '×', text)
+
+
+def _t_fix_micro(text: str) -> tuple[str, int]:
+    """Rewrite "<digit>!<unit letter>" as "<digit> µ<unit letter>"; return text and count."""
+    unit = r'[mAsgVWFHTKNJClΩ]'
+    # One subn pass both rewrites and counts; optional spaces before "!" are collapsed.
+    return re.subn(rf'(\d)\s*!({unit})', r'\1 µ\2', text)
+
+
+def _t_remove_formula_labels(text: str) -> tuple[str, int]:
+    """Replace "[n.m]" formula labels and adjacent spaces/tabs with one space; count them."""
+    # Only horizontal whitespace is absorbed, so a label ending a line keeps its line break.
+    return re.subn(r"[ \t]*\[\d+\.\d+\][ \t]*", " ", text)
+
+
+def _t_remove_dotleaders(text: str) -> tuple[str, int]:
+    """Blank out dot-leader lines and lines holding only a roman numeral from i to xii.
+
+    Only the dot-leader lines contribute to the returned count.
+    """
+    leader_re = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$"
+    text, n = re.subn(leader_re, "", text, flags=re.MULTILINE)
+    # Second pass: bare roman numerals (any case) are emptied but not counted.
+    roman_re = r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$"
+    text = re.sub(roman_re, "", text, flags=re.IGNORECASE)
+    return text, n
+
+
+def _t_fix_header_concat(text: str) -> tuple[str, int]:
+    count = 0
+
+    def _fix(m: re.Match) -> str:
+        nonlocal count
+        hashes = m.group(1)
+        full   = m.group(2).strip()
+        if len(full) < 60:
+            return m.group(0)
+        skip  = min(10, len(full) // 3)
+        split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", full[skip:])
+        if split:
+            pos   = skip + split.start()
+            title = full[:pos].strip()
+            body  = full[pos:].strip()
+            if len(title) >= 5 and len(body) >= 15:
+                count += 1
+                return f"{hashes} {title}\n\n{body}"
+        return m.group(0)
+
+    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
+    return text, count
+
+
+def _t_extract_capitolo(text: str) -> tuple[str, int]:
+    def _repl(m: re.Match) -> str:
+        num    = m.group(1)
+        titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
+        return f"\n\n## Capitolo {num}: {titolo}\n\n"
+
+    text = re.sub(
+        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
+        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
+        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
+        _repl,
+        text,
+    )
+    return text, 0
+
+
+def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
+    all_matches = list(_NUMBERED_HDR_RE.finditer(text))
+    if not all_matches:
+        return text, 0
+
+    pairs     = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches]
+    depths    = [d for d, _ in pairs]
+    min_depth = min(depths)
+    max_depth = max(depths)
+    if max_depth == min_depth:
+        return text, 0
+
+    base_level = min(lv for d, lv in pairs if d == min_depth)
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        hashes, num, title = m.group(1), m.group(2), m.group(3)
+        depth     = num.count(".") + 1
+        new_level = min(base_level + (depth - min_depth), 6)
+        if new_level == len(hashes):
+            return m.group(0)
+        count += 1
+        return f"{'#' * new_level} {num}. {title}"
+
+    return _NUMBERED_HDR_RE.sub(_repl, text), count
+
+
+def _t_normalize_header_levels(text: str) -> tuple[str, int]:
+    text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
+    text = re.sub(
+        r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
+        lambda m: f"### {m.group(2)}. {m.group(3)}",
+        text,
+        flags=re.MULTILINE,
+    )
+    text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE)
+    return text, 0
+
+
+def _t_extract_articles(text: str) -> tuple[str, int]:
+    return _extract_article_headers(text)
+
+
+def _t_remove_header_bold(text: str) -> tuple[str, int]:
+    text = re.sub(
+        r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
+        r"\1 \2",
+        text, flags=re.MULTILINE,
+    )
+    return text, 0
+
+
+def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
+    def _norm(m: re.Match) -> str:
+        hashes, content = m.group(1), m.group(2).strip()
+        letters = [c for c in content if c.isalpha()]
+        if letters and all(c.isupper() for c in letters):
+            return f"{hashes} {_sentence_case(content)}"
+        return m.group(0)
+
+    text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE)
+    return text, 0
+
+
+def _t_remove_toc(text: str) -> tuple[str, int]:
+    """Drop table-of-contents headings and the entry lines that follow them.
+
+    A line whose text up to the first "." (after stripping leading "#") is a
+    known TOC keyword starts a TOC run; blank lines and bulleted entries after
+    it are discarded until the first other line. Returns (text, 1 or 0).
+    """
+    kept: list[str] = []
+    inside_toc = False
+    found = False
+    for line in text.split("\n"):
+        heading = re.sub(r"^#+\s*", "", line.strip())
+        if heading.split(".")[0].strip().lower() in _TOC_KEYWORDS:
+            found = inside_toc = True
+            continue
+        if inside_toc and (
+            re.match(r"^\s*$", line)
+            or re.match(r"^\s*[-*+]\s+\d", line)
+            or re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line)
+        ):
+            continue
+        inside_toc = False  # any other line ends the TOC run and is kept
+        kept.append(line)
+    return "\n".join(kept), 1 if found else 0
+
+
+def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
+    """Promote all-caps lines to headers; returns the text and the number promoted.
+
+    A blank-line-delimited block that is one all-caps line is replaced by
+    ``_allcaps_to_header`` of its stripped text.  In any other block, each
+    all-caps line longer than 3 characters (stripped) is converted in place.
+    """
+    promoted = 0
+    out: list[str] = []
+    for chunk in text.split("\n\n"):
+        core = chunk.strip()
+        if "\n" not in core and _is_allcaps_line(core):
+            out.append(_allcaps_to_header(core))
+            promoted += 1
+            continue
+        rows = chunk.split("\n")
+        hits = [bool(_is_allcaps_line(r) and len(r.strip()) > 3) for r in rows]
+        out.append("\n".join(_allcaps_to_header(r) if h else r for r, h in zip(rows, hits)))
+        promoted += sum(hits)
+    return "\n\n".join(out), promoted
+
+
+def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:  # four regex passes, applied in order
+    count = 0  # total replacements made by all passes
+
+    def _num_repl(m: re.Match) -> str:
+        nonlocal count
+        content = m.group(2).strip()
+        if content.endswith(".") and len(content) > 40:  # long text ending in "." is left as is
+            return m.group(0)
+        if _BIB_MARKERS_RE.search(content):  # lines matching _BIB_MARKERS_RE are left as is
+            return m.group(0)
+        count += 1
+        return f"### {m.group(1)}.\n\n{content}"
+
+    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)  # pass 1: "12. text"
+
+    def _num_letter_repl(m: re.Match) -> str:
+        nonlocal count
+        count += 1
+        return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"
+
+    text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)  # pass 2: "12 a. text"
+
+    if not has_exercises:  # pass 3 runs only when the caller reports no exercises
+        def _aphorism_repl(m: re.Match) -> str:
+            nonlocal count
+            content = m.group(2).strip()
+            if _BIB_MARKERS_RE.search(content):
+                return m.group(0)
+            count += 1
+            return f"\n\n### {m.group(1)}.\n\n{content}"
+
+        text = re.sub(
+            r"^-\s+(\d{1,3})\.\s+(.{10,})$",  # pass 3: "- 12. text" with 10+ characters of text
+            _aphorism_repl,
+            text,
+            flags=re.MULTILINE,
+        )
+
+    def _list_section_repl(m: re.Match) -> str:
+        nonlocal count
+        num     = m.group(1)
+        content = m.group(2).strip()
+        if _BIB_MARKERS_RE.search(content):
+            return m.group(0)
+        count += 1
+        split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content)  # first lowercase -> uppercase word boundary
+        if split and split.start() >= 3:
+            title = content[: split.start()].strip()
+            body  = content[split.end():].strip()
+            if len(body) >= 20:  # split into title + body only when the body is 20+ characters
+                return f"\n\n### {num}. {title}\n\n{body}"
+        return f"\n\n### {num}. {content}"  # otherwise the whole text becomes the title
+
+    text = re.sub(
+        r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",  # pass 4: "- 12 Text" (no dot after the number)
+        _list_section_repl,
+        text,
+        flags=re.MULTILINE,
+    )
+    return text, count
+
+
+def _t_extract_math(text: str) -> tuple[str, int]:
+    return _extract_math_environments(text)  # pipeline adapter: delegates to the helper unchanged
+
+
+def _t_merge_paragraphs(text: str) -> tuple[str, int]:
+    """Re-join paragraphs that a blank line split in mid-sentence.
+
+    A block is glued to the next one (with a single space) while it is
+    non-empty, is not a header or table row, and does not end in
+    sentence-closing punctuation -- unless the next block is empty or opens a
+    header, table row, numbered item or bullet.  ``|---|`` line starts are
+    removed afterwards.  Returns the text and the number of joins performed.
+    """
+    closers = frozenset(".?!\xbb)\"'")
+    structural = re.compile(r"^\d+\.|^[-*+]\s")
+
+    def _open_ended(block: str) -> bool:
+        return bool(block) and block[0] not in "#|" and block[-1] not in closers
+
+    def _joinable(block: str) -> bool:
+        return bool(block) and block[0] not in "#|" and not structural.match(block)
+
+    chunks = text.split("\n\n")
+    out: list[str] = []
+    joins = pos = 0
+    while pos < len(chunks):
+        current = chunks[pos]
+        trimmed = current.strip()
+        while pos + 1 < len(chunks) and _open_ended(trimmed):
+            follower = chunks[pos + 1].strip()
+            if not _joinable(follower):
+                break
+            current = trimmed = f"{trimmed} {follower}"
+            joins += 1
+            pos += 1
+        out.append(current)
+        pos += 1
+    return re.sub(r"(?m)^\|---\|\s*", "", "\n\n".join(out)), joins
+
+
+def _t_normalize_whitespace(text: str) -> tuple[str, int]:
+    """Squeeze runs of spaces to a single space, line by line; count is always 0."""
+    def _squeeze(row: str) -> str:
+        # whitespace-only rows are passed through untouched
+        return re.sub(r"  +", " ", row) if row.strip() else row
+
+    return "\n".join(map(_squeeze, text.split("\n"))), 0
+
+
+def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
+    return re.sub(r"\n{3,}", "\n\n", text), 0  # runs of 3+ newlines -> one blank line; count not tracked
+
+
+def _t_demote_verse_headers(text: str) -> tuple[str, int]:
+    """Demote headers that are numbered verse lines; returns (text, n_demoted)."""
+    # A header of 20+ characters qualifies when it ends in a 1-4 digit number and,
+    # without that number, contains punctuation followed by spaces and a letter/quote.
+    trailing_num = re.compile(r"\s\d{1,4}\s*$")
+    clause_break = re.compile(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab"“]')
+    demoted = 0
+
+    def _plain(match: re.Match) -> str:
+        nonlocal demoted
+        verse, n_cut = trailing_num.subn("", match.group(2).strip())
+        if not n_cut or not clause_break.search(verse):
+            return match.group(0)
+        demoted += 1
+        return verse  # header marks and the trailing number are both dropped
+
+    return re.sub(r"^(#{1,6})\s+(.{20,})$", _plain, text, flags=re.MULTILINE), demoted
+
+
+def _t_restore_poetry_lines(text: str) -> tuple[str, int]:  # re-breaks blocks at inline verse numbers
+    count  = 0  # matches replaced across all rewritten blocks
+    blocks = text.split("\n\n")
+    result = []
+
+    for block in blocks:
+        stripped = block.strip()
+        if not stripped or stripped.startswith("#"):  # empty blocks and headers pass through
+            result.append(block)
+            continue
+
+        matches = list(_VERSE_NUM_RE.finditer(stripped))
+        if len(matches) < 2:  # fewer than two numbers: not treated as verse
+            result.append(block)
+            continue
+
+        nums  = [int(m.group(2)) for m in matches]  # group 2 is parsed as the number
+        diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)]
+        if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5):  # needs at most 2 distinct gaps, first gap 1..5
+            result.append(block)
+            continue
+
+        step = diffs[0]  # the first gap is taken as the numbering step
+
+        def _replace_verse_num(m: re.Match) -> str:
+            n   = int(m.group(2))
+            sep = "\n\n" if n % (step * 3) == 0 else "\n"  # blank line after every number divisible by 3*step
+            return m.group(1).rstrip() + sep  # keep group 1, replace the rest of the match by the break
+
+        new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped)
+        if new_block != stripped:
+            count += len(matches)
+        result.append(new_block)  # NOTE: the stripped form is stored, not the original block
+
+    return "\n\n".join(result), count
+
+
+def _t_remove_urls(text: str) -> tuple[str, int]:
+    return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0  # blanks lines that are only a URL; count not tracked
+
+
+def _t_remove_empty_headers(text: str) -> tuple[str, int]:
+    """Drop one-line headers with no body under them; count is always 0.
+
+    A header counts as empty when it is the last block or the next block starts
+    with a header of at most 80 characters (longer ones are treated as body).
+    """
+    is_header = re.compile(r"^#{1,6} ").match
+    parts = re.split(r"\n{2,}", text)
+    followers = [p.strip() for p in parts[1:]] + [""]
+
+    def _is_empty(part: str, after: str) -> bool:
+        lone_header = is_header(part.strip()) and "\n" not in part.strip()
+        return bool(lone_header) and (not after or (bool(is_header(after)) and len(after) <= 80))
+
+    kept = [p for p, nxt in zip(parts, followers) if not _is_empty(p, nxt)]
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), 0
+
+
+def _t_merge_title_headers(text: str) -> tuple[str, int]:
+    return _merge_title_headers(text)  # pipeline adapter: delegates to the helper unchanged
+
+
+def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
+    """Delete header lines whose text does not look like a real title.
+
+    A header is dropped when any ``_is_garbage`` rule fires; non-header lines
+    and the text under a dropped header are left in place.  Returns the text
+    (runs of 3+ newlines collapsed to 2) and the number of headers removed.
+    """
+    def _is_garbage(content: str) -> bool:
+        core = content.strip()
+        lead = next((ch for ch in content if ch.isalpha()), None)
+        # The rules are independent of each other, so they are OR-ed in one expression.
+        return bool(
+            content.lstrip().startswith("...")                                              # starts with "..."
+            or not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content)                          # no run of 2+ letters
+            or re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", core)                             # 1-4 letters, optional parens
+            or (len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content))   # long with !/%/# or a hyphen break
+            or (lead and lead.islower() and len(content) > 40)                              # long, first letter lowercase
+            or re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", core)                              # short name + comparison sign
+            or re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", core, re.IGNORECASE)
+        )
+
+    header = re.compile(r"^#{1,6} (.+)$")
+    kept: list[str] = []
+    dropped = 0
+    for row in text.split("\n"):
+        hit = header.match(row)
+        if hit and _is_garbage(hit.group(1)):
+            dropped += 1
+        else:
+            kept.append(row)
+    return re.sub(r"\n{3,}", "\n\n", "\n".join(kept)), dropped
+
+
+def _t_remove_frontmatter(text: str) -> tuple[str, int]:
+    """Drop front-matter ``###`` blocks from the opening part of the document.
+
+    Only the first ``max(5, min(15, 20% of blocks))`` blocks are examined.  A
+    block starting with ``### `` not followed by a digit is removed when
+    ``_FM_RE`` matches the block itself or a following block shorter than 250
+    characters.  Returns the text and the number of blocks removed.
+    """
+    parts = re.split(r"\n{2,}", text)
+    limit = max(5, min(15, int(len(parts) * 0.20)))
+    kept: list[str] = []
+    dropped = 0
+    for idx, part in enumerate(parts):
+        head = part.strip()
+        if idx < limit and re.match(r"^### ", head) and not re.match(r"^### \d", head):
+            after = parts[idx + 1].strip() if idx + 1 < len(parts) else ""
+            # Only the header block is dropped; `after` itself stays in the output.
+            if (len(after) < 250 and _FM_RE.search(after)) or _FM_RE.search(head):
+                dropped += 1
+                continue
+        kept.append(part)
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), dropped
+
+
+def _t_remove_watermarks(text: str) -> tuple[str, int]:
+    """Remove watermark lines from the text.
+
+    A line is dropped when ``_WATERMARK_RE`` matches at its start.
+    Returns the remaining text and the number of lines dropped.
+    """
+    rows = text.split("\n")
+    survivors = [row for row in rows if not _WATERMARK_RE.match(row)]
+    return "\n".join(survivors), len(rows) - len(survivors)
+
+
+def _t_fix_math_symbols(text: str) -> tuple[str, int]:
+    """Remove lines made up only of box/bullet glyphs and whitespace.
+
+    Whitespace-only lines are kept.  Returns the text and the number of
+    lines removed.
+    """
+    glyphs_only = re.compile(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$")
+    kept = [row for row in text.split("\n") if not (row.strip() and glyphs_only.match(row))]
+    return "\n".join(kept), text.count("\n") + 1 - len(kept)
+
+
+def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
+    """Remove short lines that repeat throughout the document.
+
+    A stripped line of 4-79 characters that does not start with ``#`` or ``|``
+    and occurs at least 5 times is removed at every occurrence.  Returns the
+    text and the number of lines removed.
+    """
+    rows = text.split("\n")
+    tally = Counter(
+        bare
+        for bare in map(str.strip, rows)
+        if 3 < len(bare) < 80 and not bare.startswith(("#", "|"))
+    )
+    repeated = {bare for bare, hits in tally.items() if hits >= 5}
+    if not repeated:
+        return text, 0
+    # membership is tested on the stripped line, so indentation is ignored
+    kept = [row for row in rows if row.strip() not in repeated]
+    return "\n".join(kept), len(rows) - len(kept)
+
+
+def _t_math_header_demotion(text: str) -> tuple[str, int]:
+    """Demote long headers that are really formulas or exercise statements.
+
+    Applies to ``_MATH_HDR_RE`` lines whose group 2 exceeds 100 characters and has
+    3+ ``_MATH_SYMBOLS_RE`` hits or an ``_EXERCISE_TRIGGER_RE`` hit.
+    """
+    def _demotable_body(row: str) -> str:
+        hit = _MATH_HDR_RE.match(row)
+        body = hit.group(2) if hit else ""
+        if len(body) <= 100:
+            return ""  # not a header, or short enough to stay one
+        mathy = len(_MATH_SYMBOLS_RE.findall(body)) >= 3
+        return body if mathy or _EXERCISE_TRIGGER_RE.search(body) else ""
+
+    out, changed = [], 0
+    for row in text.split("\n"):
+        body = _demotable_body(row)
+        if not body:
+            out.append(row)
+            continue
+        numbered = _NUMBERED_PREFIX_RE.match(body)  # a matched prefix (group 1) is kept in bold
+        out.append(f"**{numbered.group(1)}** {numbered.group(2)}" if numbered else body)
+        changed += 1
+    return "\n".join(out), changed
+
+
+# ─── Orchestratore ───────────────────────────────────────────────────────────
+
+def apply_transforms(text: str) -> tuple[str, dict]:
+    """
+    Apply the structural transforms to the raw Markdown.
+    Return (modified_text, statistics).
+    The order is semantic: encoding → header structure → structure building → text → finishing.
+    """
+    _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE))  # passed on as has_exercises below
+
+    _transforms: list[tuple[str | None, object]] = [  # (stats key or None, transform); None = count is discarded
+        ("n_simboli_pua_corretti",         _t_fix_symbol_font),
+        ("n_immagini_rimosse",             _t_remove_images),
+        ("n_br_rimossi",                   _t_fix_br),
+        ("n_tabsep_rimossi",               _t_fix_tabsep),
+        ("n_note_rimosse",                 _t_remove_footnotes),
+        ("n_accenti_corretti",             _t_fix_accents),
+        ("n_moltiplicazioni_corrette",     _t_fix_multiplication),
+        ("n_micro_corretti",               _t_fix_micro),
+        ("n_simboli_math_rimossi",         _t_fix_math_symbols),
+        ("n_formule_rimossi",              _t_remove_formula_labels),
+        ("n_dotleader_rimossi",            _t_remove_dotleaders),
+        ("n_righe_ricorrenti_rimosse",     _t_remove_recurring_lines),
+        ("n_header_concat_fixati",         _t_fix_header_concat),
+        (None,                             _t_extract_capitolo),
+        ("n_header_numerati_normalizzati", _t_normalize_numbered_headings),
+        (None,                             _t_normalize_header_levels),
+        ("n_articoli_estratti",            _t_extract_articles),
+        (None,                             _t_remove_header_bold),
+        (None,                             _t_normalize_allcaps_headers),
+        ("toc_rimosso",                    _t_remove_toc),
+        ("n_header_allcaps",               _t_allcaps_to_headers),
+        ("n_sezioni_numerate",             partial(_t_numbered_sections, has_exercises=_has_ex)),
+        ("n_ambienti_matematici",          _t_extract_math),
+        ("n_paragrafi_uniti",              _t_merge_paragraphs),
+        (None,                             _t_normalize_whitespace),
+        (None,                             _t_collapse_blank_lines),
+        ("n_versi_ripristinati",           _t_restore_poetry_lines),
+        ("n_header_verso_demotati",        _t_demote_verse_headers),
+        (None,                             _t_remove_urls),
+        (None,                             _t_remove_empty_headers),
+        ("n_titoli_uniti",                 _t_merge_title_headers),
+        (None,                             lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)),  # strips a trailing "| <1-3 digits>" from headers
+        ("n_garbage_headers_rimossi",      _t_remove_garbage_headers),
+        ("n_formula_headers_demotati",     _t_math_header_demotion),
+        ("n_frontmatter_rimossi",          _t_remove_frontmatter),
+        ("n_watermark_rimossi",            _t_remove_watermarks),
+    ]
+
+    stats: dict = {}
+    for stat_key, fn in _transforms:  # each transform receives the previous one's output
+        text, n = fn(text)
+        if stat_key:
+            stats[stat_key] = stats.get(stat_key, 0) + n  # summed, so a repeated key would accumulate
+
+    stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0))  # reported as a flag, not a count
+    return text, stats
diff --git a/conversione/_pipeline/validator.py b/conversione/_pipeline/validator.py
new file mode 100644
index 0000000..8e79e16
--- /dev/null
+++ b/conversione/_pipeline/validator.py
@@ -0,0 +1,152 @@
+import json
+import sys
+from pathlib import Path
+
+_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]  # (minimum score, grade), highest band first
+
+
+def _score(r: dict) -> tuple[int, list[str]]:
+    """
+    Score 0-100 for the quality of clean.md as input to vectorization.
+
+    Structure penalties (``structure.livello_struttura``):
+      level 0 (absent)      → −40
+      level 1 (flat)        → −15
+
+    Residue penalties (``residui``; they degrade retrieval):
+      backtick              → −2 each  (max −20)
+      dot-leader            → −5 each  (max −10)
+      URL/watermark         → −5 each  (max −15)
+      images                → −5 each  (max −10)
+      inline <br>           → −2 each  (max −15)
+      encoding symbols      → −1 each  (max −10)
+      inline formulas [N.M] → −1 each  (max −8)
+      leftover footnotes    → −1 each  (max −8)
+      PUA characters        → −2 each  (max −20)
+      formula headers       → −3 each  (max −15)
+
+    Anomaly penalties (``anomalie``):
+      bare headers          → −3 each  (max −15)
+
+    Returns the score floored at 0 and one detail string per penalty applied.
+    """
+    residui  = r.get("residui",  {})
+    anomalie = r.get("anomalie", {})
+    livello  = r.get("structure", {}).get("livello_struttura", 0)
+
+    # (source dict, key, points per item, cap, label) -- applied in this order.
+    rules = (
+        (residui,  "backtick",         2, 20, "backtick"),
+        (residui,  "dotleader",        5, 10, "dot-leader"),
+        (residui,  "url",              5, 15, "url"),
+        (residui,  "immagini",         5, 10, "immagini"),
+        (residui,  "br_inline",        2, 15, "<br> inline"),
+        (residui,  "simboli_encoding", 1, 10, "simboli encoding"),
+        (residui,  "formule_inline",   1,  8, "formule inline"),
+        (residui,  "footnote_markers", 1,  8, "footnote residui"),
+        (residui,  "pua_markers",      2, 20, "caratteri PUA font Symbol"),
+        (residui,  "formula_headers",  3, 15, "formula/esercizio come header"),
+        (anomalie, "bare_headers",     3, 15, "bare headers"),
+    )
+
+    score = 100
+    detail: list[str] = []
+    if livello == 0:
+        score -= 40
+        detail.append("struttura assente −40")
+    elif livello == 1:
+        score -= 15
+        detail.append("struttura piatta −15")
+
+    for source, key, per_item, cap, label in rules:
+        n = source.get(key, 0)
+        if not n:
+            continue
+        p = min(cap, n * per_item)
+        score -= p
+        detail.append(f"{label} ×{n} −{p}")
+
+    return max(0, score), detail
+
+
+def _grade(score: int) -> str:
+    return next(g for threshold, g in _GRADES if score >= threshold)  # first band whose minimum is met; StopIteration if none is
+
+
+def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
+    """Print a score table for ``conversione/<stem>/report.json`` (every report if ``stems`` is empty).
+
+    ``detail`` lists each penalty under its row; a report lacking "stem" is shown under
+    its directory name.  Exits with status 0 when no report path is found.
+    """
+    conv_dir = project_root / "conversione"
+    paths = (
+        [conv_dir / s / "report.json" for s in stems]
+        if stems
+        else sorted(conv_dir.glob("*/report.json"))
+    )
+    if not paths:
+        print("Nessun report.json trovato in conversione/*/")
+        sys.exit(0)
+
+    rows = []
+    for p in paths:
+        row = json.loads(p.read_text(encoding="utf-8")) if p.exists() else {"_missing": True}
+        row.setdefault("stem", p.parent.name)  # r['stem'] below used to raise KeyError without this
+        rows.append(row)
+
+    col    = max(len(r.get("stem", "stem")) for r in rows) + 2
+    header = (
+        f"{'stem':<{col}}"
+        f"{'h2':>4}{'h3':>5}  "
+        f"{'strategia':<18}"
+        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
+        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}"
+        f"{'med':>6}"
+        f"  {'voto':>4}  grade"
+    )
+    sep = "─" * len(header)
+    print(f"\n{header}\n{sep}")
+
+    scores = []
+    for r in rows:
+        if r.get("_missing"):
+            print(f"{r['stem']:<{col}}  (report.json non trovato)")
+            continue
+        st   = r.get("structure",    {})
+        an   = r.get("anomalie",     {})
+        res  = r.get("residui",      {})
+        dist = r.get("distribution", {})
+        s, pen = _score(r)
+        scores.append(s)
+        print(
+            f"{r['stem']:<{col}}"
+            f"{st.get('n_h2',              0):>4}"
+            f"{st.get('n_h3',              0):>5}  "
+            f"{st.get('strategia_chunking','?'):<18}"
+            f"{an.get('bare_headers',      0):>5}"
+            f"{an.get('short_sections',    0):>6}"
+            f"{an.get('long_sections',     0):>7}"
+            f"{res.get('backtick',         0):>5}"
+            f"{res.get('br_inline',        0):>4}"
+            f"{res.get('simboli_encoding', 0):>4}"
+            f"{res.get('url',              0):>4}"
+            f"{res.get('formula_headers',  0):>5}"
+            f"{dist.get('mediana',         0):>6}"
+            f"  {s:>4}  {_grade(s)}"
+        )
+        for item in pen if detail else ():  # penalty breakdown only on request
+            print(f"  {'':>{col}}  ↳ {item}")
+
+    print(sep)
+    if scores:
+        media = sum(scores) / len(scores)
+        print(
+            f"Documenti: {len(scores)}   "
+            f"Media: {media:.0f}/100 {_grade(int(media))}   "
+            f"(A≥90  B≥75  C≥60  D≥40  F<40)"
+        )
+    print(
+        "\nColonne: bare=header vuoti  corte=sez<150ch  lunghe=sez>1500ch  "
+        "btk=backtick  br=<br>inline  enc=simboli encoding  fhdr=formula-header  med=mediana chars\n"
+    )