diff --git a/conversione/_pipeline/converter.py b/conversione/_pipeline/converter.py
new file mode 100644
index 0000000..38f028d
--- /dev/null
+++ b/conversione/_pipeline/converter.py
@@ -0,0 +1,62 @@
+from pathlib import Path
+
+
+def _is_tagged_pdf(pdf_path: Path) -> bool:
+    """Return True when the PDF catalog has a /StructTreeRoot key; False on any failure."""
+    try:
+        import fitz
+        with fitz.open(str(pdf_path)) as doc:  # closes the document even on error
+            # pdf_catalog() returns the catalog's xref (an int), so look the key up on it.
+            return doc.xref_get_key(doc.pdf_catalog(), "StructTreeRoot")[0] != "null"
+    except Exception:  # best-effort probe: unreadable file or missing PyMuPDF -> untagged
+        return False
+
+
+def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
+    """
+    Convert the PDF to Markdown with opendataloader-pdf.
+    Write the file into out_dir and return its path.
+
+    Parameters chosen for RAG-friendly output:
+    - keep_line_breaks=False → flowing text, no PDF hard-wraps
+    - reading_order="xycut" → fixes multi-column order (XY-Cut++)
+    - sanitize=False → preserves the original text
+    - image_output="off" → no image extracted or referenced
+    - table_method="cluster" → detects tables without visible borders
+    - content_safety_off → avoids filtering of footnotes (tiny) and OCG layers
+    - use_struct_tree → enabled only if the PDF is tagged (Word/InDesign)
+    """
+    import opendataloader_pdf
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+    tagged = _is_tagged_pdf(pdf_path)
+
+    opendataloader_pdf.convert(
+        input_path=str(pdf_path),
+        output_dir=str(out_dir),
+        format="markdown",
+        keep_line_breaks=False,
+        reading_order="xycut",
+        sanitize=False,
+        image_output="off",
+        table_method="cluster",
+        content_safety_off=["tiny", "hidden-ocg"],
+        use_struct_tree=tagged,
+        quiet=True,
+    )
+
+    md_file = out_dir / f"{pdf_path.stem}.md"
+    if not md_file.exists():  # expected name missing: fall back to any .md in out_dir
+        candidates = list(out_dir.glob("*.md"))
+        if not candidates:
+            raise RuntimeError(f"Nessun file .md prodotto in {out_dir}")
+        md_file = candidates[0]
+
+    content = md_file.read_text(encoding="utf-8", errors="replace").strip()
+    if len(content) < 100:  # fewer than 100 characters is reported as a failed conversion
+        raise RuntimeError(
+            f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) "
+            f"— il PDF potrebbe essere corrotto o non supportato"
+        )
+
+    return md_file
diff --git a/conversione/_pipeline/report.py b/conversione/_pipeline/report.py
new file mode 100644
index 0000000..093bc86
--- /dev/null
+++ b/conversione/_pipeline/report.py
@@ -0,0 +1,144 @@
+import json
+import re
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+
+from .structure import _parse_sections_with_body
+
+
+def build_report(
+    stem: str,
+    out_dir: Path,
+    clean_text: str,
+    t_stats: dict,
+    profile: dict,
+    reduction: float,
+) -> Path:
+    """Write ``report.json`` into *out_dir* and return its path.
+
+    The report holds *t_stats* plus the rounded *reduction*, the structure
+    *profile*, section-length percentiles, anomalous sections and leftover
+    artefacts found by scanning *clean_text* line by line.
+    """
+    text_lines = clean_text.split("\n")
+
+    sections = _parse_sections_with_body(clean_text, 3)
+    lengths = [len(body) for _, body in sections]
+
+    def _pct(data: list[int], p: float) -> int:
+        """Return the element at fraction *p* of the sorted data (0 if empty)."""
+        if not data:
+            return 0
+        s = sorted(data)
+        return s[max(0, min(len(s) - 1, int(len(s) * p)))]
+
+    # Section body lengths in characters.
+    distribution = {
+        "min": min(lengths) if lengths else 0,
+        "p25": _pct(lengths, 0.25),
+        "mediana": _pct(lengths, 0.50),
+        "p75": _pct(lengths, 0.75),
+        "max": max(lengths) if lengths else 0,
+    }
+
+    # Numbered "### N." headers whose body is shorter than 30 characters.
+    bare_hdrs = [
+        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
+        for hdr, body in sections
+        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
+    ]
+    short_secs = [
+        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
+        for (hdr, body), length in zip(sections, lengths)
+        if 0 < length < 150
+    ]
+    long_secs = [
+        {"header": hdr, "chars": length}
+        for (hdr, _), length in zip(sections, lengths)
+        if length > 1500
+    ]
+
+    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
+        """Collect up to *max_n* non-header lines matching *pattern*."""
+        hits = []
+        for i, line in enumerate(text_lines):
+            if re.search(pattern, line) and not re.match(r"^#+ ", line):
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    # Three or more of these symbols in a long header flag it as a formula.
+    _math_sym_scan = re.compile(
+        r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]"
+    )
+    _ex_trigger_scan = re.compile(
+        r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that"
+        r"|Compute|Calculate|Dimostrare|Verificare)\b",
+        re.IGNORECASE,
+    )
+
+    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
+        """Collect h2/h3 headers over 100 chars that contain math or exercise cues."""
+        hits = []
+        for i, line in enumerate(text_lines):
+            m = re.match(r"^(#{2,3})\s+(.+)$", line)
+            if not m:
+                continue
+            body = m.group(2)
+            if len(body) <= 100:
+                continue
+            has_math = len(_math_sym_scan.findall(body)) >= 3
+            has_ex = bool(_ex_trigger_scan.search(body))
+            if has_math or has_ex:
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    # "br_inline" looks for literal <br> tags. The private-use range is spelled
+    # with \u escapes because literal PUA characters have no visible glyph.
+    residui = {
+        "backtick": _scan(r"`"),
+        "dotleader": _scan(r"(?:\. ){3,}"),
+        "url": _scan(r"^(https?://|www\.)\S+"),
+        "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"),
+        "br_inline": _scan(r"<br>"),
+        "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
+        "formule_inline": _scan(r"\[\d+\.\d+\]"),
+        "footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'),
+        "pua_markers": _scan(r'[\ue000-\uf8ff]'),
+        "formula_headers": _scan_formula_headers(),
+    }
+
+    report = {
+        "stem": stem,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
+        "transforms": {
+            **t_stats,
+            "riduzione_pct": round(reduction),
+        },
+        "structure": profile,
+        "distribution": distribution,
+        "anomalie": {
+            "bare_headers": len(bare_hdrs),
+            "short_sections": len(short_secs),
+            "long_sections": len(long_secs),
+            "bare_headers_list": bare_hdrs,
+            "short_sections_list": short_secs,
+            "long_sections_list": long_secs,
+        },
+        "residui": {
+            # Counts first, then the example lists, both in scan order;
+            # the "formula_headers" pair is emitted last.
+            **{k: len(v) for k, v in residui.items() if k != "formula_headers"},
+            **{f"{k}_esempi": v for k, v in residui.items() if k != "formula_headers"},
+            "formula_headers": len(residui["formula_headers"]),
+            "formula_headers_esempi": residui["formula_headers"],
+        },
+    }
+
+    report_path = out_dir / "report.json"
+    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    return report_path
diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py
new file mode 100644
index 0000000..7eb02dc
--- /dev/null
+++ b/conversione/_pipeline/runner.py
@@ -0,0 +1,110 @@
+import json
+import tempfile
+from pathlib import Path
+
+from .checker import check_pdf
+from .converter import convert_pdf
+from .transforms import apply_transforms
+from .structure import analyze
+from .report import build_report
+
+
+_LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
+
+
+def run(stem: str, project_root: Path, force: bool) -> bool:
+    """Convert sources/<stem>.pdf into conversione/<stem>/; True on success or skip."""
+    pdf_path = project_root / "sources" / f"{stem}.pdf"
+    out_dir = project_root / "conversione" / stem
+    raw_out = out_dir / "raw.md"
+    clean_out = out_dir / "clean.md"
+    print(f"\n{'─' * 52}")
+    print(f" {stem}")
+    print(f"{'─' * 52}")
+
+    if clean_out.exists() and not force:  # existing output is reused unless forced
+        print(f" ⚠️ conversione/{stem}/clean.md già presente — skip")
+        print(f" (usa --force per rieseguire)")
+        return True
+
+    # [1] Validation
+    print(" [1/4] Validazione PDF...")
+    ok, msg = check_pdf(pdf_path)
+    if not ok:
+        print(f" ✗ {msg}")
+        return False
+    print(f" ✅ {msg}")
+
+    # [2] Conversion
+    print(" [2/4] Conversione PDF → Markdown (opendataloader-pdf)...")
+    with tempfile.TemporaryDirectory() as tmp:
+        try:
+            md_file = convert_pdf(pdf_path, Path(tmp))
+        except MemoryError:
+            print(" ✗ Memoria esaurita durante la conversione")
+            return False
+        except Exception as e:
+            print(f" ✗ Conversione fallita: {e}")
+            return False
+        try:  # read while the temporary directory still exists
+            raw_text = md_file.read_text(encoding="utf-8")
+        except UnicodeDecodeError as e:
+            print(f" ✗ Errore encoding nel file prodotto: {e}")
+            return False
+
+    size_kb = len(raw_text.encode()) // 1024
+    n_lines = raw_text.count("\n")
+    print(f" ✅ Markdown grezzo: {size_kb} KB, {n_lines} righe")
+
+    # [3] Structural clean-up
+    print(" [3/4] Pulizia strutturale...")
+    clean_text, t = apply_transforms(raw_text)
+    reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
+    print(f" ✅ Simboli PUA corretti: {t['n_simboli_pua_corretti']}")
+    print(f" Immagini rimosse: {t['n_immagini_rimosse']}")
+    print(f" Note rimosse: {t['n_note_rimosse']}")
+    print(f" Accenti corretti: {t['n_accenti_corretti']}")
+    print(f" Dot-leader rimossi: {t['n_dotleader_rimossi']}")
+    print(f" Header concat fixati: {t['n_header_concat_fixati']}")
+    print(f" Header num. normaliz.: {t['n_header_numerati_normalizzati']}")
+    print(f" Articoli → ###: {t['n_articoli_estratti']}")
+    print(f" Ambienti matematici: {t['n_ambienti_matematici']}")
+    print(f" Titoli header uniti: {t['n_titoli_uniti']}")
+    print(f" TOC rimosso: {'sì' if t['toc_rimosso'] else 'no'}")
+    print(f" Versi poesia riprist.: {t['n_versi_ripristinati']}")
+    print(f" Header verso demotati: {t['n_header_verso_demotati']}")
+    print(f" ALL-CAPS → ##: {t['n_header_allcaps']}")
+    print(f" Sezioni → ###: {t['n_sezioni_numerate']}")
+    print(f" Paragrafi uniti: {t['n_paragrafi_uniti']}")
+    print(f" Formula-hdr demotati: {t['n_formula_headers_demotati']}")
+    print(f" Riduzione testo: {reduction:.0f}%")
+
+    # [4] Structure profile
+    print(" [4/4] Analisi struttura...")
+    try:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        raw_out.write_text(raw_text, encoding="utf-8")
+        clean_out.write_text(clean_text, encoding="utf-8")
+    except PermissionError as e:  # only permission errors are reported; others propagate
+        print(f" ✗ Permesso negato durante la scrittura: {e}")
+        return False
+
+    profile = analyze(clean_out)
+    (out_dir / "structure_profile.json").write_text(
+        json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    print(f" ✅ Struttura: livello {profile['livello_struttura']} — "
+          f"{_LIVELLO_DESC[profile['livello_struttura']]}")
+    print(f" h1={profile['n_h1']} h2={profile['n_h2']} h3={profile['n_h3']} "
+          f"paragrafi={profile['n_paragrafi']}")
+    print(f" Strategia chunking: {profile['strategia_chunking']}")
+    print(f" Lingua rilevata: {profile['lingua_rilevata']}")
+    for w in profile["avvertenze"]:
+        print(f" ⚠️ {w}")
+
+    build_report(stem, out_dir, clean_text, t, profile, reduction)
+
+    print(f"\n Output → conversione/{stem}/")
+    print(f" raw.md (immutabile) clean.md report.json")
+    return True
diff --git a/conversione/_pipeline/transforms.py b/conversione/_pipeline/transforms.py
new file mode 100644
index 0000000..1c6a7cd
--- /dev/null
+++ b/conversione/_pipeline/transforms.py
@@ -0,0 +1,974 @@
+import re
+from collections import Counter
+from functools import partial
+
+# ─── Costanti ────────────────────────────────────────────────────────────────
+
+_TOC_KEYWORDS = frozenset([
+ "indice", "index", "contents", "table of contents",
+ "sommario", "inhaltsverzeichnis", "inhalt",
+ "indice generale", "indice analitico", "indice dei contenuti",
+ "elenco dei capitoli", "argomenti", "table des matières",
+ "tabla de contenidos", "содержание",
+])
+
+_ORDINALS_IT = {
+ "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",
+ "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",
+ "NONO": "IX", "DECIMO": "X",
+}
+_ORDINALS_EN = {
+ "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",
+ "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",
+}
+
+# Mapping PUA Unicode (U+F020-U+F0FF) → simboli Unicode standard.
+# Font Symbol di Windows codifica lettere greche e operatori matematici
+# nel range Private Use Area invece dei codepoint Unicode standard.
+_SYMBOL_PUA_MAP: dict[str, str] = {
+ "": " ",
+ "": "(",
+ "": ")",
+ "": "+",
+ "": "−", # minus
+ "": ".",
+ "": "/",
+ "": "0", "": "1", "": "2", "": "3", "": "4",
+ "": "5", "": "6", "": "7", "": "8", "": "9",
+ "": ":", "": ";", "": "<", "": "=", "": ">",
+ "": "≅", # congruent
+ "": "Α", # Alpha
+ "": "Β", # Beta
+ "": "Χ", # Chi
+ "": "Δ", # Delta
+ "": "Ε", # Epsilon
+ "": "Φ", # Phi
+ "": "Γ", # Gamma
+ "": "Η", # Eta
+ "": "Ι", # Iota
+ "": "ϑ", # theta variant
+ "": "Κ", # Kappa
+ "": "Λ", # Lambda
+ "": "Μ", # Mu
+ "": "Ν", # Nu
+ "": "Ο", # Omicron
+ "": "Π", # Pi
+ "": "Θ", # Theta
+ "": "Ρ", # Rho
+ "": "Σ", # Sigma
+ "": "Τ", # Tau
+ "": "Υ", # Upsilon
+ "": "ς", # sigma final
+ "": "Ω", # Omega
+ "": "Ξ", # Xi
+ "": "Ψ", # Psi
+ "": "Ζ", # Zeta
+ "": "[",
+ "": "∴", # therefore
+ "": "]",
+ "": "⊥", # perpendicular
+ "": "α", # alpha
+ "": "β", # beta
+ "": "χ", # chi
+ "": "δ", # delta
+ "": "ε", # epsilon
+ "": "φ", # phi
+ "": "γ", # gamma
+ "": "η", # eta
+ "": "ι", # iota
+ "": "ϕ", # phi variant
+ "": "κ", # kappa
+ "": "λ", # lambda
+ "": "μ", # mu
+ "": "ν", # nu
+ "": "ο", # omicron
+ "": "π", # pi
+ "": "θ", # theta
+ "": "ρ", # rho
+ "": "σ", # sigma
+ "": "τ", # tau
+ "": "υ", # upsilon
+ "": "ϖ", # pi symbol
+ "": "ω", # omega
+ "": "ξ", # xi
+ "": "ψ", # psi
+ "": "ζ", # zeta
+ "": "{",
+ "": "|",
+ "": "}",
+ "": "~",
+ "": "±", # plus-minus
+ "": "•", # bullet
+ "": "√", # square root
+ "": "≤", # less or equal
+ "": "≥", # greater or equal
+ "": "∝", # proportional
+ "": "×", # multiplication
+ "": "÷", # division
+ "": "×", # alternate multiply
+ "": "≠", # not equal
+ "": "≠", # not equal alternate
+ "": "≥", # greater or equal alternate
+ "": "′", # prime
+ "": "*",
+ "": ",",
+ "": "≤", # less or equal (Symbol 0xA3)
+ "": "•", # bullet (Wingdings 0xA7)
+ "": "•", # bullet variant
+ "": "→", # right arrow (Symbol 0xAE)
+ "": "÷", # division / range separator
+ "": "", # Wingdings decorative icon (rimosso)
+ "": "→", # right arrow variant
+ "": "", # bracket extension piece (non ricostruibile)
+ "": "",
+ "": "",
+ "": "",
+ "": "",
+ "": "", # TeX large paren left U+F8EB
+ "": "", # TeX large paren extension U+F8EC
+ "": "", # TeX large paren right U+F8ED
+ "": "", # TeX large paren right ext U+F8EE
+ "": "", # TeX large bracket left U+F8EF
+ "": "", # TeX large bracket ext U+F8F0
+ "": "", # TeX brace top-left U+F8F1
+ "": "", # TeX brace mid U+F8F2
+ "": "", # TeX brace mid-right U+F8F3
+ "": "", # TeX brace extension U+F8F4
+ "": "", # TeX brace right U+F8F5
+ "": "", # TeX bracket right large U+F8F6
+ "": "", # TeX bracket right ext U+F8F7
+ "": "", # TeX bracket right close U+F8F8
+ "": "", # TeX integral large U+F8F9
+ "": "", # TeX integral extension U+F8FA
+ "": "", # TeX integral top U+F8FB
+ "": "", # TeX radical top U+F8FC
+ "": "", # TeX radical extension U+F8FD
+ "": "", # TeX arrowhead U+F8FE
+}
+
+_SYMBOL_PUA_RE = re.compile(
+ "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]"
+)
+
+_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+')
+_FOOTNOTE_BODY_RE = re.compile(
+ r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)'
+)
+_NUMBERED_HDR_RE = re.compile(
+ r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$",
+ re.MULTILINE,
+)
+_BIB_MARKERS_RE = re.compile(
+ r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
+ r'|\b(19|20)\d{2}\b',
+ re.IGNORECASE,
+)
+_WATERMARK_RE = re.compile(
+ r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN"
+ r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$",
+ re.IGNORECASE | re.MULTILINE,
+)
+
+_MATH_SYMBOLS_RE = re.compile(
+ r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]"
+)
+_EXERCISE_TRIGGER_RE = re.compile(
+ r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that"
+ r"|Compute|Calculate|Dimostrare|Verificare)\b",
+ re.IGNORECASE,
+)
+_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$")
+_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL)
+
+# Erano compilati dentro le funzioni a ogni chiamata — ora costanti di modulo
+_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
+_FM_RE = re.compile(
+ r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|"
+ r"\bCopyright\b|\bLicenza\b|\bEdizione\b|"
+ r"protetto da|tutti i diritti",
+ re.IGNORECASE,
+)
+_VERSE_NUM_RE = re.compile(
+ r'([.!?\xbb\'\"’]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab"“‟])'
+)
+
+
+# ─── Helper puri ─────────────────────────────────────────────────────────────
+
+def _sentence_case(s: str) -> str:
+    """Return *s* lower-cased with only its first character upper-cased."""
+    if not s:  # empty input is returned untouched
+        return s
+    return "".join((s.lower()[0].upper(), s.lower()[1:]))
+
+
+def _is_allcaps_line(line: str) -> bool:
+    """True if *line* has >= 3 letters, all upper-case, and is no header/table row."""
+    body = line.strip()
+    if body.startswith(("#", "|")):
+        return False
+    alpha = [ch for ch in body if ch.isalpha()]
+    if len(alpha) < 3:
+        return False
+    return all(ch.isupper() for ch in alpha)
+
+
+def _allcaps_to_header(raw_line: str) -> str:
+    """Build a ``## `` header from an ALL-CAPS line; chapter forms get a fixed layout."""
+    text = re.sub(r"^[-*+]\s+", "", raw_line.strip())  # drop a leading list bullet
+    text = text.rstrip(".").rstrip("?").strip()
+    _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys())
+    m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text)
+    if m:  # Italian ordinal word -> roman numeral
+        roman = _ORDINALS_IT[m.group(1)]
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Capitolo {roman} — {_sentence_case(titolo)}"
+
+    _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys())
+    m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text)
+    if m:  # English number word -> digits; plain digits are kept
+        n = _ORDINALS_EN.get(m.group(1), m.group(1))
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Chapter {n} — {_sentence_case(titolo)}"
+
+    m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text)
+    if m:  # "IV. TITLE" / "3. TITLE": keep the number, sentence-case the title
+        return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}"
+
+    return f"## {_sentence_case(text)}"
+
+
+def _extract_math_environments(text: str) -> tuple[str, int]:
+    """Turn blocks opening with e.g. ``Teorema 2.1`` into ``###`` headers; return ``(text, n)``."""
+    _ENVS = (
+        r"Definizione|Definition|Teorema|Theorem|Lemma|"
+        r"Proposizione|Proposition|Corollario|Corollary|"
+        r"Osservazione|Remark|Nota|Note|Esempio|Example"
+    )
+    count = 0
+    blocks = text.split("\n\n")
+    result = []
+    for block in blocks:
+        stripped = block.strip()
+        if not stripped or stripped.startswith("#"):  # empty blocks and headers pass through
+            result.append(block)
+            continue
+
+        m = re.match(
+            rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)",
+            stripped,
+            re.DOTALL,
+        )
+        if not m:
+            result.append(block)
+            continue
+
+        env = m.group(1)
+        num = m.group(2).rstrip(".")
+        rest = m.group(3).strip()
+
+        title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)
+        if title_m:  # a parenthesised title right after the number goes into the header
+            header = f"### {env} {num} {title_m.group(1)}"
+            body = title_m.group(2).strip()
+        else:
+            header = f"### {env} {num}."
+            body = rest
+
+        result.append(f"{header}\n\n{body}" if body else header)
+        count += 1
+
+    return "\n\n".join(result), count
+
+
+def _merge_title_headers(text: str) -> tuple[str, int]:
+    """Join a bare numbered header (``## 3.``) with the title in the next block.
+
+    The next block is used as title only if it is a single non-empty line of
+    at most 80 characters that is neither a header nor a numbered item.
+    Returns ``(text, n_merged)``.
+    """
+    blocks = re.split(r"\n{2,}", text)
+    merged, n_merged = [], 0
+    skip_next = False
+    for idx, block in enumerate(blocks):
+        if skip_next:
+            skip_next = False
+            continue
+        head = block.strip()
+        title = blocks[idx + 1].strip() if idx + 1 < len(blocks) else ""
+        is_title = (
+            title and "\n" not in title and len(title) <= 80
+            and not title.startswith("#") and not re.match(r"^\d+[\.\)]\s", title)
+        )
+        if re.match(r"^#{2,3} \d+\.\s*$", head) and is_title:
+            merged.append(head.rstrip() + " " + title)
+            n_merged += 1
+            skip_next = True
+        else:
+            merged.append(block)
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(merged)), n_merged
+
+
+def _extract_article_headers(text: str) -> tuple[str, int]:
+    """Promote list items of the form ``- Art. N. ...`` to ``### Art. N.`` headers.
+
+    Depending on what follows the article number the result is:
+    ``### Art. N. Title.`` plus body, when the rest splits into a capitalised
+    title ending in a period and a body of at least 5 characters;
+    ``### Art. N.`` plus body, when there is text but no such title;
+    ``### Art. N.`` alone, when nothing follows.
+
+    Returns ``(text, n_articles)``.
+    """
+    upper = r"A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda"
+    title_body = re.compile(rf"^([{upper}].{{1,74}}?)\.\s+([{upper}\(\d].{{4,}})")
+
+    def _to_header(m: re.Match) -> str:
+        num = m.group(1)
+        rest = m.group(2).strip()
+        parts = title_body.match(rest)
+        if parts:
+            title, body = parts.group(1), parts.group(2).strip()
+            return f"### Art. {num}. {title}.\n\n{body}"
+        if rest:
+            return f"### Art. {num}.\n\n{rest}"
+        return f"### Art. {num}."
+
+    # Every match becomes a header, so subn's count is the number of articles.
+    return re.subn(
+        r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)",
+        _to_header,
+        text,
+        flags=re.MULTILINE,
+    )
+
+
+# ─── Trasformazioni atomiche ──────────────────────────────────────────────────
+
+def _t_fix_symbol_font(text: str) -> tuple[str, int]:
+    """Map Symbol-font PUA characters to standard Unicode.
+
+    Every character matched by ``_SYMBOL_PUA_RE`` is replaced by its entry in
+    ``_SYMBOL_PUA_MAP``. Returns ``(new_text, n_replacements)``.
+    """
+    # subn counts the substitutions itself, so no mutable counter is needed.
+    fixed, n_fixed = _SYMBOL_PUA_RE.subn(lambda m: _SYMBOL_PUA_MAP[m.group(0)], text)
+    return fixed, n_fixed
+
+
+def _t_remove_images(text: str) -> tuple[str, int]:
+    """Strip Markdown images ``![alt](src)`` plus trailing whitespace; return ``(text, n)``."""
+    # One subn pass both removes the images and counts them.
+    return re.subn(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
+
+
+def _t_remove_footnotes(text: str) -> tuple[str, int]:
+    """Drop footnote-body lines, strip superscript markers; return ``(text, n_lines_touched)``."""
+    kept, n_touched = [], 0
+    for raw in text.split("\n"):
+        probe = raw.strip()
+        # Lines under 300 chars that open with a footnote marker are dropped whole.
+        if probe and len(probe) < 300 and _FOOTNOTE_BODY_RE.match(probe):
+            n_touched += 1
+            continue
+        unmarked = _SUPERSCRIPT_RE.sub("", raw)
+        n_touched += unmarked != raw
+        kept.append(unmarked)
+    return "\n".join(kept), n_touched
+
+
+def _t_fix_br(text: str) -> tuple[str, int]:
+    """Replace each HTML line break (``<br>``, ``<br/>``, ``<br />``) with one space."""
+    # Case-insensitive; whitespace after the tag is absorbed. Returns (text, n_tags).
+    return re.subn(r"<br\s*/?>\s*", " ", text, flags=re.IGNORECASE)
+
+
+def _t_fix_tabsep(text: str) -> tuple[str, int]:
+    """Blank out every match of ``_TABSEP_RE``; return ``(text, n_matches)``."""
+    # subn reports the match count directly instead of a separate findall pass.
+    return _TABSEP_RE.subn("", text)
+
+
+def _t_fix_accents(text: str) -> tuple[str, int]:
+    """Resolve backtick accents (a backtick next to a vowel) and drop leftover backticks.
+
+    Returns ``(text, n)``; *n* counts accent fixes plus orphan backticks removed.
+    """
+    accented = {
+        "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0",
+        "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc",
+        "o": "\xf2", "O": "\xd2",
+    }
+    # Every backtick ends up consumed, so the total equals the initial backtick count.
+    n_backticks = text.count("`")
+    for pattern in (r"`([eEaAuUiIoO])", r"([eEaAuUiIoO])`"):
+        text = re.sub(pattern, lambda m: accented[m.group(1)], text)
+    return text.replace("`", ""), n_backticks
+
+
+def _t_fix_multiplication(text: str) -> tuple[str, int]:
+    """Turn a ``"`` between a digit and a digit or ``(`` into ``×``; return ``(text, n)``."""
+    # The lookarounds keep the neighbouring characters out of the match.
+    return re.subn(r'(?<=[0-9])"(?=[0-9(])', "×", text)
+
+
+def _t_fix_micro(text: str) -> tuple[str, int]:
+    """Rewrite ``<digit>!<unit>`` as ``<digit> µ<unit>``; return ``(text, n_fixed)``."""
+    si_unit = r'[mAsgVWFHTKNJClΩ]'
+    # A "!" between a digit and one of these unit letters is replaced by "µ".
+    return re.subn(rf'(\d)\s*!({si_unit})', r'\1 µ\2', text)
+
+
+def _t_remove_formula_labels(text: str) -> tuple[str, int]:
+    """Remove equation labels like ``[3.14]``; return ``(text, n_removed)``."""
+    # Absorb only spaces/tabs so paragraph breaks survive; end-of-line labels leave no space.
+    return re.subn(r"(?m)[ \t]*\[\d+\.\d+\][ \t]*($)?", lambda m: " " if m[1] is None else "", text)
+
+
+def _t_remove_dotleaders(text: str) -> tuple[str, int]:
+    """Blank dot-leader lines and lines holding only a roman numeral.
+
+    Returns ``(text, n)`` where *n* counts only the dot-leader lines.
+    """
+    leader_line = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$"
+    text, n_leaders = re.subn(leader_line, "", text, flags=re.MULTILINE)
+    # Lines made of just a roman numeral (i..xii, any case) are blanked as well.
+    roman_only = r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$"
+    text = re.sub(roman_only, "", text, flags=re.IGNORECASE)
+    return text, n_leaders
+
+
+def _t_fix_header_concat(text: str) -> tuple[str, int]:
+    """Split headers that have body text glued to the title; return ``(text, n_split)``."""
+    count = 0
+    def _fix(m: re.Match) -> str:
+        nonlocal count
+        hashes = m.group(1)
+        full = m.group(2).strip()
+        if len(full) < 60:  # headers under 60 characters are left alone
+            return m.group(0)
+        skip = min(10, len(full) // 3)  # ignore case changes in the first characters
+        split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", full[skip:])
+        if split:  # a lower-case letter directly followed by an upper-case one
+            pos = skip + split.start()
+            title = full[:pos].strip()
+            body = full[pos:].strip()
+            if len(title) >= 5 and len(body) >= 15:
+                count += 1
+                return f"{hashes} {title}\n\n{body}"
+        return m.group(0)
+
+    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
+    return text, count
+
+
+def _t_extract_capitolo(text: str) -> tuple[str, int]:
+    """Promote inline ``Capitolo N: TITOLO`` runs to ``## Capitolo N: Titolo`` headers."""
+    chapter = re.compile(
+        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
+        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
+        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)"
+    )
+
+    def _as_header(m: re.Match) -> str:
+        title = _sentence_case(m.group(2).strip().rstrip("- ").strip())
+        return f"\n\n## Capitolo {m.group(1)}: {title}\n\n"
+
+    # No statistic is kept for this step: the count slot is always 0.
+    return chapter.sub(_as_header, text), 0
+
+
+def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
+    """Align the level of numbered headers with their numbering depth; return ``(text, n_changed)``."""
+    all_matches = list(_NUMBERED_HDR_RE.finditer(text))
+    if not all_matches:
+        return text, 0
+    pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches]  # (depth, level)
+    depths = [d for d, _ in pairs]
+    min_depth = min(depths)
+    max_depth = max(depths)
+    if max_depth == min_depth:  # a single numbering depth: nothing to re-level
+        return text, 0
+
+    base_level = min(lv for d, lv in pairs if d == min_depth)  # level given to the shallowest numbers
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        hashes, num, title = m.group(1), m.group(2), m.group(3)
+        depth = num.count(".") + 1
+        new_level = min(base_level + (depth - min_depth), 6)  # one level per extra component, max 6
+        if new_level == len(hashes):
+            return m.group(0)
+        count += 1
+        return f"{'#' * new_level} {num}. {title}"
+
+    return _NUMBERED_HDR_RE.sub(_repl, text), count
+
+
+def _t_normalize_header_levels(text: str) -> tuple[str, int]:
+    """Cap header depth at ``###``; the returned count is always 0."""
+    multiline = re.MULTILINE
+    # Level 3-6 headers with no title are blanked.
+    text = re.sub(r"^#{3,6}\s*$", "", text, flags=multiline)
+    # "#### 12 Title" (number without a dot) becomes "### 12. Title".
+    text = re.sub(r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", r"### \2. \3", text, flags=multiline)
+    # Whatever is still at level 4-6 is moved up to level 3.
+    text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=multiline)
+    return text, 0
+
+
+def _t_extract_articles(text: str) -> tuple[str, int]:  # thin wrapper, same return value
+    return _extract_article_headers(text)
+
+
+def _t_remove_header_bold(text: str) -> tuple[str, int]:
+    """Unwrap ``**bold**`` around a whole header title; the count is always 0."""
+    bold_header = re.compile(r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", re.MULTILINE)
+    # Group 1 keeps the hashes, group 2 the title without the asterisks.
+    unwrapped = bold_header.sub(r"\1 \2", text)
+    # Bold text outside headers does not match and is left alone.
+    return unwrapped, 0
+
+
+def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
+    """Sentence-case headers whose letters are all upper-case; the count is always 0."""
+    def _norm(m: re.Match) -> str:
+        title = m.group(2).strip()
+        alpha = [ch for ch in title if ch.isalpha()]
+        # Titles without letters, or with a non-upper-case letter, are kept as is.
+        if not alpha or not all(ch.isupper() for ch in alpha):
+            return m.group(0)
+        return f"{m.group(1)} {_sentence_case(title)}"
+    return re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE), 0
+
+
+def _t_remove_toc(text: str) -> tuple[str, int]:
+    """Drop table-of-contents titles and the entry lines that follow them.
+
+    Returns ``(text, flag)`` with *flag* 1 if at least one TOC title was found.
+    """
+    kept: list[str] = []
+    inside_toc = found = False
+    for line in text.split("\n"):
+        title = re.sub(r"^#+\s*", "", line.strip()).split(".")[0].strip().lower()
+        if title in _TOC_KEYWORDS:
+            inside_toc = found = True
+            continue
+        # While inside a TOC, skip blank lines and bullet entries that start
+        # with a digit or end with a 1-3 digit number.
+        if inside_toc and (
+            re.match(r"^\s*$", line)
+            or re.match(r"^\s*[-*+]\s+\d", line)
+            or re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line)
+        ):
+            continue
+        # Any other line ends the TOC and is kept.
+        inside_toc = False
+        kept.append(line)
+    return "\n".join(kept), 1 if found else 0
+
+
+def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
+    """Promote ALL-CAPS lines to headers; return ``(text, n_promoted)``.
+
+    A block made of a single ALL-CAPS line is always promoted; any other
+    line is promoted only if it is longer than 3 characters once stripped.
+    """
+    promoted, out_blocks = 0, []
+    for block in text.split("\n\n"):
+        whole = block.strip()
+        if "\n" not in whole and _is_allcaps_line(whole):
+            out_blocks.append(_allcaps_to_header(whole))
+            promoted += 1
+            continue
+        lines = block.split("\n")
+        for idx, ln in enumerate(lines):
+            if _is_allcaps_line(ln) and len(ln.strip()) > 3:
+                lines[idx] = _allcaps_to_header(ln)
+                promoted += 1
+        out_blocks.append("\n".join(lines))
+    return "\n\n".join(out_blocks), promoted
+
+
+def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
+ count = 0
+
+ def _num_repl(m: re.Match) -> str:
+ nonlocal count
+ content = m.group(2).strip()
+ if content.endswith(".") and len(content) > 40:
+ return m.group(0)
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
+ count += 1
+ return f"### {m.group(1)}.\n\n{content}"
+
+ text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)
+
+ def _num_letter_repl(m: re.Match) -> str:
+ nonlocal count
+ count += 1
+ return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"
+
+ text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)
+
+ if not has_exercises:
+ def _aphorism_repl(m: re.Match) -> str:
+ nonlocal count
+ content = m.group(2).strip()
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
+ count += 1
+ return f"\n\n### {m.group(1)}.\n\n{content}"
+
+ text = re.sub(
+ r"^-\s+(\d{1,3})\.\s+(.{10,})$",
+ _aphorism_repl,
+ text,
+ flags=re.MULTILINE,
+ )
+
+ def _list_section_repl(m: re.Match) -> str:
+ nonlocal count
+ num = m.group(1)
+ content = m.group(2).strip()
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
+ count += 1
+ split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content)
+ if split and split.start() >= 3:
+ title = content[: split.start()].strip()
+ body = content[split.end():].strip()
+ if len(body) >= 20:
+ return f"\n\n### {num}. {title}\n\n{body}"
+ return f"\n\n### {num}. {content}"
+
+ text = re.sub(
+ r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",
+ _list_section_repl,
+ text,
+ flags=re.MULTILINE,
+ )
+ return text, count
+
+
+def _t_extract_math(text: str) -> tuple[str, int]:  # thin wrapper, same return value
+    return _extract_math_environments(text)
+
+
+def _t_merge_paragraphs(text: str) -> tuple[str, int]:  # re-join blocks split mid-sentence
+    _SENTENCE_END = set(".?!\xbb)\"'")  # a block ending in one of these is not extended
+    blocks = text.split("\n\n")
+    merged = []
+    count = 0  # number of joins performed
+    i = 0
+    while i < len(blocks):
+        b = blocks[i]
+        stripped = b.strip()
+        while (  # keep absorbing following blocks while the current one looks unfinished
+            i + 1 < len(blocks)
+            and stripped
+            and not stripped.startswith("#")
+            and not stripped.startswith("|")
+            and stripped[-1] not in _SENTENCE_END
+        ):
+            nxt = blocks[i + 1].strip()
+            if (  # never absorb empty blocks, headers, table rows or list items
+                not nxt
+                or nxt.startswith("#")
+                or nxt.startswith("|")
+                or re.match(r"^\d+\.", nxt)
+                or re.match(r"^[-*+]\s", nxt)
+            ):
+                break
+            b = stripped + " " + nxt
+            stripped = b.strip()
+            count += 1
+            i += 1
+        merged.append(b)
+        i += 1
+    text = "\n\n".join(merged)
+    text = re.sub(r"(?m)^\|---\|\s*", "", text)  # drop "|---|" separators at line start
+    return text, count
+
+
+def _t_normalize_whitespace(text: str) -> tuple[str, int]:
+    """Collapse runs of spaces inside non-blank lines; the count is always 0."""
+    squeezed = []
+    for line in text.split("\n"):
+        # Whitespace-only lines are left exactly as they are.
+        squeezed.append(re.sub(r" +", " ", line) if line.strip() else line)
+    return "\n".join(squeezed), 0
+
+
+def _t_collapse_blank_lines(text: str) -> tuple[str, int]:  # count slot is always 0
+    return re.sub(r"\n{3,}", "\n\n", text), 0  # 3+ consecutive newlines become exactly 2
+
+
+def _t_demote_verse_headers(text: str) -> tuple[str, int]:
+    """Turn headers that look like numbered verse lines back into plain text."""
+    demoted = 0
+
+    def _demote(m: re.Match) -> str:
+        nonlocal demoted
+        content = m.group(2).strip()
+        verse = re.sub(r"\s\d{1,4}\s*$", "", content)  # drop the trailing number
+        # A verse has that number AND punctuation followed by further text.
+        if verse == content or not re.search(
+                r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab"“]', verse):
+            return m.group(0)
+        demoted += 1
+        return verse
+
+    text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE)
+    return text, demoted
+
+
+def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
+    """Re-break blocks at inline verse numbers (``_VERSE_NUM_RE``); return ``(text, n_numbers)``."""
+    count = 0
+    blocks = text.split("\n\n")
+    result = []
+    for block in blocks:
+        stripped = block.strip()
+        if not stripped or stripped.startswith("#"):
+            result.append(block)
+            continue
+
+        matches = list(_VERSE_NUM_RE.finditer(stripped))
+        if len(matches) < 2:  # at least two inline numbers are required
+            result.append(block)
+            continue
+
+        nums = [int(m.group(2)) for m in matches]
+        diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)]
+        if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5):  # numbering must be regular
+            result.append(block)
+            continue
+
+        step = diffs[0]
+
+        def _replace_verse_num(m: re.Match) -> str:
+            n = int(m.group(2))
+            sep = "\n\n" if n % (step * 3) == 0 else "\n"  # blank line at every multiple of 3 steps
+            return m.group(1).rstrip() + sep
+
+        new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped)
+        if new_block != stripped:
+            count += len(matches)
+        result.append(new_block)
+
+    return "\n\n".join(result), count
+
+
+def _t_remove_urls(text: str) -> tuple[str, int]:  # count slot is always 0
+    return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0  # only URLs filling a whole line
+
+
+def _t_remove_empty_headers(text: str) -> tuple[str, int]:
+    """Drop single-line headers followed by nothing or by another header.
+    The returned count is always 0."""
+    blocks = re.split(r"\n{2,}", text)
+    kept = []
+    for idx, block in enumerate(blocks):
+        head = block.strip()
+        following = blocks[idx + 1].strip() if idx + 1 < len(blocks) else ""
+        is_header = re.match(r"^#{1,6} ", head) and "\n" not in head
+        next_is_header = re.match(r"^#{1,6} ", following)
+        # A following header longer than 80 chars does not make this one empty.
+        next_counts_as_header = next_is_header and len(following) <= 80
+        if is_header and (not following or next_counts_as_header):
+            continue
+        kept.append(block)
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), 0
+
+
def _t_merge_title_headers(text: str) -> tuple[str, int]:
    """Pipeline adapter that delegates to ``_merge_title_headers``.

    The helper is defined elsewhere in this module; its result is relayed
    unchanged.
    """
    outcome = _merge_title_headers(text)
    return outcome
+
+
+def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
+ def _is_garbage(content: str) -> bool:
+ if content.lstrip().startswith("..."):
+ return True
+ if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content):
+ return True
+ if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
+ return True
+ if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
+ return True
+ first_alpha = next((c for c in content if c.isalpha()), None)
+ if first_alpha and first_alpha.islower() and len(content) > 40:
+ return True
+ if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()):
+ return True
+ if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE):
+ return True
+ return False
+
+ count = 0
+ lines = text.split("\n")
+ new_lines = []
+ for line in lines:
+ m = re.match(r"^#{1,6} (.+)$", line)
+ if m and _is_garbage(m.group(1)):
+ count += 1
+ continue
+ new_lines.append(line)
+ text = "\n".join(new_lines)
+ text = re.sub(r"\n{3,}", "\n\n", text)
+ return text, count
+
+
def _t_remove_frontmatter(text: str) -> tuple[str, int]:
    """Remove front-matter ``###`` headers from the opening part of the text.

    Only the first blocks are inspected: 20% of all blank-line-separated
    blocks, clamped to the range 5-15.  Within that window a ``### `` header
    that is not followed by a digit is dropped when ``_FM_RE`` (defined
    elsewhere in this module) matches the header itself, or matches the
    next block and that block is shorter than 250 characters.  Only the
    header block is removed; the block after it is still processed on its
    own.

    Returns:
        ``(new_text, count)`` with the number of headers removed; runs of
        three or more newlines are collapsed to two.
    """
    chunks = re.split(r"\n{2,}", text)
    n_chunks = len(chunks)
    window = max(5, min(15, int(n_chunks * 0.20)))

    kept = []
    removed = 0
    for pos, chunk in enumerate(chunks):
        head = chunk.strip()
        candidate = (
            pos < window
            and re.match(r"^### ", head) is not None
            and re.match(r"^### \d", head) is None
        )
        if candidate:
            following = chunks[pos + 1].strip() if pos + 1 < n_chunks else ""
            body_hit = len(following) < 250 and _FM_RE.search(following)
            if body_hit or _FM_RE.search(head):
                removed += 1
                continue
        kept.append(chunk)

    rejoined = "\n\n".join(kept)
    return re.sub(r"\n{3,}", "\n\n", rejoined), removed
+
+
def _t_remove_watermarks(text: str) -> tuple[str, int]:
    """Drop every line matched (from its start) by ``_WATERMARK_RE``.

    ``_WATERMARK_RE`` is defined elsewhere in this module.

    Returns:
        ``(new_text, count)`` with the number of lines removed.
    """
    rows = text.split("\n")
    survivors = [row for row in rows if not _WATERMARK_RE.match(row)]
    return "\n".join(survivors), len(rows) - len(survivors)
+
+
+def _t_fix_math_symbols(text: str) -> tuple[str, int]:
+ lines = text.split("\n")
+ result, count = [], 0
+ for line in lines:
+ if line.strip() and re.match(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$", line):
+ count += 1
+ else:
+ result.append(line)
+ return "\n".join(result), count
+
+
+def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
+ lines = text.split("\n")
+ short_lines = [
+ ln.strip() for ln in lines
+ if 3 < len(ln.strip()) < 80
+ and not ln.strip().startswith("#")
+ and not ln.strip().startswith("|")
+ ]
+ freq = Counter(short_lines)
+ recurring = {ln for ln, c in freq.items() if c >= 5}
+ if not recurring:
+ return text, 0
+ result, count = [], 0
+ for line in lines:
+ if line.strip() in recurring:
+ count += 1
+ else:
+ result.append(line)
+ return "\n".join(result), count
+
+
def _t_math_header_demotion(text: str) -> tuple[str, int]:
    """Turn over-long headers that hold formulas or exercises into body text.

    A line matched by ``_MATH_HDR_RE`` is demoted when its group 2 is longer
    than 100 characters and either contains at least three
    ``_MATH_SYMBOLS_RE`` matches or is matched by ``_EXERCISE_TRIGGER_RE``.
    On demotion the header markup is dropped; if ``_NUMBERED_PREFIX_RE``
    matches the body, its group 1 is emitted in bold followed by group 2.
    All four patterns are defined elsewhere in this module.

    Returns:
        ``(new_text, count)`` with the number of headers demoted.
    """
    out = []
    demoted = 0

    for row in text.split("\n"):
        hdr = _MATH_HDR_RE.match(row)
        if hdr is None:
            out.append(row)
            continue

        payload = hdr.group(2)
        formula_like = len(payload) > 100 and (
            len(_MATH_SYMBOLS_RE.findall(payload)) >= 3
            or bool(_EXERCISE_TRIGGER_RE.search(payload))
        )
        if not formula_like:
            out.append(row)
            continue

        numbered = _NUMBERED_PREFIX_RE.match(payload)
        out.append(
            f"**{numbered.group(1)}** {numbered.group(2)}" if numbered else payload
        )
        demoted += 1

    return "\n".join(out), demoted
+
+
+# ─── Orchestratore ───────────────────────────────────────────────────────────
+
def apply_transforms(text: str) -> tuple[str, dict]:
    """Run every structural transform over the raw Markdown, in order.

    The order is semantic: encoding fixes, then header structure, then
    structure building, then text, then final clean-up.  Each step receives
    the current text and returns ``(new_text, n)``; ``n`` is accumulated in
    the statistics under the step's key, or discarded when the key is
    ``None``.

    Returns:
        ``(transformed_text, stats)``.  ``stats["toc_rimosso"]`` is coerced
        to a bool; every other entry is the accumulated count.
    """
    exercises_present = (
        re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)
        is not None
    )

    def _strip_header_page_suffix(t: str) -> tuple[str, int]:
        # Removes a trailing "| <1-3 digits>" from header lines; untracked.
        return re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0

    pipeline: list[tuple[str | None, object]] = [
        ("n_simboli_pua_corretti", _t_fix_symbol_font),
        ("n_immagini_rimosse", _t_remove_images),
        ("n_br_rimossi", _t_fix_br),
        ("n_tabsep_rimossi", _t_fix_tabsep),
        ("n_note_rimosse", _t_remove_footnotes),
        ("n_accenti_corretti", _t_fix_accents),
        ("n_moltiplicazioni_corrette", _t_fix_multiplication),
        ("n_micro_corretti", _t_fix_micro),
        ("n_simboli_math_rimossi", _t_fix_math_symbols),
        ("n_formule_rimossi", _t_remove_formula_labels),
        ("n_dotleader_rimossi", _t_remove_dotleaders),
        ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines),
        ("n_header_concat_fixati", _t_fix_header_concat),
        (None, _t_extract_capitolo),
        ("n_header_numerati_normalizzati", _t_normalize_numbered_headings),
        (None, _t_normalize_header_levels),
        ("n_articoli_estratti", _t_extract_articles),
        (None, _t_remove_header_bold),
        (None, _t_normalize_allcaps_headers),
        ("toc_rimosso", _t_remove_toc),
        ("n_header_allcaps", _t_allcaps_to_headers),
        (
            "n_sezioni_numerate",
            partial(_t_numbered_sections, has_exercises=exercises_present),
        ),
        ("n_ambienti_matematici", _t_extract_math),
        ("n_paragrafi_uniti", _t_merge_paragraphs),
        (None, _t_normalize_whitespace),
        (None, _t_collapse_blank_lines),
        ("n_versi_ripristinati", _t_restore_poetry_lines),
        ("n_header_verso_demotati", _t_demote_verse_headers),
        (None, _t_remove_urls),
        (None, _t_remove_empty_headers),
        ("n_titoli_uniti", _t_merge_title_headers),
        (None, _strip_header_page_suffix),
        ("n_garbage_headers_rimossi", _t_remove_garbage_headers),
        ("n_formula_headers_demotati", _t_math_header_demotion),
        ("n_frontmatter_rimossi", _t_remove_frontmatter),
        ("n_watermark_rimossi", _t_remove_watermarks),
    ]

    stats: dict = {}
    for key, step in pipeline:
        text, delta = step(text)
        if not key:
            continue
        stats[key] = stats.get(key, 0) + delta

    # The TOC entry is reported as a flag rather than a count.
    stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0))
    return text, stats
diff --git a/conversione/_pipeline/validator.py b/conversione/_pipeline/validator.py
new file mode 100644
index 0000000..8e79e16
--- /dev/null
+++ b/conversione/_pipeline/validator.py
@@ -0,0 +1,152 @@
+import json
+import sys
+from pathlib import Path
+
+_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]
+
+
+def _score(r: dict) -> tuple[int, list[str]]:
+ """
+ Voto 0-100 sulla qualità del clean.md per vettorizzazione.
+
+ Penalità struttura:
+ livello 0 (assente) → −40
+ livello 1 (piatto) → −15
+
+ Penalità residui (degradano il retrieval):
+ backtick → −2/cad (max −20)
+ dot-leader → −5/cad (max −10)
+ URL/watermark → −5/cad (max −15)
+ immagini → −5/cad (max −10)
+
inline → −2/cad (max −15)
+ simboli encoding → −1/cad (max −10)
+ formule inline [N.M] → −1/cad (max −8)
+ footnote residui → −1/cad (max −8)
+ caratteri PUA → −2/cad (max −20)
+
+ Penalità anomalie:
+ bare headers → −3/cad (max −15)
+ """
+ score = 100
+ detail = []
+ structure = r.get("structure", {})
+ anomalie = r.get("anomalie", {})
+ residui = r.get("residui", {})
+
+ livello = structure.get("livello_struttura", 0)
+ if livello == 0:
+ score -= 40
+ detail.append("struttura assente −40")
+ elif livello == 1:
+ score -= 15
+ detail.append("struttura piatta −15")
+
+ def _pen(key: str, per_item: int, cap: int, label: str) -> None:
+ n = residui.get(key, 0)
+ if n:
+ p = min(cap, n * per_item)
+ nonlocal score
+ score -= p
+ detail.append(f"{label} ×{n} −{p}")
+
+ _pen("backtick", 2, 20, "backtick")
+ _pen("dotleader", 5, 10, "dot-leader")
+ _pen("url", 5, 15, "url")
+ _pen("immagini", 5, 10, "immagini")
+ _pen("br_inline", 2, 15, "
inline")
+ _pen("simboli_encoding", 1, 10, "simboli encoding")
+ _pen("formule_inline", 1, 8, "formule inline")
+ _pen("footnote_markers", 1, 8, "footnote residui")
+ _pen("pua_markers", 2, 20, "caratteri PUA font Symbol")
+ _pen("formula_headers", 3, 15, "formula/esercizio come header")
+
+ n_bare = anomalie.get("bare_headers", 0)
+ if n_bare:
+ p = min(15, n_bare * 3)
+ score -= p
+ detail.append(f"bare headers ×{n_bare} −{p}")
+
+ return max(0, score), detail
+
+
def _grade(score: int) -> str:
    """Map a numeric score to its letter grade using ``_GRADES``.

    The first entry of ``_GRADES`` whose threshold the score reaches wins.
    A score below every threshold falls back to the last (lowest) grade
    instead of leaking a bare ``StopIteration`` out of ``next()``.
    """
    return next(
        (letter for threshold, letter in _GRADES if score >= threshold),
        _GRADES[-1][1],
    )
+
+
def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
    """Print a quality table for the conversion reports and a summary line.

    Args:
        stems: Document stems to report on; each is looked up as
            ``<project_root>/conversione/<stem>/report.json``.  When empty,
            every ``conversione/*/report.json`` is used, sorted by path.
        project_root: Root directory that contains ``conversione``.
        detail: When true, the individual penalties behind each score are
            printed under the document's row.

    Exits the process with status 0 (after printing a notice) when there is
    no report path to look at.  A requested report that does not exist is
    listed as not found and excluded from the average.
    """
    conv_dir = project_root / "conversione"

    if stems:
        paths = [conv_dir / s / "report.json" for s in stems]
    else:
        paths = sorted(conv_dir.glob("*/report.json"))

    if not paths:
        print("Nessun report.json trovato in conversione/*/")
        sys.exit(0)

    rows = []
    for p in paths:
        if p.exists():
            row = json.loads(p.read_text(encoding="utf-8"))
            # A report without "stem" would otherwise raise KeyError when its
            # row is printed; fall back to the name of its directory.
            row.setdefault("stem", p.parent.name)
        else:
            row = {"stem": p.parent.name, "_missing": True}
        rows.append(row)

    col = max(len(r["stem"]) for r in rows) + 2
    header = (
        f"{'stem':<{col}}"
        f"{'h2':>4}{'h3':>5} "
        f"{'strategia':<18}"
        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}"
        f"{'med':>6}"
        f" {'voto':>4} grade"
    )
    sep = "─" * len(header)
    print(f"\n{header}\n{sep}")

    scores = []
    for r in rows:
        if r.get("_missing"):
            print(f"{r['stem']:<{col}} (report.json non trovato)")
            continue

        st = r.get("structure", {})
        an = r.get("anomalie", {})
        res = r.get("residui", {})
        dist = r.get("distribution", {})
        s, pen = _score(r)
        scores.append(s)

        print(
            f"{r['stem']:<{col}}"
            f"{st.get('n_h2', 0):>4}"
            f"{st.get('n_h3', 0):>5} "
            f"{st.get('strategia_chunking','?'):<18}"
            f"{an.get('bare_headers', 0):>5}"
            f"{an.get('short_sections', 0):>6}"
            f"{an.get('long_sections', 0):>7}"
            f"{res.get('backtick', 0):>5}"
            f"{res.get('br_inline', 0):>4}"
            f"{res.get('simboli_encoding', 0):>4}"
            f"{res.get('url', 0):>4}"
            f"{res.get('formula_headers', 0):>5}"
            f"{dist.get('mediana', 0):>6}"
            f" {s:>4} {_grade(s)}"
        )
        if detail and pen:
            for p in pen:
                print(f" {'':>{col}} ↳ {p}")

    print(sep)
    if scores:
        # Round once and use the same value for display and for grading, so
        # the letter always agrees with the number shown (e.g. 89.6 -> 90 A).
        media = round(sum(scores) / len(scores))
        print(
            f"Documenti: {len(scores)} "
            f"Media: {media}/100 {_grade(media)} "
            f"(A≥90 B≥75 C≥60 D≥40 F<40)"
        )
    print(
        "\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch "
        "btk=backtick br=<br> inline enc=simboli encoding fhdr=formula-header med=mediana chars\n"
    )