refactor(pipeline): modularizza apply_transforms in 26 funzioni _t_xxx

Estrae ogni trasformazione strutturale in una funzione dedicata _t_xxx(text) -> tuple[str, int], sostituendo la mega-funzione da 418 righe con un loop su una lista di coppie (stat_key, fn). Aggiunge _parse_sections_with_body(), condivisa tra analyze() e build_report(). Output identico verificato su tutti e 5 gli stem esistenti.
2026-04-17 09:46:50 +02:00
parent 875a342efa
commit 757df26bc2
1 changed files with 223 additions and 201 deletions
@@ -31,6 +31,7 @@ import subprocess
 import sys
 import tempfile
 from datetime import datetime
+from functools import partial
 from pathlib import Path


@@ -340,52 +341,29 @@ def _extract_article_headers(text: str) -> tuple[str, int]:
    return text, count


-def apply_transforms(text: str) -> tuple[str, dict]:
-    """
-    Applica le trasformazioni strutturali al Markdown grezzo.
-    Restituisce (testo_modificato, statistiche).
-    """
-    stats = {
-        "toc_rimosso": False,
-        "n_immagini_rimosse": 0,
-        "n_accenti_corretti": 0,
-        "n_moltiplicazioni_corrette": 0,
-        "n_micro_corretti": 0,
-        "n_br_rimossi": 0,
-        "n_formule_rimossi": 0,
-        "n_garbage_headers_rimossi": 0,
-        "n_frontmatter_rimossi": 0,
-        "n_dotleader_rimossi": 0,
-        "n_header_concat_fixati": 0,
-        "n_articoli_estratti": 0,
-        "n_ambienti_matematici": 0,
-        "n_titoli_uniti": 0,
-        "n_header_allcaps": 0,
-        "n_sezioni_numerate": 0,
-        "n_paragrafi_uniti": 0,
-        "n_tabsep_rimossi": 0,
-    }
+# ─── [3a] Funzioni di trasformazione ─────────────────────────────────────────

-    # 0. Rimuovi riferimenti immagini (artefatti opendataloader-pdf)
-    stats["n_immagini_rimosse"] = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
+def _t_remove_images(text: str) -> tuple[str, int]:
+    n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
    text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
+    return text, n

-    # 0_br. Rimuovi tag <br> residui da tabelle e blocchi formula PDF
-    #     Nelle celle di tabella produce spazio; nel testo inline elimina rumore.
-    stats["n_br_rimossi"] = len(re.findall(r"<br>", text, re.IGNORECASE))
+
+def _t_fix_br(text: str) -> tuple[str, int]:
+    n = len(re.findall(r"<br>", text, re.IGNORECASE))
    text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
+    return text, n

-    # 0_tabsep. Rimuovi separatori tabella PDF: "| |" (riga vuota) e "|---|" (separatore).
-    #     Nascono da tabelle non strutturate nel PDF. Rimossi PRIMA del merge paragrafi
-    #     (step 5) altrimenti "|---|" viene fuso con il paragrafo successivo producendo
-    #     righe tipo "|---| Una caratterizzazione analoga...".
-    _pat_tabsep = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
-    stats["n_tabsep_rimossi"] = len(_pat_tabsep.findall(text))
-    text = _pat_tabsep.sub("", text)

-    # 0a. Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc.
-    #     I PDF prodotti da LaTeX estraggono gli accenti gravi come backtick separati
-    #     dalla vocale accentata. Esempi: "`e" → "è", "puo`" → "può", "sar`a" → "sarà"
+def _t_fix_tabsep(text: str) -> tuple[str, int]:
+    _pat = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
+    n = len(_pat.findall(text))
+    text = _pat.sub("", text)
+    return text, n
+
+
+def _t_fix_accents(text: str) -> tuple[str, int]:
+    """Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc."""
    _ACCENT_MAP = {
        "e": "è", "E": "È", "a": "à", "A": "À",
        "u": "ù", "U": "Ù", "i": "ì", "I": "Ì", "o": "ò", "O": "Ò",
@@ -393,73 +371,61 @@ def apply_transforms(text: str) -> tuple[str, dict]:
    n_bt_before = text.count("`")
    text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text)
    text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
-    stats["n_accenti_corretti"] = n_bt_before - text.count("`")
-
+    n_accenti = n_bt_before - text.count("`")
    # Backtick orfani: artefatti LaTeX rimasti dopo la correzione vocale
-    # (es. "propriet`" da "proprietà", "continuit`" da "continuità").
-    # In testi PDF non esistono backtick legittimi → rimozione sicura.
    n_bt_orfani = text.count("`")
    if n_bt_orfani:
        text = re.sub(r"`", "", text)
-        stats["n_accenti_corretti"] += n_bt_orfani
+        n_accenti += n_bt_orfani
+    return text, n_accenti

-    # 0a2. Fix segno di moltiplicazione "→× (encoding font PDF non-standard)
-    #     Esempi: 2"107 → 2×107,  2"(10-2 m)3 → 2×(10-2 m)3
-    #     Lookbehind SOLO su cifra: evita falsi positivi tipo t1"t0 (→ limite)
-    #     o h"hf (→ differenza) dove la lettera prima della " non indica prodotto.
-    _n_cross = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
+
+def _t_fix_multiplication(text: str) -> tuple[str, int]:
+    """Fix segno di moltiplicazione "→× (encoding font PDF non-standard)."""
+    n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
    text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text)
-    stats["n_moltiplicazioni_corrette"] = _n_cross
+    return text, n

-    # 0a3. Fix prefisso micro !→µ prima di unità SI note
-    #     "1 !m" → "1 µm",  "1 !A" → "1 µA",  "3 !s-1" → "3 µs-1"
-    #     Pattern stretto: cifra + spazio opzionale + ! + lettera unità SI a scelta ristretta.
-    #     Non tocca "4! steradianti" (spazio dopo !) né "mol!K" (non preceduto da cifra).
+
+def _t_fix_micro(text: str) -> tuple[str, int]:
+    """Fix prefisso micro !→µ prima di unità SI note."""
    _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]'
-    _n_micro = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
+    n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
    text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text)
-    stats["n_micro_corretti"] = _n_micro
+    return text, n

-    # 0a4. Rimuovi label formule inline [N.M] — es. [3.4], [10.7], [5.25]
-    #     Non aggiungono valore semantico per il RAG; restano come rumore numerico.
-    #     Preserva [N] senza punto (riferimenti bibliografici/note legittime).
-    n_form_before = len(re.findall(r"\[\d+\.\d+\]", text))
+
+def _t_remove_formula_labels(text: str) -> tuple[str, int]:
+    """Rimuovi label formule inline [N.M] — es. [3.4], [10.7]."""
+    n = len(re.findall(r"\[\d+\.\d+\]", text))
    text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text)
-    stats["n_formule_rimossi"] = n_form_before
+    return text, n

-    # 0b_pre. Rimuovi righe con dot-leader (voci di indice/sommario)
-    #     Esempi: "- 1.1 Alfabeto greco . . . . . . 1", "3.4 Continuità . . . . 205"
-    #     Pattern: almeno 3 occorrenze di ". " consecutive nella riga
-    # Cattura sia ". . . ." (spazi) sia "......." (punti continui, tipici dei TOC PDF)
+
+def _t_remove_dotleaders(text: str) -> tuple[str, int]:
+    """Rimuovi righe con dot-leader e numerali romani isolati (footer TOC)."""
    _DOTLEADER_RE = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$"
-    stats["n_dotleader_rimossi"] = len(
-        re.findall(_DOTLEADER_RE, text, re.MULTILINE)
-    )
+    n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE))
    text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE)
-
-    # 0b_pre2. Rimuovi righe che sono solo numerali romani (indicatori di pagina TOC)
-    #     Esempi: "i", "ii", "iii", "iv", "v" su riga isolata (footer pagine indice LaTeX)
-    #     Questi impedirebbero al transform 9 di rimuovere le entry TOC rimaste senza corpo.
    text = re.sub(
        r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$",
        "",
        text,
        flags=re.IGNORECASE,
    )
+    return text, n

-    # Flag documento: rilevamento sezioni esercizi (es. libri di testo accademici)
-    # Usato per disabilitare transform 4b che convertirebbe i numeri degli esercizi in header.
-    _has_exercise_sections = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))

-    # 0b. Fix header + body concatenati senza separatore
-    #     "##### 11 TitoloCorpodel testo..." → "##### 11 Titolo\n\nCorpo del testo..."
-    def _fix_header_concat(m: re.Match) -> str:
+def _t_fix_header_concat(text: str) -> tuple[str, int]:
+    """Fix header + body concatenati senza separatore."""
+    count = 0
+
+    def _fix(m: re.Match) -> str:
+        nonlocal count
        hashes = m.group(1)
        full = m.group(2).strip()
        if len(full) < 60:
            return m.group(0)
-        # Cerca split: lettera minuscola (incluse accentate) seguita da maiuscola
-        # Salta i primi ~10 char per non spezzare il numero della sezione
        skip = min(10, len(full) // 3)
        split = re.search(r"(?<=[a-zàèéìíòóùúä])(?=[A-ZÀÈÉÌÍÒÓÙÚ])", full[skip:])
        if split:
@@ -467,16 +433,17 @@ def apply_transforms(text: str) -> tuple[str, dict]:
            title = full[:pos].strip()
            body = full[pos:].strip()
            if len(title) >= 5 and len(body) >= 15:
-                stats["n_header_concat_fixati"] += 1
+                count += 1
                return f"{hashes} {title}\n\n{body}"
        return m.group(0)

-    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix_header_concat, text, flags=re.MULTILINE)
+    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
+    return text, count

-    # 0c. Estrai "Capitolo N: TITOLO" inline nel corpo del testo → ## header separato
-    #     "Capitolo 3: IL TITOLO DEL CAPITOLO - 16 Primo..."  → "## Capitolo 3: ..."
-    #     "Capitolo 1 : TITOLO CAPITOLO"                      → "## Capitolo 1: ..."
-    def _extract_capitolo(m: re.Match) -> str:
+
+def _t_extract_capitolo(text: str) -> tuple[str, int]:
+    """Estrai 'Capitolo N: TITOLO' inline nel corpo del testo → ## header."""
+    def _repl(m: re.Match) -> str:
        num = m.group(1)
        titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
        return f"\n\n## Capitolo {num}: {titolo}\n\n"
@@ -484,126 +451,124 @@ def apply_transforms(text: str) -> tuple[str, dict]:
    text = re.sub(
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-ZÀÈÉÌÍÒÓÙÚ\'L][A-ZÀÈÉÌÍÒÓÙÚ\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
-        _extract_capitolo,
+        _repl,
        text,
    )
+    return text, 0

-    # 0d. Normalizza header di sezione a livello uniforme ###
-    #     "#### N Titolo"  → "### N. Titolo"  (numerati: aggiunge punto)
-    #     "#### B) Titolo" → "### B) Titolo"  (lettera: solo cambio livello)
-    #     "#### "          → rimosso           (vuoti)
-    text = re.sub(
-        r"^#{3,6}\s*$",
-        "",
-        text,
-        flags=re.MULTILINE,
-    )
+
+def _t_normalize_header_levels(text: str) -> tuple[str, int]:
+    """Normalizza h4+ → h3; rimuove header vuoti."""
+    text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(
        r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
        lambda m: f"### {m.group(2)}. {m.group(3)}",
        text,
        flags=re.MULTILINE,
    )
-    text = re.sub(
-        r"^#{4,6}\s+(.+)$",
-        r"### \1",
-        text,
-        flags=re.MULTILINE,
-    )
+    text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE)
+    return text, 0

-    # 0e. Converti voci articolo "- Art. N. Titolo. Corpo" → "### Art. N. Titolo.\n\nCorpo"
-    #     Eseguito dopo la promozione h4+ → h3 (0d) per non duplicare Art. già header.
-    #     Eseguito prima del merge paragrafi (5): il boundary ### previene la fusione.
-    text, n_art = _extract_article_headers(text)
-    stats["n_articoli_estratti"] = n_art

-    # 1. Rimuovi **bold** negli header esistenti: ## **Titolo** → ## Titolo
+def _t_extract_articles(text: str) -> tuple[str, int]:
+    """Converti voci articolo '- Art. N.' → '### Art. N.'"""
+    return _extract_article_headers(text)
+
+
+def _t_remove_header_bold(text: str) -> tuple[str, int]:
+    """Rimuovi **bold** negli header esistenti."""
    text = re.sub(
        r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
        r"\1 \2",
        text, flags=re.MULTILINE,
    )
+    return text, 0

-    # 1b. Normalizza header ALL-CAPS → sentence-case
-    def _norm_allcaps_header(m: re.Match) -> str:
+
+def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
+    """Normalizza header ALL-CAPS → sentence-case."""
+    def _norm(m: re.Match) -> str:
        hashes, content = m.group(1), m.group(2).strip()
        letters = [c for c in content if c.isalpha()]
        if letters and all(c.isupper() for c in letters):
            return f"{hashes} {_sentence_case(content)}"
        return m.group(0)

-    text = re.sub(r"^(#{1,6}) (.+)$", _norm_allcaps_header, text, flags=re.MULTILINE)
+    text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE)
+    return text, 0

-    # 2. Rimuovi righe TOC: header "# Indice", "# Contents", ecc.
-    #     + le voci lista numeriche che seguono (TOC senza dot-leader, es. Nietzsche):
-    #       "- 1. Dei pregiudizi dei filosofi" → rimossa se viene subito dopo un header TOC.
-    #     Le voci con dot-leader sono già rimosse da 0b_pre.
-    #     Gli header rimasti senza corpo vengono poi eliminati dal transform 9.
+
+def _t_remove_toc(text: str) -> tuple[str, int]:
+    """Rimuovi header TOC e voci lista numerate che seguono."""
    lines = text.split("\n")
    new_lines = []
    _in_toc = False
+    removed = False
    for line in lines:
-        bare       = re.sub(r"^#+\s*", "", line.strip())
+        bare = re.sub(r"^#+\s*", "", line.strip())
        first_word = bare.split(".")[0].strip().lower()
        if first_word in _TOC_KEYWORDS:
-            stats["toc_rimosso"] = True
+            removed = True
            _in_toc = True
            continue
        if _in_toc:
-            # Salta righe vuote e voci lista numeriche (- N. Titolo / - N Titolo)
            if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                continue
            _in_toc = False
        new_lines.append(line)
-    text = "\n".join(new_lines)
+    return "\n".join(new_lines), 1 if removed else 0

-    # 3. Converti righe ALL-CAPS standalone → ## header
+
+def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
+    """Converti righe ALL-CAPS standalone → ## header."""
+    count = 0
    blocks = text.split("\n\n")
    new_blocks = []
    for block in blocks:
        stripped = block.strip()
        if "\n" not in stripped and _is_allcaps_line(stripped):
            new_blocks.append(_allcaps_to_header(stripped))
-            stats["n_header_allcaps"] += 1
+            count += 1
        else:
            sub_lines = block.split("\n")
            converted = []
            for ln in sub_lines:
                if _is_allcaps_line(ln) and len(ln.strip()) > 3:
                    converted.append(_allcaps_to_header(ln))
-                    stats["n_header_allcaps"] += 1
+                    count += 1
                else:
                    converted.append(ln)
            new_blocks.append("\n".join(converted))
-    text = "\n\n".join(new_blocks)
+    return "\n\n".join(new_blocks), count
+
+
+def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
+    """Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
+    count = 0

-    # 4. Converti sezioni numerate "N. testo" → "### N.\n\ntesto"
-    #     Guarda che il testo non sia una frase completa (es. esercizi numerati):
-    #     se termina con "." ed è più lungo di 40 caratteri, è probabilmente una frase,
-    #     non un titolo di sezione → lascia invariato.
    def _num_repl(m: re.Match) -> str:
+        nonlocal count
        content = m.group(2).strip()
        if content.endswith(".") and len(content) > 40:
            return m.group(0)
-        stats["n_sezioni_numerate"] += 1
+        count += 1
        return f"### {m.group(1)}.\n\n{content}"

    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)

    def _num_letter_repl(m: re.Match) -> str:
-        stats["n_sezioni_numerate"] += 1
+        nonlocal count
+        count += 1
        return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"

    text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)

-    # 4b. Converti "- N. testo" sezioni con punto → "### N.\n\ntesto"
-    #     "- 1. Testo del primo punto..."  → "### 1.\n\nTesto del primo punto..."
-    #     Deve precedere 4c: "- N." ha il punto, "- N testo" no.
-    #     Disabilitato se il documento contiene sezioni "Esercizi": in quel caso i
-    #     "- N. testo" sono numerazioni di esercizi, non header di sezione.
-    if not _has_exercise_sections:
+    # Disabilitato se il documento contiene sezioni "Esercizi": in quel caso i
+    # "- N. testo" sono numerazioni di esercizi, non header di sezione.
+    if not has_exercises:
        def _aphorism_repl(m: re.Match) -> str:
-            stats["n_sezioni_numerate"] += 1
+            nonlocal count
+            count += 1
            return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}"

        text = re.sub(
@@ -613,22 +578,17 @@ def apply_transforms(text: str) -> tuple[str, dict]:
            flags=re.MULTILINE,
        )

-    # 4c. Converti "- N testo" list item numerati → "### N.\n\ntesto"
-    #     "- 12 Titolo sezione Corpo della sezione..." → "### 12. Titolo sezione\n\nCorpo..."
-    #     Non tocca "- a) testo", "- 1) testo" (già gestiti come liste)
    def _list_section_repl(m: re.Match) -> str:
+        nonlocal count
        num = m.group(1)
        content = m.group(2).strip()
-        stats["n_sezioni_numerate"] += 1
-        # Separa titolo da corpo: il titolo finisce dove una lettera minuscola
-        # è seguita da spazio e maiuscola (confine fine-titolo / inizio-corpo)
+        count += 1
        split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content)
        if split and split.start() >= 3:
            title = content[: split.start()].strip()
-            body = content[split.end() :].strip()
+            body = content[split.end():].strip()
            if len(body) >= 20:
                return f"\n\n### {num}. {title}\n\n{body}"
-        # Nessun body inline: il content è solo il titolo
        return f"\n\n### {num}. {content}"

    text = re.sub(
@@ -637,16 +597,20 @@ def apply_transforms(text: str) -> tuple[str, dict]:
        text,
        flags=re.MULTILINE,
    )
+    return text, count

-    # 4d. Converti ambienti matematici (Teorema/Definizione/...) → ### header
-    #     Eseguito prima del merge paragrafi (5) per sfruttare i blocchi intatti.
-    text, n_math = _extract_math_environments(text)
-    stats["n_ambienti_matematici"] = n_math

-    # 5. Unisci paragrafi spezzati da salti pagina PDF
+def _t_extract_math(text: str) -> tuple[str, int]:
+    """Converti ambienti matematici (Teorema/Definizione/...) → ### header."""
+    return _extract_math_environments(text)
+
+
+def _t_merge_paragraphs(text: str) -> tuple[str, int]:
+    """Unisci paragrafi spezzati da salti pagina PDF."""
    _SENTENCE_END = set(".?!»)\"'")
    blocks = text.split("\n\n")
    merged = []
+    count = 0
    i = 0
    while i < len(blocks):
        b = blocks[i]
@@ -662,30 +626,38 @@ def apply_transforms(text: str) -> tuple[str, dict]:
                break
            b = stripped + " " + nxt
            stripped = b.strip()
-            stats["n_paragrafi_uniti"] += 1
+            count += 1
            i += 1
        merged.append(b)
        i += 1
    text = "\n\n".join(merged)
-
-    # Secondo pass: rimuovi prefisso |---| eventualmente rimasto dopo il merge paragrafi
+    # Secondo pass: rimuovi prefisso |---| eventualmente rimasto dopo il merge
    text = re.sub(r"(?m)^\|---\|\s*", "", text)
+    return text, count

-    # 6. Normalizza whitespace multiplo interno alle righe
+
+def _t_normalize_whitespace(text: str) -> tuple[str, int]:
+    """Normalizza whitespace multiplo interno alle righe."""
    lines = text.split("\n")
    text = "\n".join(
        re.sub(r"  +", " ", line) if line.strip() else line
        for line in lines
    )
+    return text, 0

-    # 7. Riduci righe vuote multiple a doppie
-    text = re.sub(r"\n{3,}", "\n\n", text)

-    # 8. Rimuovi righe che sono solo URL (watermark, footer di piattaforme)
-    text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
+def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
+    """Riduci righe vuote multiple a doppie."""
+    return re.sub(r"\n{3,}", "\n\n", text), 0

-    # 9. Rimuovi header senza corpo: header seguito solo da righe vuote e poi
-    #    da un altro header o dalla fine del testo (sezioni vuote / watermark)
+
+def _t_remove_urls(text: str) -> tuple[str, int]:
+    """Rimuovi righe che sono solo URL (watermark, footer di piattaforme)."""
+    return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0
+
+
+def _t_remove_empty_headers(text: str) -> tuple[str, int]:
+    """Rimuovi header senza corpo (sezioni vuote / watermark)."""
    blocks = re.split(r"\n{2,}", text)
    cleaned = []
    for i, block in enumerate(blocks):
@@ -693,48 +665,45 @@ def apply_transforms(text: str) -> tuple[str, dict]:
        if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped:
            next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
            if not next_stripped or re.match(r"^#{1,6} ", next_stripped):
-                continue  # header senza corpo → scarta
+                continue
        cleaned.append(block)
-    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0

-    # 9b. Fondi header numerici isolati con il sottotitolo breve successivo
-    #     "### N.\n\nSottotitolo" → "### N. Sottotitolo"  (es. parti Nietzsche)
-    text, n_titoli = _merge_title_headers(text)
-    stats["n_titoli_uniti"] = n_titoli

-    # 9c. Rimuovi garbage headers: header ### senza parole reali o con solo
-    #     abbreviazioni matematiche.  Esempi: "### ( vm)", "### #", "### ! =",
-    #     "### (am)", "### 2. Il valore di hf si deter- mina risolvendo mg(h!hf)"
-    #     Questi nascono da espressioni matematiche scambiate per titoli di sezione.
-    #     Il corpo rimane nel testo e viene accorpato alla sezione precedente.
+def _t_merge_title_headers(text: str) -> tuple[str, int]:
+    """Fondi header numerici isolati con il sottotitolo breve successivo."""
+    return _merge_title_headers(text)
+
+
+def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
+    """Rimuovi garbage headers: simboli, abbreviazioni matematiche, frammenti formula."""
    def _is_garbage_header(content: str) -> bool:
-        # Header con prefisso "..." — frammento di formula (es. "...Di", "...vi")
        if content.lstrip().startswith("..."):
            return True
-        # Nessuna sequenza alfabetica ≥ 2 char
        if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content):
            return True
-        # Abbreviazione corta in parentesi opzionali: "(vm)", "( am)", "(am)"
        if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
            return True
-        # Header molto lungo (>60ch) con artefatti formula inline
        if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
            return True
        return False

+    count = 0
    lines = text.split("\n")
    new_lines = []
    for line in lines:
        m = re.match(r"^#{1,6} (.+)$", line)
        if m and _is_garbage_header(m.group(1)):
-            stats["n_garbage_headers_rimossi"] += 1
+            count += 1
            continue
        new_lines.append(line)
    text = "\n".join(new_lines)
    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text, count

-    # 9d. Rimuovi sezioni frontmatter: header senza numero + corpo corto con
-    #     URL, email, affiliazione, copyright, edizione — metadati non-contenuto.
+
+def _t_remove_frontmatter(text: str) -> tuple[str, int]:
+    """Rimuovi sezioni frontmatter: URL, email, affiliazione, copyright."""
    _FM_RE = re.compile(
        r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|"
        r"\bCopyright\b|\bLicenza\b|\bEdizione\b|"
@@ -743,20 +712,69 @@ def apply_transforms(text: str) -> tuple[str, dict]:
    )
    blocks = re.split(r"\n{2,}", text)
    cleaned = []
+    count = 0
    for i, block in enumerate(blocks):
        stripped = block.strip()
        if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
            cleaned.append(block)
            continue
        body = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
-        is_fm_body  = len(body) < 250 and _FM_RE.search(body)
-        is_fm_hdr   = _FM_RE.search(stripped)
+        is_fm_body = len(body) < 250 and _FM_RE.search(body)
+        is_fm_hdr = _FM_RE.search(stripped)
        if is_fm_body or is_fm_hdr:
-            stats["n_frontmatter_rimossi"] += 1
+            count += 1
            continue
        cleaned.append(block)
-    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count

+
+# ─── [3b] Pipeline delle trasformazioni ──────────────────────────────────────
+
+def apply_transforms(text: str) -> tuple[str, dict]:
+    """
+    Applica le trasformazioni strutturali al Markdown grezzo.
+    Restituisce (testo_modificato, statistiche).
+    """
+    # Flag calcolato prima del loop, sul testo grezzo (prima del refactor: dopo la rimozione
+    # dei dot-leader). Disabilita il transform 4b se compare "Esercizi" (i "- N. testo" sarebbero numerazioni, non header).
+    _has_ex = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
+
+    _transforms: list[tuple[str | None, object]] = [
+        ("n_immagini_rimosse",          _t_remove_images),
+        ("n_br_rimossi",                _t_fix_br),
+        ("n_tabsep_rimossi",            _t_fix_tabsep),
+        ("n_accenti_corretti",          _t_fix_accents),
+        ("n_moltiplicazioni_corrette",  _t_fix_multiplication),
+        ("n_micro_corretti",            _t_fix_micro),
+        ("n_formule_rimossi",           _t_remove_formula_labels),
+        ("n_dotleader_rimossi",         _t_remove_dotleaders),
+        ("n_header_concat_fixati",      _t_fix_header_concat),
+        (None,                          _t_extract_capitolo),
+        (None,                          _t_normalize_header_levels),
+        ("n_articoli_estratti",         _t_extract_articles),
+        (None,                          _t_remove_header_bold),
+        (None,                          _t_normalize_allcaps_headers),
+        ("toc_rimosso",                 _t_remove_toc),
+        ("n_header_allcaps",            _t_allcaps_to_headers),
+        ("n_sezioni_numerate",          partial(_t_numbered_sections, has_exercises=_has_ex)),
+        ("n_ambienti_matematici",       _t_extract_math),
+        ("n_paragrafi_uniti",           _t_merge_paragraphs),
+        (None,                          _t_normalize_whitespace),
+        (None,                          _t_collapse_blank_lines),
+        (None,                          _t_remove_urls),
+        (None,                          _t_remove_empty_headers),
+        ("n_titoli_uniti",              _t_merge_title_headers),
+        ("n_garbage_headers_rimossi",   _t_remove_garbage_headers),
+        ("n_frontmatter_rimossi",       _t_remove_frontmatter),
+    ]
+
+    stats: dict = {}
+    for stat_key, fn in _transforms:
+        text, n = fn(text)
+        if stat_key:
+            stats[stat_key] = stats.get(stat_key, 0) + n
+
+    stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0))
    return text, stats


@@ -802,6 +820,26 @@ def _split_sections(text: str, level: int) -> list[str]:
    return [p for p in parts[1:] if p.strip()]


+def _parse_sections_with_body(text: str, level: int = 3) -> list[tuple[str, str]]:
+    """Restituisce lista di (header_line, body_text) per tutti gli header al livello dato."""
+    prefix = "#" * level + " "
+    lines = text.split("\n")
+    sections: list[tuple[str, str]] = []
+    cur_hdr: str | None = None
+    cur_body: list[str] = []
+    for line in lines:
+        if line.startswith(prefix):
+            if cur_hdr is not None:
+                sections.append((cur_hdr, "\n".join(cur_body).strip()))
+            cur_hdr = line
+            cur_body = []
+        elif cur_hdr is not None:
+            cur_body.append(line)
+    if cur_hdr is not None:
+        sections.append((cur_hdr, "\n".join(cur_body).strip()))
+    return sections
+
+
 def analyze(md_path: Path) -> dict:
    text = md_path.read_text(encoding="utf-8")
    n_h1 = _count_headers(text, 1)
@@ -869,20 +907,7 @@ def build_report(
    text_lines = clean_text.split("\n")

    # ── Raccolta sezioni ### con corpo ────────────────────────────────────
-    sections: list[tuple[str, str]] = []
-    cur_hdr: str | None = None
-    cur_body: list[str] = []
-    for line in text_lines:
-        if re.match(r"^### ", line):
-            if cur_hdr is not None:
-                sections.append((cur_hdr, "\n".join(cur_body).strip()))
-            cur_hdr = line
-            cur_body = []
-        elif cur_hdr is not None:
-            cur_body.append(line)
-    if cur_hdr is not None:
-        sections.append((cur_hdr, "\n".join(cur_body).strip()))
-
+    sections = _parse_sections_with_body(clean_text, 3)
    lengths = [len(body) for _, body in sections]

    # ── Distribuzione lunghezze ───────────────────────────────────────────
@@ -901,9 +926,6 @@ def build_report(
    }

    # ── Anomalie ──────────────────────────────────────────────────────────
-    # Header solo-numero senza corpo sostanziale: anomalia solo se il corpo
-    # è vuoto o < 30 chars. Un body lungo è una sezione numerata legittima
-    # (es. aforismi numerati dove il numero è l'identificatore della sezione).
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections