fix(conversione): 5 fix robustezza e precisione transform

- _t_remove_footnotes: rimuove marcatori superscript inline e righe corpo-nota (¹ testo, [N] testo) — nuovo transform in posizione early - _t_numbered_sections: esclude voci bibliografiche (anno, pp., vol., DOI, ISBN) dalla promozione a ### header - _t_remove_toc: intercetta voci con numero pagina finale nel contesto TOC — rimosso _t_remove_toc_page_list standalone - _t_remove_frontmatter: limitata alle prime ~20% sezioni del documento - _t_remove_recurring_lines: soglia 3->5, Counter spostato a top-level
2026-04-17 12:06:19 +02:00
parent 0a8d98279c
commit ef8f56fdba
1 changed files with 48 additions and 5 deletions
@@ -30,6 +30,7 @@ import re
 import subprocess
 import sys
 import tempfile
+from collections import Counter
 from datetime import datetime
 from functools import partial
 from pathlib import Path
@@ -378,6 +379,31 @@ def _t_remove_images(text: str) -> tuple[str, int]:
    return text, n


+# Superscript Unicode: ¹²³⁴⁵⁶⁷⁸⁹⁰
+_SUPERSCRIPT_RE = re.compile(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+')
+# Riga corpo-nota: inizia con superscript o [N]
+_FOOTNOTE_BODY_RE = re.compile(
+    r'^([\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+\s+|\[\d{1,3}\]\s+)'
+)
+
+
+def _t_remove_footnotes(text: str) -> tuple[str, int]:
+    """Rimuovi marcatori footnote superscript inline e righe corpo-nota."""
+    lines = text.split("\n")
+    result, count = [], 0
+    for line in lines:
+        stripped = line.strip()
+        # Corpo nota: riga breve che inizia con ¹ o [N]
+        if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300:
+            count += 1
+            continue
+        cleaned = _SUPERSCRIPT_RE.sub("", line)
+        if cleaned != line:
+            count += 1
+        result.append(cleaned)
+    return "\n".join(result), count
+
+
 def _t_fix_br(text: str) -> tuple[str, int]:
    n = len(re.findall(r"<br>", text, re.IGNORECASE))
    text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
@@ -585,6 +611,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
        if _in_toc:
            if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                continue
+            # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
+            if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
+                continue
            _in_toc = False
        new_lines.append(line)
    return "\n".join(new_lines), 1 if removed else 0
@@ -637,6 +666,13 @@ def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
    return "\n\n".join(new_blocks), count


+_BIB_MARKERS_RE = re.compile(
+    r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
+    r'|\b(19|20)\d{2}\b',
+    re.IGNORECASE,
+)
+
+
 def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
    """Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
    count = 0
@@ -646,6 +682,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
        content = m.group(2).strip()
        if content.endswith(".") and len(content) > 40:
            return m.group(0)
+        if _BIB_MARKERS_RE.search(content):
+            return m.group(0)
        count += 1
        return f"### {m.group(1)}.\n\n{content}"

@@ -912,8 +950,14 @@ def _t_remove_frontmatter(text: str) -> tuple[str, int]:
    blocks = re.split(r"\n{2,}", text)
    cleaned = []
    count = 0
+    total = len(blocks)
+    cutoff = max(5, min(15, int(total * 0.20)))
    for i, block in enumerate(blocks):
        stripped = block.strip()
+        # Frontmatter compare solo nelle prime sezioni del documento
+        if i >= cutoff:
+            cleaned.append(block)
+            continue
        if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
            cleaned.append(block)
            continue
@@ -959,15 +1003,14 @@ def _t_fix_math_symbols(text: str) -> tuple[str, int]:


 def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
-    """Rimuovi righe corte che si ripetono ≥3 volte (header/footer di pagina)."""
-    from collections import Counter
+    """Rimuovi righe corte che si ripetono ≥5 volte (header/footer di pagina)."""
    lines = text.split("\n")
    short_lines = [
        ln.strip() for ln in lines
        if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
    ]
    freq = Counter(short_lines)
-    recurring = {ln for ln, c in freq.items() if c >= 3}
+    recurring = {ln for ln, c in freq.items() if c >= 5}
    if not recurring:
        return text, 0
    result, count = [], 0
@@ -994,6 +1037,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
        ("n_immagini_rimosse",          _t_remove_images),
        ("n_br_rimossi",                _t_fix_br),
        ("n_tabsep_rimossi",            _t_fix_tabsep),
+        ("n_note_rimosse",              _t_remove_footnotes),
        ("n_accenti_corretti",          _t_fix_accents),
        ("n_moltiplicazioni_corrette",  _t_fix_multiplication),
        ("n_micro_corretti",            _t_fix_micro),
@@ -1009,7 +1053,6 @@ def apply_transforms(text: str) -> tuple[str, dict]:
        (None,                          _t_remove_header_bold),
        (None,                          _t_normalize_allcaps_headers),
        ("toc_rimosso",                 _t_remove_toc),
-        ("n_toc_page_list_rimossi",     _t_remove_toc_page_list),
        ("n_header_allcaps",            _t_allcaps_to_headers),
        ("n_sezioni_numerate",          partial(_t_numbered_sections, has_exercises=_has_ex)),
        ("n_ambienti_matematici",       _t_extract_math),
@@ -1344,6 +1387,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    clean_text, t_stats = apply_transforms(raw_text)
    reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
    print(f"  ✅ Immagini rimosse:      {t_stats['n_immagini_rimosse']}")
+    print(f"     Note rimossa:          {t_stats['n_note_rimosse']}")
    print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
    print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
    print(f"     Header concat fixati:  {t_stats['n_header_concat_fixati']}")
@@ -1352,7 +1396,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    print(f"     Ambienti matematici:   {t_stats['n_ambienti_matematici']}")
    print(f"     Titoli header uniti:   {t_stats['n_titoli_uniti']}")
    print(f"     TOC rimosso:           {'sì' if t_stats['toc_rimosso'] else 'no'}")
-    print(f"     TOC voci pagina rim.:  {t_stats['n_toc_page_list_rimossi']}")
    print(f"     Versi poesia riprist.: {t_stats['n_versi_ripristinati']}")
    print(f"     Header verso demotati: {t_stats['n_header_verso_demotati']}")
    print(f"     ALL-CAPS → ##:         {t_stats['n_header_allcaps']}")