diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 099acec..e207b28 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -30,6 +30,7 @@ import re
import subprocess
import sys
import tempfile
+from collections import Counter
from datetime import datetime
from functools import partial
from pathlib import Path
@@ -378,6 +379,31 @@ def _t_remove_images(text: str) -> tuple[str, int]:
return text, n
+# Unicode superscript digits: ¹²³⁴⁵⁶⁷⁸⁹⁰
+_SUPERSCRIPT_RE = re.compile(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+')
+# Footnote-body line: starts with a superscript run or [N] (1-3 digits), followed by whitespace
+_FOOTNOTE_BODY_RE = re.compile(
+ r'^([\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+\s+|\[\d{1,3}\]\s+)'
+)
+
+
+def _t_remove_footnotes(text: str) -> tuple[str, int]:
+ """Drop footnote-body lines and strip inline superscript markers; return (new_text, n_lines_dropped_or_changed)."""
+ lines = text.split("\n")
+ result, count = [], 0
+ for line in lines:
+ stripped = line.strip()
+ # Footnote body: non-empty line under 300 chars that starts with a superscript run or [N] plus whitespace
+ if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300:
+ count += 1  # NOTE(review): '[12] Author, Title' reference-list lines also match — confirm they should be dropped
+ continue
+ cleaned = _SUPERSCRIPT_RE.sub("", line)  # NOTE(review): also strips exponents written as superscripts (x², m³) — confirm
+ if cleaned != line:
+ count += 1
+ result.append(cleaned)
+ return "\n".join(result), count
+
+
def _t_fix_br(text: str) -> tuple[str, int]:
n = len(re.findall(r"
", text, re.IGNORECASE))
text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE)
@@ -585,6 +611,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
if _in_toc:
if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
continue
+ # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
+ if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
+ continue
_in_toc = False
new_lines.append(line)
return "\n".join(new_lines), 1 if removed else 0
@@ -637,6 +666,13 @@ def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
return "\n\n".join(new_blocks), count
+_BIB_MARKERS_RE = re.compile(  # bibliographic cues; _t_numbered_sections leaves matching lines unconverted
+ r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN\b|DOI\b|arXiv\b)'  # no \b after '.': "pp. 12" has no boundary there
+ r'|\b(19|20)\d{2}\b',  # or a standalone four-digit year 19xx / 20xx
+ re.IGNORECASE,
+)
+
+
def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
"""Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
count = 0
@@ -646,6 +682,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
content = m.group(2).strip()
if content.endswith(".") and len(content) > 40:
return m.group(0)
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
count += 1
return f"### {m.group(1)}.\n\n{content}"
@@ -912,8 +950,14 @@ def _t_remove_frontmatter(text: str) -> tuple[str, int]:
blocks = re.split(r"\n{2,}", text)
cleaned = []
count = 0
+ total = len(blocks)
+ cutoff = max(5, min(15, int(total * 0.20)))
for i, block in enumerate(blocks):
stripped = block.strip()
+ # Frontmatter compare solo nelle prime sezioni del documento
+ if i >= cutoff:
+ cleaned.append(block)
+ continue
if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
cleaned.append(block)
continue
@@ -959,15 +1003,14 @@ def _t_fix_math_symbols(text: str) -> tuple[str, int]:
def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
- """Rimuovi righe corte che si ripetono ≥3 volte (header/footer di pagina)."""
- from collections import Counter
+ """Rimuovi righe corte che si ripetono ≥5 volte (header/footer di pagina)."""
lines = text.split("\n")
short_lines = [
ln.strip() for ln in lines
if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
]
freq = Counter(short_lines)
- recurring = {ln for ln, c in freq.items() if c >= 3}
+ recurring = {ln for ln, c in freq.items() if c >= 5}
if not recurring:
return text, 0
result, count = [], 0
@@ -994,6 +1037,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
("n_immagini_rimosse", _t_remove_images),
("n_br_rimossi", _t_fix_br),
("n_tabsep_rimossi", _t_fix_tabsep),
+ ("n_note_rimosse", _t_remove_footnotes),
("n_accenti_corretti", _t_fix_accents),
("n_moltiplicazioni_corrette", _t_fix_multiplication),
("n_micro_corretti", _t_fix_micro),
@@ -1009,7 +1053,6 @@ def apply_transforms(text: str) -> tuple[str, dict]:
(None, _t_remove_header_bold),
(None, _t_normalize_allcaps_headers),
("toc_rimosso", _t_remove_toc),
- ("n_toc_page_list_rimossi", _t_remove_toc_page_list),
("n_header_allcaps", _t_allcaps_to_headers),
("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)),
("n_ambienti_matematici", _t_extract_math),
@@ -1344,6 +1387,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
clean_text, t_stats = apply_transforms(raw_text)
reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
print(f" ✅ Immagini rimosse: {t_stats['n_immagini_rimosse']}")
+ print(f" Note rimossa: {t_stats['n_note_rimosse']}")
print(f" Accenti corretti: {t_stats['n_accenti_corretti']}")
print(f" Dot-leader rimossi: {t_stats['n_dotleader_rimossi']}")
print(f" Header concat fixati: {t_stats['n_header_concat_fixati']}")
@@ -1352,7 +1396,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
print(f" Ambienti matematici: {t_stats['n_ambienti_matematici']}")
print(f" Titoli header uniti: {t_stats['n_titoli_uniti']}")
print(f" TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}")
- print(f" TOC voci pagina rim.: {t_stats['n_toc_page_list_rimossi']}")
print(f" Versi poesia riprist.: {t_stats['n_versi_ripristinati']}")
print(f" Header verso demotati: {t_stats['n_header_verso_demotati']}")
print(f" ALL-CAPS → ##: {t_stats['n_header_allcaps']}")