fix(conversione): 5 fix robustezza e precisione transform

- _t_remove_footnotes: rimuove marcatori superscript inline e righe
  corpo-nota (¹ testo, [N] testo) — nuovo transform in posizione early
- _t_numbered_sections: esclude voci bibliografiche (anno, pp., vol.,
  DOI, ISBN) dalla promozione a ### header
- _t_remove_toc: intercetta voci con numero pagina finale nel contesto
  TOC — rimosso _t_remove_toc_page_list standalone
- _t_remove_frontmatter: limitata alle prime ~20% sezioni del documento
- _t_remove_recurring_lines: soglia 3->5, Counter spostato a top-level
This commit is contained in:
2026-04-17 12:06:19 +02:00
parent 0a8d98279c
commit ef8f56fdba
+48 -5
View File
@@ -30,6 +30,7 @@ import re
import subprocess
import sys
import tempfile
from collections import Counter
from datetime import datetime
from functools import partial
from pathlib import Path
@@ -378,6 +379,31 @@ def _t_remove_images(text: str) -> tuple[str, int]:
return text, n
# Superscript Unicode: ¹²³⁴⁵⁶⁷⁸⁹⁰
_SUPERSCRIPT_RE = re.compile(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+')
# Riga corpo-nota: inizia con superscript o [N]
_FOOTNOTE_BODY_RE = re.compile(
r'^([\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+\s+|\[\d{1,3}\]\s+)'
)
def _t_remove_footnotes(text: str) -> tuple[str, int]:
"""Rimuovi marcatori footnote superscript inline e righe corpo-nota."""
lines = text.split("\n")
result, count = [], 0
for line in lines:
stripped = line.strip()
# Corpo nota: riga breve che inizia con ¹ o [N]
if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300:
count += 1
continue
cleaned = _SUPERSCRIPT_RE.sub("", line)
if cleaned != line:
count += 1
result.append(cleaned)
return "\n".join(result), count
def _t_fix_br(text: str) -> tuple[str, int]:
n = len(re.findall(r"<br>", text, re.IGNORECASE))
text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
@@ -585,6 +611,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
if _in_toc:
if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
continue
# Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
continue
_in_toc = False
new_lines.append(line)
return "\n".join(new_lines), 1 if removed else 0
@@ -637,6 +666,13 @@ def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
return "\n\n".join(new_blocks), count
_BIB_MARKERS_RE = re.compile(
r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
r'|\b(19|20)\d{2}\b',
re.IGNORECASE,
)
def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
"""Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
count = 0
@@ -646,6 +682,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
content = m.group(2).strip()
if content.endswith(".") and len(content) > 40:
return m.group(0)
if _BIB_MARKERS_RE.search(content):
return m.group(0)
count += 1
return f"### {m.group(1)}.\n\n{content}"
@@ -912,8 +950,14 @@ def _t_remove_frontmatter(text: str) -> tuple[str, int]:
blocks = re.split(r"\n{2,}", text)
cleaned = []
count = 0
total = len(blocks)
cutoff = max(5, min(15, int(total * 0.20)))
for i, block in enumerate(blocks):
stripped = block.strip()
# Frontmatter compare solo nelle prime sezioni del documento
if i >= cutoff:
cleaned.append(block)
continue
if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
cleaned.append(block)
continue
@@ -959,15 +1003,14 @@ def _t_fix_math_symbols(text: str) -> tuple[str, int]:
def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
"""Rimuovi righe corte che si ripetono ≥3 volte (header/footer di pagina)."""
from collections import Counter
"""Rimuovi righe corte che si ripetono ≥5 volte (header/footer di pagina)."""
lines = text.split("\n")
short_lines = [
ln.strip() for ln in lines
if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
]
freq = Counter(short_lines)
recurring = {ln for ln, c in freq.items() if c >= 3}
recurring = {ln for ln, c in freq.items() if c >= 5}
if not recurring:
return text, 0
result, count = [], 0
@@ -994,6 +1037,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
("n_immagini_rimosse", _t_remove_images),
("n_br_rimossi", _t_fix_br),
("n_tabsep_rimossi", _t_fix_tabsep),
("n_note_rimosse", _t_remove_footnotes),
("n_accenti_corretti", _t_fix_accents),
("n_moltiplicazioni_corrette", _t_fix_multiplication),
("n_micro_corretti", _t_fix_micro),
@@ -1009,7 +1053,6 @@ def apply_transforms(text: str) -> tuple[str, dict]:
(None, _t_remove_header_bold),
(None, _t_normalize_allcaps_headers),
("toc_rimosso", _t_remove_toc),
("n_toc_page_list_rimossi", _t_remove_toc_page_list),
("n_header_allcaps", _t_allcaps_to_headers),
("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)),
("n_ambienti_matematici", _t_extract_math),
@@ -1344,6 +1387,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
clean_text, t_stats = apply_transforms(raw_text)
reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
print(f" ✅ Immagini rimosse: {t_stats['n_immagini_rimosse']}")
print(f" Note rimossa: {t_stats['n_note_rimosse']}")
print(f" Accenti corretti: {t_stats['n_accenti_corretti']}")
print(f" Dot-leader rimossi: {t_stats['n_dotleader_rimossi']}")
print(f" Header concat fixati: {t_stats['n_header_concat_fixati']}")
@@ -1352,7 +1396,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
print(f" Ambienti matematici: {t_stats['n_ambienti_matematici']}")
print(f" Titoli header uniti: {t_stats['n_titoli_uniti']}")
print(f" TOC rimosso: {'' if t_stats['toc_rimosso'] else 'no'}")
print(f" TOC voci pagina rim.: {t_stats['n_toc_page_list_rimossi']}")
print(f" Versi poesia riprist.: {t_stats['n_versi_ripristinati']}")
print(f" Header verso demotati: {t_stats['n_header_verso_demotati']}")
print(f" ALL-CAPS → ##: {t_stats['n_header_allcaps']}")