# Unicode superscript digits ⁰¹²³⁴⁵⁶⁷⁸⁹
# (U+2070, U+00B9, U+00B2, U+00B3, U+2074–U+2079).
_SUPERSCRIPT_RE = re.compile(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+')
# Footnote body line: starts with a superscript run or "[N]", then whitespace.
_FOOTNOTE_BODY_RE = re.compile(
    r'^([\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+\s+|\[\d{1,3}\]\s+)'
)
# Lines this long or longer are never dropped as a footnote body.
_FOOTNOTE_BODY_MAX_LEN = 300


def _strip_footnote_marker(match: re.Match) -> str:
    """Return "" for a footnote marker, or the run unchanged for an exponent.

    A superscript run is kept when it is attached to a digit (``10⁶``) or to
    a token of one or two letters (``x²``, ``mc²``, ``cm³``): those are
    exponents or units, and removing them changes the meaning of the text.
    Any other run (after a longer word, punctuation, whitespace, or at the
    start of the line) is treated as a footnote marker and removed.

    Known limitation: exponents after a closing bracket or a function name
    (``(a+b)²``, ``sin²``) are still removed.
    """
    head = match.string[:match.start()]
    last = head[-1:]
    if last.isdigit():
        return match.group(0)
    tail = head[-3:]
    # Last char is a letter but the last three are not all letters
    # => the trailing letter run is 1 or 2 characters long.
    in_long_word = len(tail) == 3 and tail.isalpha()
    if last.isalpha() and not in_long_word:
        return match.group(0)
    return ""


def _t_remove_footnotes(text: str) -> tuple[str, int]:
    """Remove inline superscript footnote markers and footnote body lines.

    Args:
        text: Document text; processed line by line (split on "\\n").

    Returns:
        A ``(new_text, count)`` tuple. ``count`` is the number of lines that
        were either dropped as footnote bodies or had at least one inline
        marker removed.
    """
    result, count = [], 0
    for line in text.split("\n"):
        stripped = line.strip()
        # Footnote body: short line starting with a superscript run or "[N]".
        if (
            len(stripped) < _FOOTNOTE_BODY_MAX_LEN
            and _FOOTNOTE_BODY_RE.match(stripped)
        ):
            count += 1
            continue
        cleaned = _SUPERSCRIPT_RE.sub(_strip_footnote_marker, line)
        if cleaned != line:
            count += 1
        result.append(cleaned)
    return "\n".join(result), count
", text, re.IGNORECASE)) text = re.sub(r"
# Markers suggesting that a numbered line is a bibliographic entry rather
# than a section title: page/volume/issue/edition abbreviations, identifier
# keywords, or a four-digit year in 1900–2099.
#
# The abbreviations end in "." (or in a single digit for "n. N"), so they
# must NOT be followed by \b: between "." and a space there is no word
# boundary, which would make "pp. 45", "vol. 3" or "ed. Einaudi" never
# match, and "n. 35" would fail on the second digit. Only the plain keywords
# (ISBN, DOI, arXiv) need the trailing boundary.
# Capture groups are unchanged: group 1 = marker, group 2 = century prefix.
_BIB_MARKERS_RE = re.compile(
    r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|(?:ISBN|DOI|arXiv)\b)'
    r'|\b(19|20)\d{2}\b',
    re.IGNORECASE,
)
text.split("\n") short_lines = [ ln.strip() for ln in lines if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#") ] freq = Counter(short_lines) - recurring = {ln for ln, c in freq.items() if c >= 3} + recurring = {ln for ln, c in freq.items() if c >= 5} if not recurring: return text, 0 result, count = [], 0 @@ -994,6 +1037,7 @@ def apply_transforms(text: str) -> tuple[str, dict]: ("n_immagini_rimosse", _t_remove_images), ("n_br_rimossi", _t_fix_br), ("n_tabsep_rimossi", _t_fix_tabsep), + ("n_note_rimosse", _t_remove_footnotes), ("n_accenti_corretti", _t_fix_accents), ("n_moltiplicazioni_corrette", _t_fix_multiplication), ("n_micro_corretti", _t_fix_micro), @@ -1009,7 +1053,6 @@ def apply_transforms(text: str) -> tuple[str, dict]: (None, _t_remove_header_bold), (None, _t_normalize_allcaps_headers), ("toc_rimosso", _t_remove_toc), - ("n_toc_page_list_rimossi", _t_remove_toc_page_list), ("n_header_allcaps", _t_allcaps_to_headers), ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)), ("n_ambienti_matematici", _t_extract_math), @@ -1344,6 +1387,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool: clean_text, t_stats = apply_transforms(raw_text) reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0 print(f" ✅ Immagini rimosse: {t_stats['n_immagini_rimosse']}") + print(f" Note rimossa: {t_stats['n_note_rimosse']}") print(f" Accenti corretti: {t_stats['n_accenti_corretti']}") print(f" Dot-leader rimossi: {t_stats['n_dotleader_rimossi']}") print(f" Header concat fixati: {t_stats['n_header_concat_fixati']}") @@ -1352,7 +1396,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" Ambienti matematici: {t_stats['n_ambienti_matematici']}") print(f" Titoli header uniti: {t_stats['n_titoli_uniti']}") print(f" TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}") - print(f" TOC voci pagina rim.: {t_stats['n_toc_page_list_rimossi']}") print(f" Versi poesia riprist.: 
{t_stats['n_versi_ripristinati']}") print(f" Header verso demotati: {t_stats['n_header_verso_demotati']}") print(f" ALL-CAPS → ##: {t_stats['n_header_allcaps']}")