diff --git a/conversione/pipeline.py b/conversione/pipeline.py index e207b28..eedf436 100644 --- a/conversione/pipeline.py +++ b/conversione/pipeline.py @@ -238,8 +238,9 @@ def _extract_math_environments(text: str) -> tuple[str, int]: Deve girare PRIMA del merge paragrafi (step 5) per sfruttare i blocchi intatti. """ _ENVS = ( - r"Definizione|Teorema|Lemma|Proposizione|" - r"Corollario|Osservazione|Nota|Esempio" + r"Definizione|Definition|Teorema|Theorem|Lemma|" + r"Proposizione|Proposition|Corollario|Corollary|" + r"Osservazione|Remark|Nota|Note|Esempio|Example" ) count = 0 blocks = text.split("\n\n") @@ -373,6 +374,127 @@ def _extract_article_headers(text: str) -> tuple[str, int]: # ─── [3a] Funzioni di trasformazione ───────────────────────────────────────── +# Mapping PUA Unicode (U+F020-U+F0FF) → simboli corretti per font Symbol/Wingdings. +# Il font Symbol di Windows codifica lettere greche e operatori matematici nel +# range Private Use Area invece dei codepoint Unicode standard. +_SYMBOL_PUA_MAP: dict[str, str] = { + "\uf020": " ", # space + "\uf028": "(", + "\uf029": ")", + "\uf02b": "+", + "\uf02d": "\u2212", # minus + "\uf02e": ".", + "\uf02f": "/", + "\uf030": "0", "\uf031": "1", "\uf032": "2", "\uf033": "3", "\uf034": "4", + "\uf035": "5", "\uf036": "6", "\uf037": "7", "\uf038": "8", "\uf039": "9", + "\uf03a": ":", "\uf03b": ";", "\uf03c": "<", "\uf03d": "=", "\uf03e": ">", + "\uf040": "\u2245", # congruent + "\uf041": "\u0391", # Alpha + "\uf042": "\u0392", # Beta + "\uf043": "\u03a7", # Chi + "\uf044": "\u0394", # Delta + "\uf045": "\u0395", # Epsilon + "\uf046": "\u03a6", # Phi + "\uf047": "\u0393", # Gamma + "\uf048": "\u0397", # Eta + "\uf049": "\u0399", # Iota + "\uf04a": "\u03d1", # theta variant + "\uf04b": "\u039a", # Kappa + "\uf04c": "\u039b", # Lambda + "\uf04d": "\u039c", # Mu + "\uf04e": "\u039d", # Nu + "\uf04f": "\u039f", # Omicron + "\uf050": "\u03a0", # Pi + "\uf051": "\u0398", # Theta + "\uf052": "\u03a1", # Rho + "\uf053": "\u03a3", # Sigma + "\uf054": "\u03a4", # Tau + "\uf055": "\u03a5", # Upsilon + "\uf056": "\u03c2", # sigma final + "\uf057": "\u03a9", # Omega + "\uf058": "\u039e", # Xi + "\uf059": "\u03a8", # Psi + "\uf05a": "\u0396", # Zeta + "\uf05b": "[", + "\uf05c": "\u2234", # therefore + "\uf05d": "]", + "\uf05e": "\u22a5", # perpendicular + "\uf061": "\u03b1", # alpha + "\uf062": "\u03b2", # beta + "\uf063": "\u03c7", # chi + "\uf064": "\u03b4", # delta + "\uf065": "\u03b5", # epsilon + "\uf066": "\u03c6", # phi + "\uf067": "\u03b3", # gamma + "\uf068": "\u03b7", # eta + "\uf069": "\u03b9", # iota + "\uf06a": "\u03d5", # phi variant + "\uf06b": "\u03ba", # kappa + "\uf06c": "\u03bb", # lambda + "\uf06d": "\u03bc", # mu + "\uf06e": "\u03bd", # nu + "\uf06f": "\u03bf", # omicron + "\uf070": "\u03c0", # pi + "\uf071": "\u03b8", # theta + "\uf072": "\u03c1", # rho + "\uf073": "\u03c3", # sigma + "\uf074": "\u03c4", # tau + "\uf075": "\u03c5", # upsilon + "\uf076": "\u03d6", # pi symbol + "\uf077": "\u03c9", # omega + "\uf078": "\u03be", # xi + "\uf079": "\u03c8", # psi + "\uf07a": "\u03b6", # zeta + "\uf07b": "{", + "\uf07c": "|", + "\uf07d": "}", + "\uf07e": "~", + "\uf0b1": "\u00b1", # plus-minus + "\uf0b7": "\u2022", # bullet + "\uf0ba": "\u221a", # square root + "\uf0bc": "\u2264", # less or equal + "\uf0bd": "\u2265", # greater or equal + "\uf0be": "\u221d", # proportional + "\uf0d7": "\u00d7", # multiplication + "\uf0f7": "\u00f7", # division + "\uf0b4": "\u00d7", # alternate multiply + "\uf0bb": "\u2260", # not equal + "\uf0b9": "\u2260", # not equal alternate + "\uf0b3": "\u2265", # greater or equal alternate + "\uf0b2": "\u2032", # prime + "\uf02a": "*", + "\uf02c": ",", + "\uf0a3": "\u2264", # less or equal (Symbol 0xA3) + "\uf0a7": "\u2022", # bullet (Wingdings 0xA7) + "\uf0a8": "\u2022", # bullet variant + "\uf0ae": "\u2192", # right arrow (Symbol 0xAE) + "\uf0b8": "\u00f7", # division / range separator + "\uf0eb": "", # Wingdings decorative icon (rimosso) + "\uf0f0": "\u2192", # right arrow variant + "\uf0db": "", # bracket extension piece (non ricostruibile) + "\uf0dc": "", # bracket extension piece + "\uf0dd": "", # bracket extension piece + "\uf0de": "", # brace middle piece (non ricostruibile) + "\uf0df": "", # brace extension piece +} + +_SYMBOL_PUA_RE = re.compile( + "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" +) + + +def _t_fix_symbol_font(text: str) -> tuple[str, int]: + """Rimappa caratteri PUA font Symbol (U+F020-U+F0FF) in simboli Unicode corretti.""" + count = [0] + + def _repl(m: re.Match) -> str: + count[0] += 1 + return _SYMBOL_PUA_MAP[m.group(0)] + + result = _SYMBOL_PUA_RE.sub(_repl, text) + return result, count[0] + + def _t_remove_images(text: str) -> tuple[str, int]: n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) @@ -555,7 +677,7 @@ def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: def _t_normalize_header_levels(text: str) -> tuple[str, int]: - """Normalizza h4+ → h3; rimuove header vuoti.""" + """Normalizza h4+ → h3; rimuove header vuoti; rimuove numero pagina '| N' finale.""" text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) text = re.sub( r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", @@ -611,37 +733,19 @@ def _t_remove_toc(text: str) -> tuple[str, int]: if _in_toc: if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): continue - # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC) + # Voce TOC con numero pagina finale (sicuro: siamo gia in contesto TOC) if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): continue + # Riga di testo lungo = probabilmente abstract o corpo, non voce di indice + if len(line.strip()) > 200: + _in_toc = False + new_lines.append(line) + continue _in_toc = False new_lines.append(line) return "\n".join(new_lines), 1 if removed else 0 -def _t_remove_toc_page_list(text: str) -> tuple[str, int]: - """Rimuovi voci lista TOC con numero di pagina finale. - - Intercetta indici come '- Canto I 1', '- Canto XXIX 119' (eventualmente - fusi su una riga: '- Canto XXIX 119 - Canto XXX 123') che opendataloader - non separa dall'indice del PDF. - """ - count = 0 - lines = text.split("\n") - new_lines = [] - for line in lines: - stripped = line.strip() - # Voce TOC fusa: "- X N - Y M" — le separiamo e le scartiamo entrambe - if re.match(r"^\s*-\s+.{2,50}\s+\d{1,4}\s+-\s+.{2,50}\s+\d{1,4}\s*$", stripped): - count += 2 - continue - # Voce TOC semplice: "- Testo ... NN" dove NN è un numero pagina isolato - if re.match(r"^\s*-\s+\S.{1,60}\s+\d{1,4}\s*$", stripped): - count += 1 - continue - new_lines.append(line) - return "\n".join(new_lines), count - def _t_allcaps_to_headers(text: str) -> tuple[str, int]: """Converti righe ALL-CAPS standalone → ## header.""" @@ -701,8 +805,11 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i if not has_exercises: def _aphorism_repl(m: re.Match) -> str: nonlocal count + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) count += 1 - return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}" + return f"\n\n### {m.group(1)}.\n\n{content}" text = re.sub( r"^-\s+(\d{1,3})\.\s+(.{10,})$", @@ -715,6 +822,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i nonlocal count num = m.group(1) content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) count += 1 split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content) if split and split.start() >= 3: @@ -756,7 +865,7 @@ def _t_merge_paragraphs(text: str) -> tuple[str, int]: and stripped[-1] not in _SENTENCE_END ): nxt = blocks[i + 1].strip() - if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt): + if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt) or re.match(r"^[-*+]\s", nxt): break b = stripped + " " + nxt stripped = b.strip() @@ -912,17 +1021,22 @@ def _t_remove_garbage_headers(text: str) -> tuple[str, int]: def _is_garbage_header(content: str) -> bool: if content.lstrip().startswith("..."): return True - if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content): + if not re.search(r"[A-Za-zÀ-ÿ\u0391-\u03c9]{2,}", content): return True if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()): return True if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content): return True - # Frammento di frase: inizia con minuscola ed è abbastanza lungo - # (testo spezzato dalla tabella che opendataloader ha promosso a heading) + # Frammento di frase: inizia con minuscola ed e abbastanza lungo first_alpha = next((c for c in content if c.isalpha()), None) if first_alpha and first_alpha.islower() and len(content) > 40: return True + # Formula matematica: variabile singola (o breve) seguita da = o operatore + if re.match(r"^[A-Za-z\u0391-\u03c9_]{1,3}\s*[=<>≤≥]", content.strip()): + return True + # Didascalia figura/tabella: "Figura N..." o "Figure N..." o "Tabella N..." + if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE): + return True return False count = 0 @@ -1007,7 +1121,9 @@ def _t_remove_recurring_lines(text: str) -> tuple[str, int]: lines = text.split("\n") short_lines = [ ln.strip() for ln in lines - if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#") + if 3 < len(ln.strip()) < 80 + and not ln.strip().startswith("#") + and not ln.strip().startswith("|") ] freq = Counter(short_lines) recurring = {ln for ln, c in freq.items() if c >= 5} @@ -1031,9 +1147,10 @@ def apply_transforms(text: str) -> tuple[str, dict]: """ # Flag calcolato prima del loop: disabilita il transform 4b nei documenti # con sezioni "Esercizi" (i "- N. testo" sarebbero numerazioni, non header). - _has_ex = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE)) + _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)) _transforms: list[tuple[str | None, object]] = [ + ("n_simboli_pua_corretti", _t_fix_symbol_font), ("n_immagini_rimosse", _t_remove_images), ("n_br_rimossi", _t_fix_br), ("n_tabsep_rimossi", _t_fix_tabsep), @@ -1064,6 +1181,7 @@ def apply_transforms(text: str) -> tuple[str, dict]: (None, _t_remove_urls), (None, _t_remove_empty_headers), ("n_titoli_uniti", _t_merge_title_headers), + (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)), ("n_garbage_headers_rimossi", _t_remove_garbage_headers), ("n_frontmatter_rimossi", _t_remove_frontmatter), ("n_watermark_rimossi", _t_remove_watermarks), @@ -1286,13 +1404,15 @@ def build_report( return hits residui = { - "backtick": _scan(r"`"), - "dotleader": _scan(r"(?:\. ){3,}"), - "url": _scan(r"^(https?://|www\.)\S+"), - "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"), - "br_inline": _scan(r"
"), - "simboli_encoding":_scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'), - "formule_inline": _scan(r"\[\d+\.\d+\]"), + "backtick": _scan(r"`"), + "dotleader": _scan(r"(?:\. ){3,}"), + "url": _scan(r"^(https?://|www\.)\S+"), + "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"), + "br_inline": _scan(r"
"), + "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'), + "formule_inline": _scan(r"\[\d+\.\d+\]"), + "footnote_markers": _scan(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]'), + "pua_markers": _scan(r'[\ue000-\uf8ff]'), } # ── Composizione report ─────────────────────────────────────────────── @@ -1321,13 +1441,17 @@ def build_report( "br_inline": len(residui["br_inline"]), "simboli_encoding": len(residui["simboli_encoding"]), "formule_inline": len(residui["formule_inline"]), - "backtick_esempi": residui["backtick"], - "dotleader_esempi": residui["dotleader"], - "url_esempi": residui["url"], - "immagini_esempi": residui["immagini"], - "br_inline_esempi": residui["br_inline"], - "simboli_encoding_esempi": residui["simboli_encoding"], - "formule_inline_esempi": residui["formule_inline"], + "footnote_markers": len(residui["footnote_markers"]), + "pua_markers": len(residui["pua_markers"]), + "backtick_esempi": residui["backtick"], + "dotleader_esempi": residui["dotleader"], + "url_esempi": residui["url"], + "immagini_esempi": residui["immagini"], + "br_inline_esempi": residui["br_inline"], + "simboli_encoding_esempi": residui["simboli_encoding"], + "formule_inline_esempi": residui["formule_inline"], + "footnote_markers_esempi": residui["footnote_markers"], + "pua_markers_esempi": residui["pua_markers"], }, } @@ -1386,7 +1510,8 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(" [3/4] Pulizia strutturale...") clean_text, t_stats = apply_transforms(raw_text) reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0 - print(f" ✅ Immagini rimosse: {t_stats['n_immagini_rimosse']}") + print(f" ✅ Simboli PUA corretti: {t_stats['n_simboli_pua_corretti']}") + print(f" Immagini rimosse: {t_stats['n_immagini_rimosse']}") print(f" Note rimossa: {t_stats['n_note_rimosse']}") print(f" Accenti corretti: {t_stats['n_accenti_corretti']}") print(f" Dot-leader rimossi: {t_stats['n_dotleader_rimossi']}") diff --git a/conversione/validate.py b/conversione/validate.py index b9d71be..f2c1ead 100644 --- a/conversione/validate.py +++ b/conversione/validate.py @@ -86,6 +86,8 @@ def _score(r: dict) -> tuple[int, list[str]]: _pen("br_inline", 2, 15, "
inline") _pen("simboli_encoding", 1, 10, "simboli encoding") _pen("formule_inline", 1, 8, "formule inline") + _pen("footnote_markers", 1, 8, "footnote residui") + _pen("pua_markers", 2, 20, "caratteri PUA font Symbol") # ── Anomalie ────────────────────────────────────────────────────────── n_bare = anomalie.get("bare_headers", 0)