diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index e207b28..eedf436 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -238,8 +238,9 @@ def _extract_math_environments(text: str) -> tuple[str, int]:
Deve girare PRIMA del merge paragrafi (step 5) per sfruttare i blocchi intatti.
"""
_ENVS = (
- r"Definizione|Teorema|Lemma|Proposizione|"
- r"Corollario|Osservazione|Nota|Esempio"
+ r"Definizione|Definition|Teorema|Theorem|Lemma|"
+ r"Proposizione|Proposition|Corollario|Corollary|"
+ r"Osservazione|Remark|Nota|Note|Esempio|Example"
)
count = 0
blocks = text.split("\n\n")
@@ -373,6 +374,127 @@ def _extract_article_headers(text: str) -> tuple[str, int]:
# ─── [3a] Funzioni di trasformazione ─────────────────────────────────────────
+# Maps Private Use Area code points (U+F020-U+F0FF) to proper Unicode symbols for the Symbol/Wingdings fonts.
+# The Windows Symbol font encodes Greek letters and math operators in the
+# Private Use Area range instead of at their standard Unicode code points.
+_SYMBOL_PUA_MAP: dict[str, str] = {
+ "\uf020": " ", # space
+ "\uf028": "(",
+ "\uf029": ")",
+ "\uf02b": "+",
+ "\uf02d": "\u2212", # minus
+ "\uf02e": ".",
+ "\uf02f": "/",
+ "\uf030": "0", "\uf031": "1", "\uf032": "2", "\uf033": "3", "\uf034": "4",
+ "\uf035": "5", "\uf036": "6", "\uf037": "7", "\uf038": "8", "\uf039": "9",
+ "\uf03a": ":", "\uf03b": ";", "\uf03c": "<", "\uf03d": "=", "\uf03e": ">",
+ "\uf040": "\u2245", # congruent
+ "\uf041": "\u0391", # Alpha
+ "\uf042": "\u0392", # Beta
+ "\uf043": "\u03a7", # Chi
+ "\uf044": "\u0394", # Delta
+ "\uf045": "\u0395", # Epsilon
+ "\uf046": "\u03a6", # Phi
+ "\uf047": "\u0393", # Gamma
+ "\uf048": "\u0397", # Eta
+ "\uf049": "\u0399", # Iota
+ "\uf04a": "\u03d1", # theta variant
+ "\uf04b": "\u039a", # Kappa
+ "\uf04c": "\u039b", # Lambda
+ "\uf04d": "\u039c", # Mu
+ "\uf04e": "\u039d", # Nu
+ "\uf04f": "\u039f", # Omicron
+ "\uf050": "\u03a0", # Pi
+ "\uf051": "\u0398", # Theta
+ "\uf052": "\u03a1", # Rho
+ "\uf053": "\u03a3", # Sigma
+ "\uf054": "\u03a4", # Tau
+ "\uf055": "\u03a5", # Upsilon
+ "\uf056": "\u03c2", # sigma final
+ "\uf057": "\u03a9", # Omega
+ "\uf058": "\u039e", # Xi
+ "\uf059": "\u03a8", # Psi
+ "\uf05a": "\u0396", # Zeta
+ "\uf05b": "[",
+ "\uf05c": "\u2234", # therefore
+ "\uf05d": "]",
+ "\uf05e": "\u22a5", # perpendicular
+ "\uf061": "\u03b1", # alpha
+ "\uf062": "\u03b2", # beta
+ "\uf063": "\u03c7", # chi
+ "\uf064": "\u03b4", # delta
+ "\uf065": "\u03b5", # epsilon
+ "\uf066": "\u03c6", # phi
+ "\uf067": "\u03b3", # gamma
+ "\uf068": "\u03b7", # eta
+ "\uf069": "\u03b9", # iota
+ "\uf06a": "\u03d5", # phi variant
+ "\uf06b": "\u03ba", # kappa
+ "\uf06c": "\u03bb", # lambda
+ "\uf06d": "\u03bc", # mu
+ "\uf06e": "\u03bd", # nu
+ "\uf06f": "\u03bf", # omicron
+ "\uf070": "\u03c0", # pi
+ "\uf071": "\u03b8", # theta
+ "\uf072": "\u03c1", # rho
+ "\uf073": "\u03c3", # sigma
+ "\uf074": "\u03c4", # tau
+ "\uf075": "\u03c5", # upsilon
+ "\uf076": "\u03d6", # pi symbol
+ "\uf077": "\u03c9", # omega
+ "\uf078": "\u03be", # xi
+ "\uf079": "\u03c8", # psi
+ "\uf07a": "\u03b6", # zeta
+ "\uf07b": "{",
+ "\uf07c": "|",
+ "\uf07d": "}",
+ "\uf07e": "~",
+ "\uf0b1": "\u00b1", # plus-minus
+ "\uf0b7": "\u2022", # bullet
+ "\uf0ba": "\u221a", # square root; NOTE(review): Adobe Symbol 0xBA is equivalence (U+2261), radical is 0xD6 - confirm source font
+ "\uf0bc": "\u2264", # less or equal; NOTE(review): Adobe Symbol 0xBC is ellipsis (U+2026) - confirm source font
+ "\uf0bd": "\u2265", # greater or equal; NOTE(review): Adobe Symbol 0xBD is the vertical arrow extender - confirm source font
+ "\uf0be": "\u221d", # proportional; NOTE(review): Adobe Symbol has proportional at 0xB5, 0xBE is the horizontal arrow extender - confirm
+ "\uf0d7": "\u00d7", # multiplication; NOTE(review): Adobe Symbol 0xD7 is dotmath (U+22C5), this follows Latin-1 instead - confirm
+ "\uf0f7": "\u00f7", # division; NOTE(review): Adobe Symbol 0xF7 is a right-parenthesis extender, this follows Latin-1 instead - confirm
+ "\uf0b4": "\u00d7", # alternate multiply
+ "\uf0bb": "\u2260", # not equal; NOTE(review): Adobe Symbol 0xBB is approxequal (U+2248) - confirm source font
+ "\uf0b9": "\u2260", # not equal alternate
+ "\uf0b3": "\u2265", # greater or equal alternate
+ "\uf0b2": "\u2032", # prime; NOTE(review): Adobe Symbol 0xB2 is second/double prime (U+2033), prime is 0xA2 - confirm
+ "\uf02a": "*",
+ "\uf02c": ",",
+ "\uf0a3": "\u2264", # less or equal (Symbol 0xA3)
+ "\uf0a7": "\u2022", # bullet (Wingdings 0xA7)
+ "\uf0a8": "\u2022", # bullet variant
+ "\uf0ae": "\u2192", # right arrow (Symbol 0xAE)
+ "\uf0b8": "\u00f7", # division / range separator
+ "\uf0eb": "", # Wingdings decorative icon (removed); NOTE(review): in Adobe Symbol 0xEB is the bottom piece of a left bracket
+ "\uf0f0": "\u2192", # right arrow variant
+ "\uf0db": "", # bracket extension piece (cannot be reconstructed); NOTE(review): Adobe Symbol 0xDB is U+21D4 (double arrow both) - confirm
+ "\uf0dc": "", # bracket extension piece; NOTE(review): Adobe Symbol 0xDC is U+21D0 (double arrow left) - confirm
+ "\uf0dd": "", # bracket extension piece; NOTE(review): Adobe Symbol 0xDD is U+21D1 (double arrow up) - confirm
+ "\uf0de": "", # brace middle piece (cannot be reconstructed); NOTE(review): Adobe Symbol 0xDE is U+21D2 (implies) - confirm
+ "\uf0df": "", # brace extension piece; NOTE(review): Adobe Symbol 0xDF is U+21D3 (double arrow down) - confirm
+}
+
+# One character class matching any single key of _SYMBOL_PUA_MAP; each key is
+# passed through re.escape before being placed inside the brackets.
+_SYMBOL_PUA_RE = re.compile(f"[{''.join(map(re.escape, _SYMBOL_PUA_MAP))}]")
+
+
+def _t_fix_symbol_font(text: str) -> tuple[str, int]:
+    """Replace every character listed in _SYMBOL_PUA_MAP with its mapped string.
+
+    Returns the rewritten text together with the number of characters
+    that were replaced.
+    """
+    # subn performs the substitution and reports the match count in one
+    # pass, so no separate counter is needed.
+    fixed, n_hits = _SYMBOL_PUA_RE.subn(lambda hit: _SYMBOL_PUA_MAP[hit.group(0)], text)
+    return fixed, n_hits
+
+
def _t_remove_images(text: str) -> tuple[str, int]:
n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
@@ -555,7 +677,7 @@ def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
def _t_normalize_header_levels(text: str) -> tuple[str, int]:
- """Normalizza h4+ → h3; rimuove header vuoti."""
+ """Normalizza h4+ → h3; rimuove header vuoti; rimuove numero pagina '| N' finale."""
text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
text = re.sub(
r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
@@ -611,37 +733,19 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
if _in_toc:
if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
continue
- # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
+ # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
continue
+ # Riga di testo lungo = probabilmente abstract o corpo, non voce di indice
+ if len(line.strip()) > 200:
+ _in_toc = False
+ new_lines.append(line)
+ continue
_in_toc = False
new_lines.append(line)
return "\n".join(new_lines), 1 if removed else 0
-def _t_remove_toc_page_list(text: str) -> tuple[str, int]:
- """Rimuovi voci lista TOC con numero di pagina finale.
-
- Intercetta indici come '- Canto I 1', '- Canto XXIX 119' (eventualmente
- fusi su una riga: '- Canto XXIX 119 - Canto XXX 123') che opendataloader
- non separa dall'indice del PDF.
- """
- count = 0
- lines = text.split("\n")
- new_lines = []
- for line in lines:
- stripped = line.strip()
- # Voce TOC fusa: "- X N - Y M" — le separiamo e le scartiamo entrambe
- if re.match(r"^\s*-\s+.{2,50}\s+\d{1,4}\s+-\s+.{2,50}\s+\d{1,4}\s*$", stripped):
- count += 2
- continue
- # Voce TOC semplice: "- Testo ... NN" dove NN è un numero pagina isolato
- if re.match(r"^\s*-\s+\S.{1,60}\s+\d{1,4}\s*$", stripped):
- count += 1
- continue
- new_lines.append(line)
- return "\n".join(new_lines), count
-
def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
"""Converti righe ALL-CAPS standalone → ## header."""
@@ -701,8 +805,11 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
if not has_exercises:
def _aphorism_repl(m: re.Match) -> str:
nonlocal count
+ content = m.group(2).strip()
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
count += 1
- return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}"
+ return f"\n\n### {m.group(1)}.\n\n{content}"
text = re.sub(
r"^-\s+(\d{1,3})\.\s+(.{10,})$",
@@ -715,6 +822,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
nonlocal count
num = m.group(1)
content = m.group(2).strip()
+ if _BIB_MARKERS_RE.search(content):
+ return m.group(0)
count += 1
split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content)
if split and split.start() >= 3:
@@ -756,7 +865,7 @@ def _t_merge_paragraphs(text: str) -> tuple[str, int]:
and stripped[-1] not in _SENTENCE_END
):
nxt = blocks[i + 1].strip()
- if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt):
+ if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt) or re.match(r"^[-*+]\s", nxt):
break
b = stripped + " " + nxt
stripped = b.strip()
@@ -912,17 +1021,22 @@ def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
def _is_garbage_header(content: str) -> bool:
if content.lstrip().startswith("..."):
return True
- if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content):
+ if not re.search(r"[A-Za-zÀ-ÿ\u0391-\u03c9]{2,}", content):
return True
if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
return True
if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
return True
- # Frammento di frase: inizia con minuscola ed è abbastanza lungo
- # (testo spezzato dalla tabella che opendataloader ha promosso a heading)
+ # Frammento di frase: inizia con minuscola ed è abbastanza lungo
first_alpha = next((c for c in content if c.isalpha()), None)
if first_alpha and first_alpha.islower() and len(content) > 40:
return True
+ # Formula matematica: variabile singola (o breve) seguita da = o operatore
+ if re.match(r"^[A-Za-z\u0391-\u03c9_]{1,3}\s*[=<>≤≥]", content.strip()):
+ return True
+ # Didascalia figura/tabella: "Figura N..." o "Figure N..." o "Tabella N..."
+ if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE):
+ return True
return False
count = 0
@@ -1007,7 +1121,9 @@ def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
lines = text.split("\n")
short_lines = [
ln.strip() for ln in lines
- if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
+ if 3 < len(ln.strip()) < 80
+ and not ln.strip().startswith("#")
+ and not ln.strip().startswith("|")
]
freq = Counter(short_lines)
recurring = {ln for ln, c in freq.items() if c >= 5}
@@ -1031,9 +1147,10 @@ def apply_transforms(text: str) -> tuple[str, dict]:
"""
# Flag calcolato prima del loop: disabilita il transform 4b nei documenti
# con sezioni "Esercizi" (i "- N. testo" sarebbero numerazioni, non header).
- _has_ex = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
+ _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE))
_transforms: list[tuple[str | None, object]] = [
+ ("n_simboli_pua_corretti", _t_fix_symbol_font),
("n_immagini_rimosse", _t_remove_images),
("n_br_rimossi", _t_fix_br),
("n_tabsep_rimossi", _t_fix_tabsep),
@@ -1064,6 +1181,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
(None, _t_remove_urls),
(None, _t_remove_empty_headers),
("n_titoli_uniti", _t_merge_title_headers),
+ (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)),
("n_garbage_headers_rimossi", _t_remove_garbage_headers),
("n_frontmatter_rimossi", _t_remove_frontmatter),
("n_watermark_rimossi", _t_remove_watermarks),
@@ -1286,13 +1404,15 @@ def build_report(
return hits
residui = {
- "backtick": _scan(r"`"),
- "dotleader": _scan(r"(?:\. ){3,}"),
- "url": _scan(r"^(https?://|www\.)\S+"),
- "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"),
- "br_inline": _scan(r"
"),
- "simboli_encoding":_scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
- "formule_inline": _scan(r"\[\d+\.\d+\]"),
+ "backtick": _scan(r"`"),
+ "dotleader": _scan(r"(?:\. ){3,}"),
+ "url": _scan(r"^(https?://|www\.)\S+"),
+ "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"),
+ "br_inline": _scan(r"
"),
+ "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
+ "formule_inline": _scan(r"\[\d+\.\d+\]"),
+ "footnote_markers": _scan(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]'),
+ "pua_markers": _scan(r'[\ue000-\uf8ff]'),
}
# ── Composizione report ───────────────────────────────────────────────
@@ -1321,13 +1441,17 @@ def build_report(
"br_inline": len(residui["br_inline"]),
"simboli_encoding": len(residui["simboli_encoding"]),
"formule_inline": len(residui["formule_inline"]),
- "backtick_esempi": residui["backtick"],
- "dotleader_esempi": residui["dotleader"],
- "url_esempi": residui["url"],
- "immagini_esempi": residui["immagini"],
- "br_inline_esempi": residui["br_inline"],
- "simboli_encoding_esempi": residui["simboli_encoding"],
- "formule_inline_esempi": residui["formule_inline"],
+ "footnote_markers": len(residui["footnote_markers"]),
+ "pua_markers": len(residui["pua_markers"]),
+ "backtick_esempi": residui["backtick"],
+ "dotleader_esempi": residui["dotleader"],
+ "url_esempi": residui["url"],
+ "immagini_esempi": residui["immagini"],
+ "br_inline_esempi": residui["br_inline"],
+ "simboli_encoding_esempi": residui["simboli_encoding"],
+ "formule_inline_esempi": residui["formule_inline"],
+ "footnote_markers_esempi": residui["footnote_markers"],
+ "pua_markers_esempi": residui["pua_markers"],
},
}
@@ -1386,7 +1510,8 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
print(" [3/4] Pulizia strutturale...")
clean_text, t_stats = apply_transforms(raw_text)
reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
- print(f" ✅ Immagini rimosse: {t_stats['n_immagini_rimosse']}")
+ print(f" ✅ Simboli PUA corretti: {t_stats['n_simboli_pua_corretti']}")
+ print(f" Immagini rimosse: {t_stats['n_immagini_rimosse']}")
print(f" Note rimossa: {t_stats['n_note_rimosse']}")
print(f" Accenti corretti: {t_stats['n_accenti_corretti']}")
print(f" Dot-leader rimossi: {t_stats['n_dotleader_rimossi']}")
diff --git a/conversione/validate.py b/conversione/validate.py
index b9d71be..f2c1ead 100644
--- a/conversione/validate.py
+++ b/conversione/validate.py
@@ -86,6 +86,8 @@ def _score(r: dict) -> tuple[int, list[str]]:
_pen("br_inline", 2, 15, "
inline")
_pen("simboli_encoding", 1, 10, "simboli encoding")
_pen("formule_inline", 1, 8, "formule inline")
+ _pen("footnote_markers", 1, 8, "footnote residui")
+ _pen("pua_markers", 2, 20, "caratteri PUA font Symbol")
# ── Anomalie ──────────────────────────────────────────────────────────
n_bare = anomalie.get("bare_headers", 0)