diff --git a/.gitignore b/.gitignore index 4ff0772..0334ca9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ Thumbs.db # Output conversione/ — generati da conversione/pipeline.py conversione/*/ !conversione/_pipeline/ +!conversione/_pipeline/transforms !conversione/_pipeline/** # Output chunks/ — generati da chunks/chunker.py e chunks/verify_chunks.py diff --git a/conversione/_pipeline/__pycache__/__init__.cpython-312.pyc b/conversione/_pipeline/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..add25a1 Binary files /dev/null and b/conversione/_pipeline/__pycache__/__init__.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/checker.cpython-312.pyc b/conversione/_pipeline/__pycache__/checker.cpython-312.pyc new file mode 100644 index 0000000..11cf3f0 Binary files /dev/null and b/conversione/_pipeline/__pycache__/checker.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/converter.cpython-312.pyc b/conversione/_pipeline/__pycache__/converter.cpython-312.pyc new file mode 100644 index 0000000..249b953 Binary files /dev/null and b/conversione/_pipeline/__pycache__/converter.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/deps.cpython-312.pyc b/conversione/_pipeline/__pycache__/deps.cpython-312.pyc new file mode 100644 index 0000000..9b1e055 Binary files /dev/null and b/conversione/_pipeline/__pycache__/deps.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/report.cpython-312.pyc b/conversione/_pipeline/__pycache__/report.cpython-312.pyc new file mode 100644 index 0000000..1eabfcc Binary files /dev/null and b/conversione/_pipeline/__pycache__/report.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/runner.cpython-312.pyc b/conversione/_pipeline/__pycache__/runner.cpython-312.pyc new file mode 100644 index 0000000..b91242f Binary files /dev/null and b/conversione/_pipeline/__pycache__/runner.cpython-312.pyc differ diff --git 
a/conversione/_pipeline/__pycache__/structure.cpython-312.pyc b/conversione/_pipeline/__pycache__/structure.cpython-312.pyc new file mode 100644 index 0000000..569c0d0 Binary files /dev/null and b/conversione/_pipeline/__pycache__/structure.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/transforms.cpython-312.pyc b/conversione/_pipeline/__pycache__/transforms.cpython-312.pyc new file mode 100644 index 0000000..2839153 Binary files /dev/null and b/conversione/_pipeline/__pycache__/transforms.cpython-312.pyc differ diff --git a/conversione/_pipeline/__pycache__/validator.cpython-312.pyc b/conversione/_pipeline/__pycache__/validator.cpython-312.pyc new file mode 100644 index 0000000..e38ffd4 Binary files /dev/null and b/conversione/_pipeline/__pycache__/validator.cpython-312.pyc differ diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py index 7eb02dc..125aeb9 100644 --- a/conversione/_pipeline/runner.py +++ b/conversione/_pipeline/runner.py @@ -71,6 +71,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" Ambienti matematici: {t['n_ambienti_matematici']}") print(f" Titoli header uniti: {t['n_titoli_uniti']}") print(f" TOC rimosso: {'sì' if t['toc_rimosso'] else 'no'}") + print(f" TOC orfani rimossi: {t['n_toc_orfani_rimossi']}") print(f" Versi poesia riprist.: {t['n_versi_ripristinati']}") print(f" Header verso demotati: {t['n_header_verso_demotati']}") print(f" ALL-CAPS → ##: {t['n_header_allcaps']}") diff --git a/conversione/_pipeline/transforms.py b/conversione/_pipeline/transforms.py deleted file mode 100644 index 1c6a7cd..0000000 --- a/conversione/_pipeline/transforms.py +++ /dev/null @@ -1,974 +0,0 @@ -import re -from collections import Counter -from functools import partial - -# ─── Costanti ──────────────────────────────────────────────────────────────── - -_TOC_KEYWORDS = frozenset([ - "indice", "index", "contents", "table of contents", - "sommario", "inhaltsverzeichnis", "inhalt", - 
"indice generale", "indice analitico", "indice dei contenuti", - "elenco dei capitoli", "argomenti", "table des matières", - "tabla de contenidos", "содержание", -]) - -_ORDINALS_IT = { - "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV", - "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII", - "NONO": "IX", "DECIMO": "X", -} -_ORDINALS_EN = { - "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5", - "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10", -} - -# Mapping PUA Unicode (U+F020-U+F0FF) → simboli Unicode standard. -# Font Symbol di Windows codifica lettere greche e operatori matematici -# nel range Private Use Area invece dei codepoint Unicode standard. -_SYMBOL_PUA_MAP: dict[str, str] = { - "": " ", - "": "(", - "": ")", - "": "+", - "": "−", # minus - "": ".", - "": "/", - "": "0", "": "1", "": "2", "": "3", "": "4", - "": "5", "": "6", "": "7", "": "8", "": "9", - "": ":", "": ";", "": "<", "": "=", "": ">", - "": "≅", # congruent - "": "Α", # Alpha - "": "Β", # Beta - "": "Χ", # Chi - "": "Δ", # Delta - "": "Ε", # Epsilon - "": "Φ", # Phi - "": "Γ", # Gamma - "": "Η", # Eta - "": "Ι", # Iota - "": "ϑ", # theta variant - "": "Κ", # Kappa - "": "Λ", # Lambda - "": "Μ", # Mu - "": "Ν", # Nu - "": "Ο", # Omicron - "": "Π", # Pi - "": "Θ", # Theta - "": "Ρ", # Rho - "": "Σ", # Sigma - "": "Τ", # Tau - "": "Υ", # Upsilon - "": "ς", # sigma final - "": "Ω", # Omega - "": "Ξ", # Xi - "": "Ψ", # Psi - "": "Ζ", # Zeta - "": "[", - "": "∴", # therefore - "": "]", - "": "⊥", # perpendicular - "": "α", # alpha - "": "β", # beta - "": "χ", # chi - "": "δ", # delta - "": "ε", # epsilon - "": "φ", # phi - "": "γ", # gamma - "": "η", # eta - "": "ι", # iota - "": "ϕ", # phi variant - "": "κ", # kappa - "": "λ", # lambda - "": "μ", # mu - "": "ν", # nu - "": "ο", # omicron - "": "π", # pi - "": "θ", # theta - "": "ρ", # rho - "": "σ", # sigma - "": 
"τ", # tau - "": "υ", # upsilon - "": "ϖ", # pi symbol - "": "ω", # omega - "": "ξ", # xi - "": "ψ", # psi - "": "ζ", # zeta - "": "{", - "": "|", - "": "}", - "": "~", - "": "±", # plus-minus - "": "•", # bullet - "": "√", # square root - "": "≤", # less or equal - "": "≥", # greater or equal - "": "∝", # proportional - "": "×", # multiplication - "": "÷", # division - "": "×", # alternate multiply - "": "≠", # not equal - "": "≠", # not equal alternate - "": "≥", # greater or equal alternate - "": "′", # prime - "": "*", - "": ",", - "": "≤", # less or equal (Symbol 0xA3) - "": "•", # bullet (Wingdings 0xA7) - "": "•", # bullet variant - "": "→", # right arrow (Symbol 0xAE) - "": "÷", # division / range separator - "": "", # Wingdings decorative icon (rimosso) - "": "→", # right arrow variant - "": "", # bracket extension piece (non ricostruibile) - "": "", - "": "", - "": "", - "": "", - "": "", # TeX large paren left U+F8EB - "": "", # TeX large paren extension U+F8EC - "": "", # TeX large paren right U+F8ED - "": "", # TeX large paren right ext U+F8EE - "": "", # TeX large bracket left U+F8EF - "": "", # TeX large bracket ext U+F8F0 - "": "", # TeX brace top-left U+F8F1 - "": "", # TeX brace mid U+F8F2 - "": "", # TeX brace mid-right U+F8F3 - "": "", # TeX brace extension U+F8F4 - "": "", # TeX brace right U+F8F5 - "": "", # TeX bracket right large U+F8F6 - "": "", # TeX bracket right ext U+F8F7 - "": "", # TeX bracket right close U+F8F8 - "": "", # TeX integral large U+F8F9 - "": "", # TeX integral extension U+F8FA - "": "", # TeX integral top U+F8FB - "": "", # TeX radical top U+F8FC - "": "", # TeX radical extension U+F8FD - "": "", # TeX arrowhead U+F8FE -} - -_SYMBOL_PUA_RE = re.compile( - "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" -) - -_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+') -_FOOTNOTE_BODY_RE = re.compile( - r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)' -) -_NUMBERED_HDR_RE = re.compile( 
- r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$", - re.MULTILINE, -) -_BIB_MARKERS_RE = re.compile( - r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' - r'|\b(19|20)\d{2}\b', - re.IGNORECASE, -) -_WATERMARK_RE = re.compile( - r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN" - r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$", - re.IGNORECASE | re.MULTILINE, -) - -_MATH_SYMBOLS_RE = re.compile( - r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" -) -_EXERCISE_TRIGGER_RE = re.compile( - r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" - r"|Compute|Calculate|Dimostrare|Verificare)\b", - re.IGNORECASE, -) -_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$") -_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) - -# Erano compilati dentro le funzioni a ogni chiamata — ora costanti di modulo -_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$") -_FM_RE = re.compile( - r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|" - r"\bCopyright\b|\bLicenza\b|\bEdizione\b|" - r"protetto da|tutti i diritti", - re.IGNORECASE, -) -_VERSE_NUM_RE = re.compile( - r'([.!?\xbb\'\"’]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab"“‟])' -) - - -# ─── Helper puri ───────────────────────────────────────────────────────────── - -def _sentence_case(s: str) -> str: - if not s: - return s - lower = s.lower() - return lower[0].upper() + lower[1:] - - -def _is_allcaps_line(line: str) -> bool: - stripped = line.strip() - letters = [c for c in stripped if c.isalpha()] - return ( - len(letters) >= 3 - and all(c.isupper() for c in letters) - and not stripped.startswith("#") - and not stripped.startswith("|") - ) - - -def _allcaps_to_header(raw_line: str) -> str: - text = re.sub(r"^[-*+]\s+", "", raw_line.strip()) - text = text.rstrip(".").rstrip("?").strip() - - _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys()) - m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. 
(.+)", text) - if m: - roman = _ORDINALS_IT[m.group(1)] - titolo = m.group(2).rstrip(".").rstrip("?").strip() - return f"## Capitolo {roman} — {_sentence_case(titolo)}" - - _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys()) - m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text) - if m: - n = _ORDINALS_EN.get(m.group(1), m.group(1)) - titolo = m.group(2).rstrip(".").rstrip("?").strip() - return f"## Chapter {n} — {_sentence_case(titolo)}" - - m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text) - if m: - return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}" - - return f"## {_sentence_case(text)}" - - -def _extract_math_environments(text: str) -> tuple[str, int]: - _ENVS = ( - r"Definizione|Definition|Teorema|Theorem|Lemma|" - r"Proposizione|Proposition|Corollario|Corollary|" - r"Osservazione|Remark|Nota|Note|Esempio|Example" - ) - count = 0 - blocks = text.split("\n\n") - result = [] - - for block in blocks: - stripped = block.strip() - if not stripped or stripped.startswith("#"): - result.append(block) - continue - - m = re.match( - rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)", - stripped, - re.DOTALL, - ) - if not m: - result.append(block) - continue - - env = m.group(1) - num = m.group(2).rstrip(".") - rest = m.group(3).strip() - - title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL) - if title_m: - header = f"### {env} {num} {title_m.group(1)}" - body = title_m.group(2).strip() - else: - header = f"### {env} {num}." 
- body = rest - - result.append(f"{header}\n\n{body}" if body else header) - count += 1 - - return "\n\n".join(result), count - - -def _merge_title_headers(text: str) -> tuple[str, int]: - count = 0 - blocks = re.split(r"\n{2,}", text) - result = [] - i = 0 - while i < len(blocks): - block = blocks[i] - stripped = block.strip() - if ( - re.match(r"^#{2,3} \d+\.\s*$", stripped) - and i + 1 < len(blocks) - ): - nxt = blocks[i + 1].strip() - if ( - nxt - and "\n" not in nxt - and len(nxt) <= 80 - and not nxt.startswith("#") - and not re.match(r"^\d+[\.\)]\s", nxt) - ): - result.append(stripped.rstrip() + " " + nxt) - count += 1 - i += 2 - continue - result.append(block) - i += 1 - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count - - -def _extract_article_headers(text: str) -> tuple[str, int]: - count = 0 - - def _repl(m: re.Match) -> str: - nonlocal count - num = m.group(1) - rest = m.group(2).strip() - - title_m = re.match( - r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+" - r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})", - rest, - ) - if title_m: - count += 1 - return ( - f"### Art. {num}. {title_m.group(1)}.\n\n" - f"{title_m.group(2).strip()}" - ) - if rest: - count += 1 - return f"### Art. {num}.\n\n{rest}" - count += 1 - return f"### Art. {num}." 
- - text = re.sub( - r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)", - _repl, - text, - flags=re.MULTILINE, - ) - return text, count - - -# ─── Trasformazioni atomiche ────────────────────────────────────────────────── - -def _t_fix_symbol_font(text: str) -> tuple[str, int]: - count = [0] - - def _repl(m: re.Match) -> str: - count[0] += 1 - return _SYMBOL_PUA_MAP[m.group(0)] - - result = _SYMBOL_PUA_RE.sub(_repl, text) - return result, count[0] - - -def _t_remove_images(text: str) -> tuple[str, int]: - n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) - text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) - return text, n - - -def _t_remove_footnotes(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - stripped = line.strip() - if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300: - count += 1 - continue - cleaned = _SUPERSCRIPT_RE.sub("", line) - if cleaned != line: - count += 1 - result.append(cleaned) - return "\n".join(result), count - - -def _t_fix_br(text: str) -> tuple[str, int]: - n = len(re.findall(r"
", text, re.IGNORECASE)) - text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE) - return text, n - - -def _t_fix_tabsep(text: str) -> tuple[str, int]: - n = len(_TABSEP_RE.findall(text)) - text = _TABSEP_RE.sub("", text) - return text, n - - -def _t_fix_accents(text: str) -> tuple[str, int]: - _ACCENT_MAP = { - "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0", - "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc", - "o": "\xf2", "O": "\xd2", - } - n_bt_before = text.count("`") - text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text) - text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text) - n_accenti = n_bt_before - text.count("`") - n_bt_orfani = text.count("`") - if n_bt_orfani: - text = re.sub(r"`", "", text) - n_accenti += n_bt_orfani - return text, n_accenti - - -def _t_fix_multiplication(text: str) -> tuple[str, int]: - n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text)) - text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text) - return text, n - - -def _t_fix_micro(text: str) -> tuple[str, int]: - _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]' - n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text)) - text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text) - return text, n - - -def _t_remove_formula_labels(text: str) -> tuple[str, int]: - n = len(re.findall(r"\[\d+\.\d+\]", text)) - text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text) - return text, n - - -def _t_remove_dotleaders(text: str) -> tuple[str, int]: - _DOTLEADER_RE = r"^[^\n]*(?:(?:\. 
){3,}|\.{4,})[^\n]*$" - n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE)) - text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE) - text = re.sub( - r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$", - "", - text, - flags=re.IGNORECASE, - ) - return text, n - - -def _t_fix_header_concat(text: str) -> tuple[str, int]: - count = 0 - - def _fix(m: re.Match) -> str: - nonlocal count - hashes = m.group(1) - full = m.group(2).strip() - if len(full) < 60: - return m.group(0) - skip = min(10, len(full) // 3) - split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", full[skip:]) - if split: - pos = skip + split.start() - title = full[:pos].strip() - body = full[pos:].strip() - if len(title) >= 5 and len(body) >= 15: - count += 1 - return f"{hashes} {title}\n\n{body}" - return m.group(0) - - text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE) - return text, count - - -def _t_extract_capitolo(text: str) -> tuple[str, int]: - def _repl(m: re.Match) -> str: - num = m.group(1) - titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip()) - return f"\n\n## Capitolo {num}: {titolo}\n\n" - - text = re.sub( - r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]" - r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)" - r"(?=\s*[-–]\s*\d|\s*\n|\s*$)", - _repl, - text, - ) - return text, 0 - - -def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: - all_matches = list(_NUMBERED_HDR_RE.finditer(text)) - if not all_matches: - return text, 0 - - pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches] - depths = [d for d, _ in pairs] - min_depth = min(depths) - max_depth = max(depths) - if max_depth == min_depth: - return text, 0 - - base_level = min(lv for d, lv in pairs if d == min_depth) - count = 0 - - def _repl(m: re.Match) -> str: - nonlocal count - hashes, num, title = m.group(1), m.group(2), m.group(3) - depth = num.count(".") + 
1 - new_level = min(base_level + (depth - min_depth), 6) - if new_level == len(hashes): - return m.group(0) - count += 1 - return f"{'#' * new_level} {num}. {title}" - - return _NUMBERED_HDR_RE.sub(_repl, text), count - - -def _t_normalize_header_levels(text: str) -> tuple[str, int]: - text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) - text = re.sub( - r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", - lambda m: f"### {m.group(2)}. {m.group(3)}", - text, - flags=re.MULTILINE, - ) - text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE) - return text, 0 - - -def _t_extract_articles(text: str) -> tuple[str, int]: - return _extract_article_headers(text) - - -def _t_remove_header_bold(text: str) -> tuple[str, int]: - text = re.sub( - r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", - r"\1 \2", - text, flags=re.MULTILINE, - ) - return text, 0 - - -def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]: - def _norm(m: re.Match) -> str: - hashes, content = m.group(1), m.group(2).strip() - letters = [c for c in content if c.isalpha()] - if letters and all(c.isupper() for c in letters): - return f"{hashes} {_sentence_case(content)}" - return m.group(0) - - text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE) - return text, 0 - - -def _t_remove_toc(text: str) -> tuple[str, int]: - lines = text.split("\n") - new_lines = [] - _in_toc = False - removed = False - for line in lines: - bare = re.sub(r"^#+\s*", "", line.strip()) - first_word = bare.split(".")[0].strip().lower() - if first_word in _TOC_KEYWORDS: - removed = True - _in_toc = True - continue - if _in_toc: - if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): - continue - if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): - continue - if len(line.strip()) > 200: - _in_toc = False - new_lines.append(line) - continue - _in_toc = False - new_lines.append(line) - return "\n".join(new_lines), 1 if removed else 0 - - -def _t_allcaps_to_headers(text: str) -> tuple[str, int]: - count = 0 
- blocks = text.split("\n\n") - new_blocks = [] - for block in blocks: - stripped = block.strip() - if "\n" not in stripped and _is_allcaps_line(stripped): - new_blocks.append(_allcaps_to_header(stripped)) - count += 1 - else: - sub_lines = block.split("\n") - converted = [] - for ln in sub_lines: - if _is_allcaps_line(ln) and len(ln.strip()) > 3: - converted.append(_allcaps_to_header(ln)) - count += 1 - else: - converted.append(ln) - new_blocks.append("\n".join(converted)) - return "\n\n".join(new_blocks), count - - -def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: - count = 0 - - def _num_repl(m: re.Match) -> str: - nonlocal count - content = m.group(2).strip() - if content.endswith(".") and len(content) > 40: - return m.group(0) - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - return f"### {m.group(1)}.\n\n{content}" - - text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) - - def _num_letter_repl(m: re.Match) -> str: - nonlocal count - count += 1 - return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" - - text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) - - if not has_exercises: - def _aphorism_repl(m: re.Match) -> str: - nonlocal count - content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - return f"\n\n### {m.group(1)}.\n\n{content}" - - text = re.sub( - r"^-\s+(\d{1,3})\.\s+(.{10,})$", - _aphorism_repl, - text, - flags=re.MULTILINE, - ) - - def _list_section_repl(m: re.Match) -> str: - nonlocal count - num = m.group(1) - content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): - return m.group(0) - count += 1 - split = re.search(r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content) - if split and split.start() >= 3: - title = content[: split.start()].strip() - body = content[split.end():].strip() - if len(body) >= 20: - 
return f"\n\n### {num}. {title}\n\n{body}" - return f"\n\n### {num}. {content}" - - text = re.sub( - r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", - _list_section_repl, - text, - flags=re.MULTILINE, - ) - return text, count - - -def _t_extract_math(text: str) -> tuple[str, int]: - return _extract_math_environments(text) - - -def _t_merge_paragraphs(text: str) -> tuple[str, int]: - _SENTENCE_END = set(".?!\xbb)\"'") - blocks = text.split("\n\n") - merged = [] - count = 0 - i = 0 - while i < len(blocks): - b = blocks[i] - stripped = b.strip() - while ( - i + 1 < len(blocks) - and stripped - and not stripped.startswith("#") - and not stripped.startswith("|") - and stripped[-1] not in _SENTENCE_END - ): - nxt = blocks[i + 1].strip() - if ( - not nxt - or nxt.startswith("#") - or nxt.startswith("|") - or re.match(r"^\d+\.", nxt) - or re.match(r"^[-*+]\s", nxt) - ): - break - b = stripped + " " + nxt - stripped = b.strip() - count += 1 - i += 1 - merged.append(b) - i += 1 - text = "\n\n".join(merged) - text = re.sub(r"(?m)^\|---\|\s*", "", text) - return text, count - - -def _t_normalize_whitespace(text: str) -> tuple[str, int]: - lines = text.split("\n") - text = "\n".join( - re.sub(r" +", " ", line) if line.strip() else line - for line in lines - ) - return text, 0 - - -def _t_collapse_blank_lines(text: str) -> tuple[str, int]: - return re.sub(r"\n{3,}", "\n\n", text), 0 - - -def _t_demote_verse_headers(text: str) -> tuple[str, int]: - count = 0 - - def _demote(m: re.Match) -> str: - nonlocal count - hashes, content = m.group(1), m.group(2).strip() - if not re.search(r"\s\d{1,4}\s*$", content): - return m.group(0) - inner = re.sub(r"\s\d{1,4}\s*$", "", content) - if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab"“]', inner): - return m.group(0) - count += 1 - clean = re.sub(r"\s\d{1,4}\s*$", "", content) - return clean - - text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE) - return text, count - - -def 
_t_restore_poetry_lines(text: str) -> tuple[str, int]: - count = 0 - blocks = text.split("\n\n") - result = [] - - for block in blocks: - stripped = block.strip() - if not stripped or stripped.startswith("#"): - result.append(block) - continue - - matches = list(_VERSE_NUM_RE.finditer(stripped)) - if len(matches) < 2: - result.append(block) - continue - - nums = [int(m.group(2)) for m in matches] - diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)] - if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5): - result.append(block) - continue - - step = diffs[0] - - def _replace_verse_num(m: re.Match) -> str: - n = int(m.group(2)) - sep = "\n\n" if n % (step * 3) == 0 else "\n" - return m.group(1).rstrip() + sep - - new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped) - if new_block != stripped: - count += len(matches) - result.append(new_block) - - return "\n\n".join(result), count - - -def _t_remove_urls(text: str) -> tuple[str, int]: - return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0 - - -def _t_remove_empty_headers(text: str) -> tuple[str, int]: - blocks = re.split(r"\n{2,}", text) - cleaned = [] - for i, block in enumerate(blocks): - stripped = block.strip() - if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped: - next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else "" - next_is_long_hdr = ( - re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80 - ) - if not next_stripped or ( - re.match(r"^#{1,6} ", next_stripped) and not next_is_long_hdr - ): - continue - cleaned.append(block) - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0 - - -def _t_merge_title_headers(text: str) -> tuple[str, int]: - return _merge_title_headers(text) - - -def _t_remove_garbage_headers(text: str) -> tuple[str, int]: - def _is_garbage(content: str) -> bool: - if content.lstrip().startswith("..."): - return True - if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content): - return True - if 
re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()): - return True - if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content): - return True - first_alpha = next((c for c in content if c.isalpha()), None) - if first_alpha and first_alpha.islower() and len(content) > 40: - return True - if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()): - return True - if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE): - return True - return False - - count = 0 - lines = text.split("\n") - new_lines = [] - for line in lines: - m = re.match(r"^#{1,6} (.+)$", line) - if m and _is_garbage(m.group(1)): - count += 1 - continue - new_lines.append(line) - text = "\n".join(new_lines) - text = re.sub(r"\n{3,}", "\n\n", text) - return text, count - - -def _t_remove_frontmatter(text: str) -> tuple[str, int]: - blocks = re.split(r"\n{2,}", text) - cleaned = [] - count = 0 - total = len(blocks) - cutoff = max(5, min(15, int(total * 0.20))) - for i, block in enumerate(blocks): - stripped = block.strip() - if i >= cutoff: - cleaned.append(block) - continue - if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped): - cleaned.append(block) - continue - body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" - is_fm_body = len(body) < 250 and _FM_RE.search(body) - is_fm_hdr = _FM_RE.search(stripped) - if is_fm_body or is_fm_hdr: - count += 1 - continue - cleaned.append(block) - return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count - - -def _t_remove_watermarks(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - if _WATERMARK_RE.match(line): - count += 1 - else: - result.append(line) - return "\n".join(result), count - - -def _t_fix_math_symbols(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - if line.strip() and re.match(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$", line): - count += 1 - else: - 
result.append(line) - return "\n".join(result), count - - -def _t_remove_recurring_lines(text: str) -> tuple[str, int]: - lines = text.split("\n") - short_lines = [ - ln.strip() for ln in lines - if 3 < len(ln.strip()) < 80 - and not ln.strip().startswith("#") - and not ln.strip().startswith("|") - ] - freq = Counter(short_lines) - recurring = {ln for ln, c in freq.items() if c >= 5} - if not recurring: - return text, 0 - result, count = [], 0 - for line in lines: - if line.strip() in recurring: - count += 1 - else: - result.append(line) - return "\n".join(result), count - - -def _t_math_header_demotion(text: str) -> tuple[str, int]: - lines = text.split("\n") - result, count = [], 0 - for line in lines: - m = _MATH_HDR_RE.match(line) - if not m: - result.append(line) - continue - body = m.group(2) - if len(body) <= 100: - result.append(line) - continue - has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 - has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) - if not (has_math or has_exercise): - result.append(line) - continue - nm = _NUMBERED_PREFIX_RE.match(body) - if nm: - result.append(f"**{nm.group(1)}** {nm.group(2)}") - else: - result.append(body) - count += 1 - return "\n".join(result), count - - -# ─── Orchestratore ─────────────────────────────────────────────────────────── - -def apply_transforms(text: str) -> tuple[str, dict]: - """ - Applica le trasformazioni strutturali al Markdown grezzo. - Restituisce (testo_modificato, statistiche). - L'ordine è semantico: encoding → struttura header → costruzione struttura → testo → rifinitura. 
- """ - _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)) - - _transforms: list[tuple[str | None, object]] = [ - ("n_simboli_pua_corretti", _t_fix_symbol_font), - ("n_immagini_rimosse", _t_remove_images), - ("n_br_rimossi", _t_fix_br), - ("n_tabsep_rimossi", _t_fix_tabsep), - ("n_note_rimosse", _t_remove_footnotes), - ("n_accenti_corretti", _t_fix_accents), - ("n_moltiplicazioni_corrette", _t_fix_multiplication), - ("n_micro_corretti", _t_fix_micro), - ("n_simboli_math_rimossi", _t_fix_math_symbols), - ("n_formule_rimossi", _t_remove_formula_labels), - ("n_dotleader_rimossi", _t_remove_dotleaders), - ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines), - ("n_header_concat_fixati", _t_fix_header_concat), - (None, _t_extract_capitolo), - ("n_header_numerati_normalizzati", _t_normalize_numbered_headings), - (None, _t_normalize_header_levels), - ("n_articoli_estratti", _t_extract_articles), - (None, _t_remove_header_bold), - (None, _t_normalize_allcaps_headers), - ("toc_rimosso", _t_remove_toc), - ("n_header_allcaps", _t_allcaps_to_headers), - ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)), - ("n_ambienti_matematici", _t_extract_math), - ("n_paragrafi_uniti", _t_merge_paragraphs), - (None, _t_normalize_whitespace), - (None, _t_collapse_blank_lines), - ("n_versi_ripristinati", _t_restore_poetry_lines), - ("n_header_verso_demotati", _t_demote_verse_headers), - (None, _t_remove_urls), - (None, _t_remove_empty_headers), - ("n_titoli_uniti", _t_merge_title_headers), - (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)), - ("n_garbage_headers_rimossi", _t_remove_garbage_headers), - ("n_formula_headers_demotati", _t_math_header_demotion), - ("n_frontmatter_rimossi", _t_remove_frontmatter), - ("n_watermark_rimossi", _t_remove_watermarks), - ] - - stats: dict = {} - for stat_key, fn in _transforms: - text, n = fn(text) - if stat_key: - stats[stat_key] = 
stats.get(stat_key, 0) + n - - stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0)) - return text, stats diff --git a/conversione/_pipeline/transforms/__init__.py b/conversione/_pipeline/transforms/__init__.py new file mode 100644 index 0000000..9b02e60 --- /dev/null +++ b/conversione/_pipeline/transforms/__init__.py @@ -0,0 +1,4 @@ +"""Package transforms: pipeline di pulizia strutturale per Markdown RAG.""" +from ._apply import apply_transforms + +__all__ = ["apply_transforms"] diff --git a/conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..bd0f2d7 Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/__init__.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc new file mode 100644 index 0000000..c98ec7f Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_apply.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_artifacts.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_artifacts.cpython-312.pyc new file mode 100644 index 0000000..ab50816 Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_artifacts.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc new file mode 100644 index 0000000..68f5d79 Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_constants.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc new file mode 100644 index 0000000..c8df4b6 Binary files /dev/null and 
b/conversione/_pipeline/transforms/__pycache__/_encoding.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_finish.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_finish.cpython-312.pyc new file mode 100644 index 0000000..854b0dd Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_finish.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc new file mode 100644 index 0000000..55f3d9c Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_headers.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc new file mode 100644 index 0000000..5ac5beb Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_helpers.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_structure.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_structure.cpython-312.pyc new file mode 100644 index 0000000..8007942 Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_structure.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/__pycache__/_text.cpython-312.pyc b/conversione/_pipeline/transforms/__pycache__/_text.cpython-312.pyc new file mode 100644 index 0000000..f6f3e6f Binary files /dev/null and b/conversione/_pipeline/transforms/__pycache__/_text.cpython-312.pyc differ diff --git a/conversione/_pipeline/transforms/_apply.py b/conversione/_pipeline/transforms/_apply.py new file mode 100644 index 0000000..efa1565 --- /dev/null +++ b/conversione/_pipeline/transforms/_apply.py @@ -0,0 +1,96 @@ +"""Orchestratore: applica le trasformazioni in ordine semantico.""" +import re +from functools import partial + +from ._encoding import ( + _t_fix_symbol_font, 
_t_fix_accents, + _t_fix_multiplication, _t_fix_micro, +) +from ._artifacts import ( + _t_remove_images, _t_fix_br, _t_fix_tabsep, _t_remove_footnotes, + _t_remove_formula_labels, _t_remove_dotleaders, _t_remove_recurring_lines, + _t_fix_math_symbols, _t_remove_watermarks, _t_remove_urls, +) +from ._headers import ( + _t_fix_header_concat, _t_extract_capitolo, + _t_normalize_numbered_headings, _t_normalize_header_levels, + _t_remove_header_bold, _t_normalize_allcaps_headers, +) +from ._structure import ( + _t_remove_toc, _t_remove_orphan_toc, _t_allcaps_to_headers, + _t_numbered_sections, _t_extract_math, _t_extract_articles, +) +from ._text import ( + _t_merge_paragraphs, _t_normalize_whitespace, _t_collapse_blank_lines, + _t_restore_poetry_lines, _t_demote_verse_headers, +) +from ._finish import ( + _t_remove_empty_headers, _t_merge_title_headers, + _t_remove_garbage_headers, _t_math_header_demotion, + _t_remove_frontmatter, +) + + +def apply_transforms(text: str) -> tuple[str, dict]: + """ + Applica le trasformazioni strutturali al Markdown grezzo. + Restituisce (testo_modificato, statistiche). + L'ordine è semantico: encoding → artefatti → struttura header → + costruzione struttura → testo → rifinitura. + """ + _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE)) + + _transforms: list[tuple[str | None, object]] = [ + # 1. Encoding + ("n_simboli_pua_corretti", _t_fix_symbol_font), + ("n_accenti_corretti", _t_fix_accents), + ("n_moltiplicazioni_corrette", _t_fix_multiplication), + ("n_micro_corretti", _t_fix_micro), + # 2. Pulizia artefatti + ("n_immagini_rimosse", _t_remove_images), + ("n_br_rimossi", _t_fix_br), + ("n_tabsep_rimossi", _t_fix_tabsep), + ("n_note_rimosse", _t_remove_footnotes), + ("n_simboli_math_rimossi", _t_fix_math_symbols), + ("n_formule_rimossi", _t_remove_formula_labels), + ("n_dotleader_rimossi", _t_remove_dotleaders), + ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines), + # 3. 
Struttura header + ("n_header_concat_fixati", _t_fix_header_concat), + (None, _t_extract_capitolo), + ("n_header_numerati_normalizzati", _t_normalize_numbered_headings), + (None, _t_normalize_header_levels), + (None, _t_remove_header_bold), + (None, _t_normalize_allcaps_headers), + # 4. Costruzione struttura + ("toc_rimosso", _t_remove_toc), + ("n_toc_orfani_rimossi", _t_remove_orphan_toc), + ("n_header_allcaps", _t_allcaps_to_headers), + ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex)), + ("n_ambienti_matematici", _t_extract_math), + ("n_articoli_estratti", _t_extract_articles), + # 5. Testo + ("n_paragrafi_uniti", _t_merge_paragraphs), + (None, _t_normalize_whitespace), + (None, _t_collapse_blank_lines), + ("n_versi_ripristinati", _t_restore_poetry_lines), + ("n_header_verso_demotati", _t_demote_verse_headers), + (None, _t_remove_urls), + # 6. Rifinitura + (None, _t_remove_empty_headers), + ("n_titoli_uniti", _t_merge_title_headers), + (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)), + ("n_garbage_headers_rimossi", _t_remove_garbage_headers), + ("n_formula_headers_demotati", _t_math_header_demotion), + ("n_frontmatter_rimossi", _t_remove_frontmatter), + ("n_watermark_rimossi", _t_remove_watermarks), + ] + + stats: dict = {} + for stat_key, fn in _transforms: + text, n = fn(text) + if stat_key: + stats[stat_key] = stats.get(stat_key, 0) + n + + stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0)) + return text, stats diff --git a/conversione/_pipeline/transforms/_artifacts.py b/conversione/_pipeline/transforms/_artifacts.py new file mode 100644 index 0000000..a3e2f67 --- /dev/null +++ b/conversione/_pipeline/transforms/_artifacts.py @@ -0,0 +1,106 @@ +"""Rimozione artefatti: immagini, BR, footnote, URL, righe ricorrenti, watermark.""" +import re +from collections import Counter + +from ._constants import ( + _WATERMARK_RE, _TABSEP_RE, _SUPERSCRIPT_RE, _FOOTNOTE_BODY_RE, +) + + +def 
_t_remove_images(text: str) -> tuple[str, int]: + n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text)) + text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text) + return text, n + + +def _t_fix_br(text: str) -> tuple[str, int]: + n = len(re.findall(r"
", text, re.IGNORECASE)) + text = re.sub(r"
\s*", " ", text, flags=re.IGNORECASE) + return text, n + + +def _t_fix_tabsep(text: str) -> tuple[str, int]: + n = len(_TABSEP_RE.findall(text)) + text = _TABSEP_RE.sub("", text) + return text, n + + +def _t_remove_footnotes(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + stripped = line.strip() + if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300: + count += 1 + continue + cleaned = _SUPERSCRIPT_RE.sub("", line) + if cleaned != line: + count += 1 + result.append(cleaned) + return "\n".join(result), count + + +def _t_remove_formula_labels(text: str) -> tuple[str, int]: + n = len(re.findall(r"\[\d+\.\d+\]", text)) + text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text) + return text, n + + +def _t_remove_dotleaders(text: str) -> tuple[str, int]: + _DOTLEADER_RE = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$" + n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE)) + text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE) + text = re.sub( + r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$", + "", + text, + flags=re.IGNORECASE, + ) + return text, n + + +def _t_remove_recurring_lines(text: str) -> tuple[str, int]: + lines = text.split("\n") + short_lines = [ + ln.strip() for ln in lines + if 3 < len(ln.strip()) < 80 + and not ln.strip().startswith("#") + and not ln.strip().startswith("|") + ] + freq = Counter(short_lines) + recurring = {ln for ln, c in freq.items() if c >= 5} + if not recurring: + return text, 0 + result, count = [], 0 + for line in lines: + if line.strip() in recurring: + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_fix_math_symbols(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + if line.strip() and re.match(r"^[\s■-◿☐-☒•▪▫◆◇●○•]+$", line): + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_remove_watermarks(text: str) -> tuple[str, int]: + lines = 
text.split("\n") + result, count = [], 0 + for line in lines: + if _WATERMARK_RE.match(line): + count += 1 + else: + result.append(line) + return "\n".join(result), count + + +def _t_remove_urls(text: str) -> tuple[str, int]: + return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0 diff --git a/conversione/_pipeline/transforms/_constants.py b/conversione/_pipeline/transforms/_constants.py new file mode 100644 index 0000000..18760e0 --- /dev/null +++ b/conversione/_pipeline/transforms/_constants.py @@ -0,0 +1,161 @@ +""" +Costanti di modulo condivise tra i moduli di trasformazione. +Tutte le regex compilate e le mappe statiche vivono qui. +""" +import re + +# ─── Keyword sets ───────────────────────────────────────────────────────────── + +_TOC_KEYWORDS = frozenset([ + "indice", "index", "contents", "table of contents", + "sommario", "inhaltsverzeichnis", "inhalt", + "indice generale", "indice analitico", "indice dei contenuti", + "elenco dei capitoli", "argomenti", "table des matières", + "tabla de contenidos", "содержание", +]) + +_ORDINALS_IT = { + "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV", + "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII", + "NONO": "IX", "DECIMO": "X", +} +_ORDINALS_EN = { + "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5", + "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10", +} + +# ─── PUA Symbol font map ────────────────────────────────────────────────────── + +_SYMBOL_PUA_MAP: dict[str, str] = { + "": " ", + "": "(", + "": ")", + "": "+", + "": "−", + "": ".", + "": "/", + "": "0", "": "1", "": "2", "": "3", "": "4", + "": "5", "": "6", "": "7", "": "8", "": "9", + "": ":", "": ";", "": "<", "": "=", "": ">", + "": "≅", + "": "Α", "": "Β", "": "Χ", "": "Δ", "": "Ε", + "": "Φ", "": "Γ", "": "Η", "": "Ι", "": "ϑ", + "": "Κ", "": "Λ", "": "Μ", "": "Ν", "": "Ο", + "": "Π", "": "Θ", "": "Ρ", "": "Σ", "": "Τ", + "": "Υ", "": "ς", 
"": "Ω", "": "Ξ", "": "Ψ", + "": "Ζ", + "": "[", + "": "∴", + "": "]", + "": "⊥", + "": "α", "": "β", "": "χ", "": "δ", "": "ε", + "": "φ", "": "γ", "": "η", "": "ι", "": "ϕ", + "": "κ", "": "λ", "": "μ", "": "ν", "": "ο", + "": "π", "": "θ", "": "ρ", "": "σ", "": "τ", + "": "υ", "": "ϖ", "": "ω", "": "ξ", "": "ψ", + "": "ζ", + "": "{", + "": "|", + "": "}", + "": "~", + "": "±", + "": "•", + "": "√", + "": "≤", + "": "≥", + "": "∝", + "": "×", + "": "÷", + "": "×", + "": "≠", + "": "≠", + "": "≥", + "": "′", + "": "*", + "": ",", + "": "≤", + "": "•", + "": "•", + "": "→", + "": "÷", + "": "", + "": "→", + "": "", + "": "", + "": "", + "": "", + # TeX Computer Modern bracket/delimiter pieces (U+F8EB–F8FE) → stringa vuota + "": "", # TeX large paren left + "": "", # TeX large paren extension + "": "", # TeX large paren right + "": "", # TeX large paren right ext + "": "", # TeX large bracket left + "": "", # TeX large bracket ext + "": "", # TeX brace top-left + "": "", # TeX brace mid + "": "", # TeX brace mid-right + "": "", # TeX brace extension + "": "", # TeX brace right + "": "", # TeX bracket right large + "": "", # TeX bracket right ext + "": "", # TeX bracket right close + "": "", # TeX integral large + "": "", # TeX integral extension + "": "", # TeX integral top + "": "", # TeX radical top + "": "", # TeX radical extension + "": "", # TeX arrowhead +} + +_SYMBOL_PUA_RE = re.compile( + "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]" +) + +# ─── Regex compilate condivise ──────────────────────────────────────────────── + +_SUPERSCRIPT_RE = re.compile(r'[¹²³⁰⁴-⁹]+') +_FOOTNOTE_BODY_RE = re.compile( + r'^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)' +) +_NUMBERED_HDR_RE = re.compile( + r"^(#{1,6})\s+(\d+(?:\.\d+)*)\.\s+(.+)$", + re.MULTILINE, +) +_BIB_MARKERS_RE = re.compile( + r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' + r'|\b(19|20)\d{2}\b', + re.IGNORECASE, +) 
+_WATERMARK_RE = re.compile( + r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN" + r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$", + re.IGNORECASE | re.MULTILINE, +) +_TABSEP_RE = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$") +_FM_RE = re.compile( + r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|" + r"\bCopyright\b|\bLicenza\b|\bEdizione\b|" + r"protetto da|tutti i diritti", + re.IGNORECASE, +) +_VERSE_NUM_RE = re.compile( + r"([.!?\xbb'\"" + "’" + r"]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab“”‟])" +) +# Math header demotion +_MATH_SYMBOLS_RE = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" +) +_EXERCISE_TRIGGER_RE = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, +) +_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$") +_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) +# Orphan TOC: voce di indice senza dot-leader (es. "3. 
Funzioni 174") +_TOC_ITEM_RE = re.compile( + r"^\d+(\.\d+)*\.?\s+[A-Za-zÀ-ú\'\(][^\n]{2,70}$" +) +_TOC_HDR_WITH_PAGE_RE = re.compile( + r"^#{1,3}\s+\d+\.?\s+.{3,60}\s+\d{1,4}$" +) diff --git a/conversione/_pipeline/transforms/_encoding.py b/conversione/_pipeline/transforms/_encoding.py new file mode 100644 index 0000000..2ce4ec3 --- /dev/null +++ b/conversione/_pipeline/transforms/_encoding.py @@ -0,0 +1,45 @@ +"""Trasformazioni di encoding: PUA font Symbol, accenti LaTeX, simboli SI.""" +import re + +from ._constants import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE + + +def _t_fix_symbol_font(text: str) -> tuple[str, int]: + count = [0] + + def _repl(m: re.Match) -> str: + count[0] += 1 + return _SYMBOL_PUA_MAP[m.group(0)] + + result = _SYMBOL_PUA_RE.sub(_repl, text) + return result, count[0] + + +def _t_fix_accents(text: str) -> tuple[str, int]: + _ACCENT_MAP = { + "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0", + "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc", + "o": "\xf2", "O": "\xd2", + } + n_bt_before = text.count("`") + text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text) + text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text) + n_accenti = n_bt_before - text.count("`") + n_bt_orfani = text.count("`") + if n_bt_orfani: + text = re.sub(r"`", "", text) + n_accenti += n_bt_orfani + return text, n_accenti + + +def _t_fix_multiplication(text: str) -> tuple[str, int]: + n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text)) + text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text) + return text, n + + +def _t_fix_micro(text: str) -> tuple[str, int]: + _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]' + n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text)) + text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text) + return text, n diff --git a/conversione/_pipeline/transforms/_finish.py b/conversione/_pipeline/transforms/_finish.py new file mode 100644 index 0000000..a5f8a8e --- /dev/null +++ b/conversione/_pipeline/transforms/_finish.py 
@@ -0,0 +1,116 @@ +"""Trasformazioni di rifinitura: header vuoti, garbage, demozione formula-header, frontmatter.""" +import re + +from ._constants import ( + _FM_RE, _MATH_HDR_RE, _MATH_SYMBOLS_RE, + _EXERCISE_TRIGGER_RE, _NUMBERED_PREFIX_RE, +) +from ._helpers import _merge_title_headers + + +def _t_remove_empty_headers(text: str) -> tuple[str, int]: + blocks = re.split(r"\n{2,}", text) + cleaned = [] + for i, block in enumerate(blocks): + stripped = block.strip() + if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped: + next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + next_is_long_hdr = ( + re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80 + ) + if not next_stripped or ( + re.match(r"^#{1,6} ", next_stripped) and not next_is_long_hdr + ): + continue + cleaned.append(block) + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0 + + +def _t_merge_title_headers(text: str) -> tuple[str, int]: + return _merge_title_headers(text) + + +def _t_remove_garbage_headers(text: str) -> tuple[str, int]: + def _is_garbage(content: str) -> bool: + if content.lstrip().startswith("..."): + return True + if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content): + return True + if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()): + return True + if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content): + return True + first_alpha = next((c for c in content if c.isalpha()), None) + if first_alpha and first_alpha.islower() and len(content) > 40: + return True + if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()): + return True + if re.match( + r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", + content.strip(), re.IGNORECASE, + ): + return True + return False + + count = 0 + lines = text.split("\n") + new_lines = [] + for line in lines: + m = re.match(r"^#{1,6} (.+)$", line) + if m and _is_garbage(m.group(1)): + count += 1 + continue + new_lines.append(line) + text = "\n".join(new_lines) + text = 
re.sub(r"\n{3,}", "\n\n", text) + return text, count + + +def _t_math_header_demotion(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + m = _MATH_HDR_RE.match(line) + if not m: + result.append(line) + continue + body = m.group(2) + if len(body) <= 100: + result.append(line) + continue + has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 + has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) + if not (has_math or has_exercise): + result.append(line) + continue + nm = _NUMBERED_PREFIX_RE.match(body) + if nm: + result.append(f"**{nm.group(1)}** {nm.group(2)}") + else: + result.append(body) + count += 1 + return "\n".join(result), count + + +def _t_remove_frontmatter(text: str) -> tuple[str, int]: + blocks = re.split(r"\n{2,}", text) + cleaned = [] + count = 0 + total = len(blocks) + cutoff = max(5, min(15, int(total * 0.20))) + for i, block in enumerate(blocks): + stripped = block.strip() + if i >= cutoff: + cleaned.append(block) + continue + if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped): + cleaned.append(block) + continue + body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + is_fm_body = len(body) < 250 and _FM_RE.search(body) + is_fm_hdr = _FM_RE.search(stripped) + if is_fm_body or is_fm_hdr: + count += 1 + continue + cleaned.append(block) + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count diff --git a/conversione/_pipeline/transforms/_headers.py b/conversione/_pipeline/transforms/_headers.py new file mode 100644 index 0000000..5e34247 --- /dev/null +++ b/conversione/_pipeline/transforms/_headers.py @@ -0,0 +1,110 @@ +"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold.""" +import re + +from ._constants import _NUMBERED_HDR_RE +from ._helpers import _sentence_case + + +def _t_fix_header_concat(text: str) -> tuple[str, int]: + count = 0 + + def _fix(m: re.Match) -> str: + nonlocal count + hashes = m.group(1) + full = 
m.group(2).strip() + if len(full) < 60: + return m.group(0) + skip = min(10, len(full) // 3) + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + full[skip:], + ) + if split: + pos = skip + split.start() + title = full[:pos].strip() + body = full[pos:].strip() + if len(title) >= 5 and len(body) >= 15: + count += 1 + return f"{hashes} {title}\n\n{body}" + return m.group(0) + + text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE) + return text, count + + +def _t_extract_capitolo(text: str) -> tuple[str, int]: + def _repl(m: re.Match) -> str: + num = m.group(1) + titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip()) + return f"\n\n## Capitolo {num}: {titolo}\n\n" + + text = re.sub( + r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]" + r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)" + r"(?=\s*[-–]\s*\d|\s*\n|\s*$)", + _repl, + text, + ) + return text, 0 + + +def _t_normalize_numbered_headings(text: str) -> tuple[str, int]: + all_matches = list(_NUMBERED_HDR_RE.finditer(text)) + if not all_matches: + return text, 0 + + pairs = [(m.group(2).count(".") + 1, len(m.group(1))) for m in all_matches] + depths = [d for d, _ in pairs] + min_depth = min(depths) + max_depth = max(depths) + if max_depth == min_depth: + return text, 0 + + base_level = min(lv for d, lv in pairs if d == min_depth) + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + hashes, num, title = m.group(1), m.group(2), m.group(3) + depth = num.count(".") + 1 + new_level = min(base_level + (depth - min_depth), 6) + if new_level == len(hashes): + return m.group(0) + count += 1 + return f"{'#' * new_level} {num}. 
{title}" + + return _NUMBERED_HDR_RE.sub(_repl, text), count + + +def _t_normalize_header_levels(text: str) -> tuple[str, int]: + text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE) + text = re.sub( + r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", + lambda m: f"### {m.group(2)}. {m.group(3)}", + text, + flags=re.MULTILINE, + ) + text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE) + return text, 0 + + +def _t_remove_header_bold(text: str) -> tuple[str, int]: + text = re.sub( + r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", + r"\1 \2", + text, flags=re.MULTILINE, + ) + return text, 0 + + +def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]: + def _norm(m: re.Match) -> str: + hashes, content = m.group(1), m.group(2).strip() + letters = [c for c in content if c.isalpha()] + if letters and all(c.isupper() for c in letters): + return f"{hashes} {_sentence_case(content)}" + return m.group(0) + + text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE) + return text, 0 diff --git a/conversione/_pipeline/transforms/_helpers.py b/conversione/_pipeline/transforms/_helpers.py new file mode 100644 index 0000000..e91ad1b --- /dev/null +++ b/conversione/_pipeline/transforms/_helpers.py @@ -0,0 +1,153 @@ +"""Funzioni helper pure condivise tra i moduli di trasformazione.""" +import re + +from ._constants import _ORDINALS_IT, _ORDINALS_EN + + +def _sentence_case(s: str) -> str: + if not s: + return s + lower = s.lower() + return lower[0].upper() + lower[1:] + + +def _is_allcaps_line(line: str) -> bool: + stripped = line.strip() + letters = [c for c in stripped if c.isalpha()] + return ( + len(letters) >= 3 + and all(c.isupper() for c in letters) + and not stripped.startswith("#") + and not stripped.startswith("|") + ) + + +def _allcaps_to_header(raw_line: str) -> str: + text = re.sub(r"^[-*+]\s+", "", raw_line.strip()) + text = text.rstrip(".").rstrip("?").strip() + + _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys()) + m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. 
(.+)", text) + if m: + roman = _ORDINALS_IT[m.group(1)] + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Capitolo {roman} — {_sentence_case(titolo)}" + + _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys()) + m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text) + if m: + n = _ORDINALS_EN.get(m.group(1), m.group(1)) + titolo = m.group(2).rstrip(".").rstrip("?").strip() + return f"## Chapter {n} — {_sentence_case(titolo)}" + + m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text) + if m: + return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}" + + return f"## {_sentence_case(text)}" + + +def _extract_math_environments(text: str) -> tuple[str, int]: + _ENVS = ( + r"Definizione|Definition|Teorema|Theorem|Lemma|" + r"Proposizione|Proposition|Corollario|Corollary|" + r"Osservazione|Remark|Nota|Note|Esempio|Example" + ) + count = 0 + blocks = text.split("\n\n") + result = [] + + for block in blocks: + stripped = block.strip() + if not stripped or stripped.startswith("#"): + result.append(block) + continue + + m = re.match( + rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)", + stripped, + re.DOTALL, + ) + if not m: + result.append(block) + continue + + env = m.group(1) + num = m.group(2).rstrip(".") + rest = m.group(3).strip() + + title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL) + if title_m: + header = f"### {env} {num} {title_m.group(1)}" + body = title_m.group(2).strip() + else: + header = f"### {env} {num}." 
+ body = rest + + result.append(f"{header}\n\n{body}" if body else header) + count += 1 + + return "\n\n".join(result), count + + +def _merge_title_headers(text: str) -> tuple[str, int]: + count = 0 + blocks = re.split(r"\n{2,}", text) + result = [] + i = 0 + while i < len(blocks): + block = blocks[i] + stripped = block.strip() + if ( + re.match(r"^#{2,3} \d+\.\s*$", stripped) + and i + 1 < len(blocks) + ): + nxt = blocks[i + 1].strip() + if ( + nxt + and "\n" not in nxt + and len(nxt) <= 80 + and not nxt.startswith("#") + and not re.match(r"^\d+[\.\)]\s", nxt) + ): + result.append(stripped.rstrip() + " " + nxt) + count += 1 + i += 2 + continue + result.append(block) + i += 1 + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count + + +def _extract_article_headers(text: str) -> tuple[str, int]: + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + rest = m.group(2).strip() + + title_m = re.match( + r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+" + r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})", + rest, + ) + if title_m: + count += 1 + return ( + f"### Art. {num}. {title_m.group(1)}.\n\n" + f"{title_m.group(2).strip()}" + ) + if rest: + count += 1 + return f"### Art. {num}.\n\n{rest}" + count += 1 + return f"### Art. {num}." 
+ + text = re.sub( + r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)", + _repl, + text, + flags=re.MULTILINE, + ) + return text, count diff --git a/conversione/_pipeline/transforms/_structure.py b/conversione/_pipeline/transforms/_structure.py new file mode 100644 index 0000000..853c8bb --- /dev/null +++ b/conversione/_pipeline/transforms/_structure.py @@ -0,0 +1,184 @@ +"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli.""" +import re + +from ._constants import ( + _TOC_KEYWORDS, _BIB_MARKERS_RE, + _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE, +) +from ._helpers import ( + _is_allcaps_line, _allcaps_to_header, + _extract_math_environments, _extract_article_headers, +) + + +def _t_remove_toc(text: str) -> tuple[str, int]: + lines = text.split("\n") + new_lines = [] + _in_toc = False + removed = False + for line in lines: + bare = re.sub(r"^#+\s*", "", line.strip()) + first_word = bare.split(".")[0].strip().lower() + if first_word in _TOC_KEYWORDS: + removed = True + _in_toc = True + continue + if _in_toc: + if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): + continue + if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): + continue + if len(line.strip()) > 200: + _in_toc = False + new_lines.append(line) + continue + _in_toc = False + new_lines.append(line) + return "\n".join(new_lines), 1 if removed else 0 + + +def _t_remove_orphan_toc(text: str) -> tuple[str, int]: + """ + Rimuove voci di sommario senza dot-leader che sfuggono a _t_remove_toc. + Rileva: (a) blocchi di 3+ righe consecutive che matchano il pattern TOC + nei primi 25% del documento; (b) header ### N. Titolo PAGINA il cui corpo + è una lista di voci numerate. 
+ """ + blocks = re.split(r"\n{2,}", text) + total = len(blocks) + cutoff = max(10, min(40, int(total * 0.25))) + to_drop = set() + + i = 0 + while i < cutoff and i < total: + b = blocks[i].strip() + + # (a) Sequenza di 3+ blocchi TOC consecutivi + if _TOC_ITEM_RE.match(b): + j = i + while j < min(cutoff, i + 60) and j < len(blocks) and _TOC_ITEM_RE.match(blocks[j].strip()): + j += 1 + if j - i >= 3: + for k in range(i, j): + to_drop.add(k) + # Rimuovi anche l'header ### precedente se ha numero di pagina + if i > 0 and _TOC_HDR_WITH_PAGE_RE.match(blocks[i - 1].strip()): + to_drop.add(i - 1) + i = j + continue + + # (b) Header ### N. Titolo PAGINA con corpo che è lista di voci numerate + if _TOC_HDR_WITH_PAGE_RE.match(b): + body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" + # Il corpo contiene 2+ occorrenze di "N. Titolo" + toc_hits = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", body) + if len(toc_hits) >= 2 and len(body) < 300: + to_drop.add(i) + if i + 1 < total: + to_drop.add(i + 1) + i += 2 + continue + + i += 1 + + if not to_drop: + return text, 0 + + kept = [b for idx, b in enumerate(blocks) if idx not in to_drop] + return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), len(to_drop) + + +def _t_allcaps_to_headers(text: str) -> tuple[str, int]: + count = 0 + blocks = text.split("\n\n") + new_blocks = [] + for block in blocks: + stripped = block.strip() + if "\n" not in stripped and _is_allcaps_line(stripped): + new_blocks.append(_allcaps_to_header(stripped)) + count += 1 + else: + sub_lines = block.split("\n") + converted = [] + for ln in sub_lines: + if _is_allcaps_line(ln) and len(ln.strip()) > 3: + converted.append(_allcaps_to_header(ln)) + count += 1 + else: + converted.append(ln) + new_blocks.append("\n".join(converted)) + return "\n\n".join(new_blocks), count + + +def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: + count = 0 + + def _num_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if 
content.endswith(".") and len(content) > 40: + return m.group(0) + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"### {m.group(1)}.\n\n{content}" + + text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) + + def _num_letter_repl(m: re.Match) -> str: + nonlocal count + count += 1 + return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" + + text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) + + if not has_exercises: + def _aphorism_repl(m: re.Match) -> str: + nonlocal count + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + return f"\n\n### {m.group(1)}.\n\n{content}" + + text = re.sub( + r"^-\s+(\d{1,3})\.\s+(.{10,})$", + _aphorism_repl, + text, + flags=re.MULTILINE, + ) + + def _list_section_repl(m: re.Match) -> str: + nonlocal count + num = m.group(1) + content = m.group(2).strip() + if _BIB_MARKERS_RE.search(content): + return m.group(0) + count += 1 + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + content, + ) + if split and split.start() >= 3: + title = content[: split.start()].strip() + body = content[split.end():].strip() + if len(body) >= 20: + return f"\n\n### {num}. {title}\n\n{body}" + return f"\n\n### {num}. 
{content}" + + text = re.sub( + r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", + _list_section_repl, + text, + flags=re.MULTILINE, + ) + return text, count + + +def _t_extract_math(text: str) -> tuple[str, int]: + return _extract_math_environments(text) + + +def _t_extract_articles(text: str) -> tuple[str, int]: + return _extract_article_headers(text) diff --git a/conversione/_pipeline/transforms/_text.py b/conversione/_pipeline/transforms/_text.py new file mode 100644 index 0000000..dfa6105 --- /dev/null +++ b/conversione/_pipeline/transforms/_text.py @@ -0,0 +1,109 @@ +"""Trasformazioni sul testo: merge paragrafi, whitespace, poesia, versi.""" +import re + +from ._constants import _VERSE_NUM_RE + + +def _t_merge_paragraphs(text: str) -> tuple[str, int]: + _SENTENCE_END = set(".?!\xbb)\"'") + blocks = text.split("\n\n") + merged = [] + count = 0 + i = 0 + while i < len(blocks): + b = blocks[i] + stripped = b.strip() + while ( + i + 1 < len(blocks) + and stripped + and not stripped.startswith("#") + and not stripped.startswith("|") + and stripped[-1] not in _SENTENCE_END + ): + nxt = blocks[i + 1].strip() + if ( + not nxt + or nxt.startswith("#") + or nxt.startswith("|") + or re.match(r"^\d+\.", nxt) + or re.match(r"^[-*+]\s", nxt) + ): + break + b = stripped + " " + nxt + stripped = b.strip() + count += 1 + i += 1 + merged.append(b) + i += 1 + text = "\n\n".join(merged) + text = re.sub(r"(?m)^\|---\|\s*", "", text) + return text, count + + +def _t_normalize_whitespace(text: str) -> tuple[str, int]: + lines = text.split("\n") + text = "\n".join( + re.sub(r" +", " ", line) if line.strip() else line + for line in lines + ) + return text, 0 + + +def _t_collapse_blank_lines(text: str) -> tuple[str, int]: + return re.sub(r"\n{3,}", "\n\n", text), 0 + + +def _t_restore_poetry_lines(text: str) -> tuple[str, int]: + count = 0 + blocks = text.split("\n\n") + result = [] + + for block in blocks: + stripped = block.strip() + if not stripped or 
stripped.startswith("#"): + result.append(block) + continue + + matches = list(_VERSE_NUM_RE.finditer(stripped)) + if len(matches) < 2: + result.append(block) + continue + + nums = [int(m.group(2)) for m in matches] + diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)] + if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5): + result.append(block) + continue + + step = diffs[0] + + def _replace_verse_num(m: re.Match) -> str: + n = int(m.group(2)) + sep = "\n\n" if n % (step * 3) == 0 else "\n" + return m.group(1).rstrip() + sep + + new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped) + if new_block != stripped: + count += len(matches) + result.append(new_block) + + return "\n\n".join(result), count + + +def _t_demote_verse_headers(text: str) -> tuple[str, int]: + count = 0 + + def _demote(m: re.Match) -> str: + nonlocal count + hashes, content = m.group(1), m.group(2).strip() + if not re.search(r"\s\d{1,4}\s*$", content): + return m.group(0) + inner = re.sub(r"\s\d{1,4}\s*$", "", content) + if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab""]', inner): + return m.group(0) + count += 1 + clean = re.sub(r"\s\d{1,4}\s*$", "", content) + return clean + + text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE) + return text, count diff --git a/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md b/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md new file mode 100644 index 0000000..91694f9 --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-pipeline-ottimizzazione.md @@ -0,0 +1,560 @@ +# Pipeline ottimizzazione PDF→Markdown — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Eliminare la necessità di revisione manuale del `clean.md` ottimizzando i parametri di opendataloader-pdf e aggiungendo trasformazioni mirate per tutti i tipi di PDF. + +**Architecture:** Quattro file modificati: `converter.py` (parametri adattivi + rilevamento PDF taggato), `transforms.py` (PUA bracket TeX + demozione header-formula), `report.py` (nuova metrica residua), `validator.py` (nuova penalità). Nessun cambio all'API pubblica di `_pipeline`. + +**Tech Stack:** Python 3.12, opendataloader-pdf (Java), PyMuPDF (fitz), regex + +--- + +## File modificati + +| File | Tipo | Responsabilità | +|------|------|----------------| +| `conversione/_pipeline/converter.py` | Modify | `_is_tagged_pdf()` + nuovi parametri convert | +| `conversione/_pipeline/transforms.py` | Modify | PUA bracket TeX + `_t_math_header_demotion` | +| `conversione/_pipeline/report.py` | Modify | `formula_headers_residui` nella sezione residui | +| `conversione/_pipeline/validator.py` | Modify | Penalità formula headers | + +--- + +## Task 1: Converter adattivo — `_is_tagged_pdf()` + nuovi parametri + +**Files:** +- Modify: `conversione/_pipeline/converter.py` + +- [ ] **Step 1: Leggi il file attuale** + +```bash +cat conversione/_pipeline/converter.py +``` + +- [ ] **Step 2: Sostituisci interamente il contenuto** + +Il nuovo `converter.py` aggiunge `_is_tagged_pdf()` (usa fitz per controllare `StructTreeRoot` nel catalog del PDF) e passa i nuovi parametri a `opendataloader_pdf.convert()`: +- `table_method="cluster"` — sempre attivo, migliora tabelle senza bordi +- `content_safety_off=["tiny", "hidden-ocg"]` — evita filtraggio di footnote e layer OCG +- `use_struct_tree=tagged` — attivo solo se PDF è taggato + +```python +from pathlib import Path + + +def _is_tagged_pdf(pdf_path: Path) -> bool: + try: + import fitz + doc = fitz.open(str(pdf_path)) + tagged = doc.xref_get_key(doc.pdf_catalog(), "StructTreeRoot")[0] != "null" + doc.close() + return tagged + except Exception: + return False + + +def
convert_pdf(pdf_path: Path, out_dir: Path) -> Path: + """ + Converte il PDF in Markdown tramite opendataloader-pdf. + Scrive il file nella out_dir e restituisce il percorso. + + Parametri scelti per output RAG-ottimale: + - keep_line_breaks=False → testo fluente, no hard-wrap PDF + - reading_order="xycut" → corregge ordine multi-colonna (XY-Cut++) + - sanitize=False → preserva il testo originale + - image_output="off" → nessuna immagine estratta né referenziata + - table_method="cluster" → rileva tabelle senza bordi visibili + - content_safety_off → evita filtraggio di footnote e layer OCG + - use_struct_tree → attivo se PDF è taggato (Word/InDesign) + """ + import opendataloader_pdf + + out_dir.mkdir(parents=True, exist_ok=True) + tagged = _is_tagged_pdf(pdf_path) + + opendataloader_pdf.convert( + input_path=str(pdf_path), + output_dir=str(out_dir), + format="markdown", + keep_line_breaks=False, + reading_order="xycut", + sanitize=False, + image_output="off", + table_method="cluster", + content_safety_off=["tiny", "hidden-ocg"], + use_struct_tree=tagged, + quiet=True, + ) + + md_file = out_dir / f"{pdf_path.stem}.md" + if not md_file.exists(): + candidates = list(out_dir.glob("*.md")) + if not candidates: + raise RuntimeError(f"Nessun file .md prodotto in {out_dir}") + md_file = candidates[0] + + content = md_file.read_text(encoding="utf-8", errors="replace").strip() + if len(content) < 100: + raise RuntimeError( + f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) " + f"— il PDF potrebbe essere corrotto o non supportato" + ) + + return md_file +``` + +- [ ] **Step 3: Verifica sintattica** + +```bash +.venv/bin/python -c "from conversione._pipeline.converter import convert_pdf, _is_tagged_pdf; print('OK')" +``` + +Atteso: `OK` + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/converter.py +git commit -m "feat(converter): parametri adattivi — use_struct_tree, cluster tables, content-safety" +``` + +--- + +## Task 2: 
Aggiunta PUA bracket TeX (U+F8EB–U+F8FE) + +**Files:** +- Modify: `conversione/_pipeline/transforms.py` (sezione `_SYMBOL_PUA_MAP`, righe ~28–127) + +Questi codepoint sono pezzi di parentesi/bracket grandi del font Computer Modern (TeX), non ricostruibili come singolo simbolo → mappati a `""`. + +- [ ] **Step 1: Aggiungi le entries mancanti alla fine di `_SYMBOL_PUA_MAP`** + +Individua la riga `"": "", # bracket extension piece (non ricostruibile)` (circa riga 122) e aggiungi **dopo** l'ultima entry esistente della mappa (prima della `}`): + +```python + "": "", # TeX large paren left + "": "", # TeX large paren extension + "": "", # TeX large paren right + "": "", # TeX large paren right extension + "": "", # TeX large bracket left + "": "", # TeX large bracket extension + "": "", # TeX brace top-left + "": "", # TeX brace mid + "": "", # TeX brace mid-right + "": "", # TeX brace extension + "": "", # TeX brace right + "": "", # TeX bracket right large + "": "", # TeX bracket right extension + "": "", # TeX bracket right close + "": "", # TeX integral large + "": "", # TeX integral extension + "": "", # TeX integral top + "": "", # TeX radical top + "": "", # TeX radical extension + "": "", # TeX arrowhead +``` + +- [ ] **Step 2: Verifica che _SYMBOL_PUA_RE si aggiorni automaticamente** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE +pua_chars = ['', '', '', ''] +for c in pua_chars: + assert c in _SYMBOL_PUA_MAP, f'Manca {repr(c)}' + assert _SYMBOL_PUA_RE.search(c), f'Regex non cattura {repr(c)}' +print(f'OK — {len(_SYMBOL_PUA_MAP)} PUA chars mappati') +" +``` + +Atteso: `OK — N PUA chars mappati` (N > 90) + +- [ ] **Step 3: Verifica sostituzione su testo di esempio** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import apply_transforms +testo = 'Sia x = f(n) e n la parentesi grande.' 
+pulito, stats = apply_transforms(testo) +assert '' not in pulito +assert '' not in pulito +print('Testo pulito:', repr(pulito)) +print('PUA corretti:', stats['n_simboli_pua_corretti']) +" +``` + +Atteso: nessun PUA nel testo pulito, `n_simboli_pua_corretti` > 0. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/transforms.py +git commit -m "feat(transforms): aggiungi PUA bracket TeX U+F8EB-F8FE alla mappa simboli" +``` + +--- + +## Task 3: Nuova trasformazione `_t_math_header_demotion` + +**Files:** +- Modify: `conversione/_pipeline/transforms.py` + +Demota a testo semplice gli header `##`/`###` che sono enunciati di esercizi o formule lunghe (non titoli di sezione reali). + +**Criteri di demozione** (almeno uno tra math e exercise deve valere): +- Livello `##` o `###` +- Lunghezza testo (senza `#`) > 100 caratteri +- `math`: ≥ 3 simboli matematici nell'header (da set: `=`, `+`, `∈`, `∀`, `∃`, `≤`, `≥`, `∞`, `∑`, `∫`, `∂`, `→`, `↔`, `⊂`, `⊃`, `∩`, `∪`, lettere greche Unicode U+03B1–U+03C9 e U+0391–U+03A9) +- `exercise`: matcha pattern traccia (`\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that|Compute|Calculate|Dimostrare|Verificare)\b`) + +**Output**: rimuove `#+ `. Se la riga inizia con `N. ` (numero + punto), converte in `**N.** resto`. Altrimenti testo plain. 
+ +- [ ] **Step 1: Aggiungi costante regex a livello di modulo** (dopo le costanti esistenti, prima di `_SYMBOL_PUA_MAP`) + +Trova la riga `_VERSE_NUM_RE = re.compile(` (circa riga 160) e aggiungi **dopo**: + +```python +_MATH_SYMBOLS_RE = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" +) +_EXERCISE_TRIGGER_RE = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, +) +_MATH_HDR_RE = re.compile(r"^(#{2,3})\s+(.+)$") +_NUMBERED_PREFIX_RE = re.compile(r"^(\d+(?:\.\d+)*[.)])\s+(.+)$", re.DOTALL) +``` + +- [ ] **Step 2: Aggiungi la funzione `_t_math_header_demotion`** (prima dell'orchestratore `apply_transforms`) + +Trova la riga `# ─── Orchestratore` e aggiungi **prima**: + +```python +def _t_math_header_demotion(text: str) -> tuple[str, int]: + lines = text.split("\n") + result, count = [], 0 + for line in lines: + m = _MATH_HDR_RE.match(line) + if not m: + result.append(line) + continue + body = m.group(2) + if len(body) <= 100: + result.append(line) + continue + has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3 + has_exercise = bool(_EXERCISE_TRIGGER_RE.search(body)) + if not (has_math or has_exercise): + result.append(line) + continue + nm = _NUMBERED_PREFIX_RE.match(body) + if nm: + result.append(f"**{nm.group(1)}** {nm.group(2)}") + else: + result.append(body) + count += 1 + return "\n".join(result), count +``` + +- [ ] **Step 3: Registra la trasformazione in `_transforms`** + +Nell'orchestratore `apply_transforms`, trova la riga: + +```python + ("n_garbage_headers_rimossi", _t_remove_garbage_headers), +``` + +e aggiungi **dopo**: + +```python + ("n_formula_headers_demotati", _t_math_header_demotion), +``` + +- [ ] **Step 4: Aggiungi la stat key al print in `runner.py`** + +Trova in `conversione/_pipeline/runner.py` il blocco di print delle statistiche (dopo `apply_transforms`) e aggiungi: + +```python + print(f" 
Formula-hdr demotati: {t['n_formula_headers_demotati']}") +``` + +- [ ] **Step 5: Verifica su caso sintetico** + +```bash +.venv/bin/python -c " +from conversione._pipeline.transforms import apply_transforms + +# Caso 1: header esercizio lungo → deve essere demotato +testo = '### 3. Si dimostri la formula per le equazioni di secondo grado ax^2 + bx + c = 0 e si analizzi il segno del discriminante b^2 - 4ac per tutti i valori reali.' +pulito, stats = apply_transforms(testo) +assert '###' not in pulito, f'Header non demotato: {pulito!r}' +print('Caso 1 OK:', pulito[:80]) + +# Caso 2: header titolo corto → NON deve essere demotato +testo2 = '### Teorema di Cauchy' +pulito2, _ = apply_transforms(testo2) +assert '###' in pulito2, f'Header legittimo demotato: {pulito2!r}' +print('Caso 2 OK:', pulito2) + +# Caso 3: header con molti simboli math + lungo → demotato +testo3 = '### Sia f: R→R tale che ∀x∈R si abbia f(x) = ∑_{n=0}^{∞} aₙxⁿ con ∫f dx = g(x) + C per ogni x∈[a,b].' +pulito3, stats3 = apply_transforms(testo3) +print('Caso 3:', '###' not in pulito3, stats3.get('n_formula_headers_demotati')) + +print('Stats:', stats.get('n_formula_headers_demotati')) +" +``` + +Atteso: Caso 1 e 3 demotati, Caso 2 intatto. 
+ +- [ ] **Step 6: Commit** + +```bash +git add conversione/_pipeline/transforms.py conversione/_pipeline/runner.py +git commit -m "feat(transforms): aggiungi _t_math_header_demotion per header esercizi e formule" +``` + +--- + +## Task 4: `report.py` — metrica `formula_headers_residui` + +**Files:** +- Modify: `conversione/_pipeline/report.py` + +- [ ] **Step 1: Aggiungi funzione di scan formula-header e integrala nel report** + +Nella funzione `build_report()`, dopo la definizione di `_scan()` (circa riga 53), aggiungi: + +```python + def _scan_formula_headers(max_n: int = 10) -> list[dict]: + _math_sym = re.compile( + r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]" + ) + _ex_trigger = re.compile( + r"\b(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show that" + r"|Compute|Calculate|Dimostrare|Verificare)\b", + re.IGNORECASE, + ) + hits = [] + for i, line in enumerate(text_lines): + m = re.match(r"^(#{2,3})\s+(.+)$", line) + if not m: + continue + body = m.group(2) + if len(body) <= 100: + continue + has_math = len(_math_sym.findall(body)) >= 3 + has_ex = bool(_ex_trigger.search(body)) + if has_math or has_ex: + hits.append({"riga": i + 1, "testo": line.strip()[:120]}) + if len(hits) >= max_n: + break + return hits +``` + +- [ ] **Step 2: Aggiungi la metrica ai `residui`** + +Trova nel dict `residui` la riga: + +```python + "pua_markers": _scan(r'[-]'), +``` + +e aggiungi **dopo**: + +```python + "formula_headers": _scan_formula_headers(), +``` + +Poi nel dict principale `report["residui"]`, trova la riga: + +```python + "pua_markers_esempi": residui["pua_markers"], +``` + +e aggiungi **dopo**: + +```python + "formula_headers": len(residui["formula_headers"]), + "formula_headers_esempi": residui["formula_headers"], +``` + +- [ ] **Step 3: Verifica** + +```bash +.venv/bin/python -c " +import json +from pathlib import Path +from conversione._pipeline.report import build_report +from conversione._pipeline.transforms import 
apply_transforms + +testo = open('conversione/analisi1/raw.md').read() +clean, t = apply_transforms(testo) +from conversione._pipeline.structure import analyze + +tmp = Path('/tmp/test_report') +tmp.mkdir(exist_ok=True) +(tmp / 'clean.md').write_text(clean) +profile = analyze(tmp / 'clean.md') +rp = build_report('test', tmp, clean, t, profile, 5.0) +r = json.loads(rp.read_text()) +print('formula_headers residui:', r['residui']['formula_headers']) +print('formula_headers esempi:', len(r['residui']['formula_headers_esempi'])) +" +``` + +Atteso: count numerico (può essere 0 se la demozione ha funzionato bene), nessun errore. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/report.py +git commit -m "feat(report): aggiungi metrica formula_headers_residui" +``` + +--- + +## Task 5: `validator.py` — penalità formula headers + +**Files:** +- Modify: `conversione/_pipeline/validator.py` + +- [ ] **Step 1: Aggiungi la penalità in `_score()`** + +Trova in `_score()` la riga: + +```python + _pen("pua_markers", 2, 20, "caratteri PUA font Symbol") +``` + +e aggiungi **dopo**: + +```python + _pen("formula_headers", 3, 15, "formula/esercizio come header") +``` + +- [ ] **Step 2: Aggiungi colonna `fhdr` nell'output tabellare di `validate()`** + +Trova in `validate()` la riga che costruisce `header`: + +```python + header = ( + f"{'stem':<{col}}" + f"{'h2':>4}{'h3':>5} " + f"{'strategia':<18}" + f"{'bare':>5}{'corte':>6}{'lunghe':>7}" + f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}" + f"{'med':>6}" + f" {'voto':>4} grade" + ) +``` + +Sostituiscila con: + +```python + header = ( + f"{'stem':<{col}}" + f"{'h2':>4}{'h3':>5} " + f"{'strategia':<18}" + f"{'bare':>5}{'corte':>6}{'lunghe':>7}" + f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}{'fhdr':>5}" + f"{'med':>6}" + f" {'voto':>4} grade" + ) +``` + +Trova il `print(...)` dentro il loop `for r in rows:` e aggiungi `fhdr`: + +```python + print( + f"{r['stem']:<{col}}" + f"{st.get('n_h2', 0):>4}" + f"{st.get('n_h3', 0):>5} " 
+ f"{st.get('strategia_chunking','?'):<18}" + f"{an.get('bare_headers', 0):>5}" + f"{an.get('short_sections', 0):>6}" + f"{an.get('long_sections', 0):>7}" + f"{res.get('backtick', 0):>5}" + f"{res.get('br_inline', 0):>4}" + f"{res.get('simboli_encoding', 0):>4}" + f"{res.get('url', 0):>4}" + f"{res.get('formula_headers', 0):>5}" + f"{dist.get('mediana', 0):>6}" + f" {s:>4} {_grade(s)}" + ) +``` + +Aggiorna anche la riga finale `print("\nColonne: ...")`: + +```python + print( + "\nColonne: bare=header vuoti corte=sez<150ch lunghe=sez>1500ch " + "btk=backtick br=
inline enc=simboli encoding fhdr=formula-header med=mediana chars\n" + ) +``` + +- [ ] **Step 3: Verifica** + +```bash +.venv/bin/python -c " +from conversione._pipeline.validator import _score +r = {'structure': {'livello_struttura': 3}, 'anomalie': {}, 'residui': {'formula_headers': 5}} +score, detail = _score(r) +print(score, detail) +assert any('formula' in d for d in detail), 'Penalità formula non applicata' +print('OK') +" +``` + +Atteso: penalità `formula/esercizio come header ×5 −15` nel detail. + +- [ ] **Step 4: Commit** + +```bash +git add conversione/_pipeline/validator.py +git commit -m "feat(validator): aggiungi penalità formula_headers, colonna fhdr nel report" +``` + +--- + +## Task 6: Test di integrazione su analisi1 + +- [ ] **Step 1: Riesegui la pipeline su analisi1** + +```bash +.venv/bin/python conversione/ --stem analisi1 --force 2>&1 +``` + +Atteso: completamento senza errori, print `Formula-hdr demotati: N` visibile. + +- [ ] **Step 2: Valida e confronta con il report precedente** + +```bash +.venv/bin/python conversione/ validate analisi1 --detail +``` + +Confronta con il vecchio voto del `report.json` originale. Il voto deve essere ≥ al precedente. + +- [ ] **Step 3: Verifica riduzione PUA bracket** + +```bash +python3 -c " +import json +r = json.load(open('conversione/analisi1/report.json')) +pua = r['residui']['pua_markers'] +fhdr = r['residui'].get('formula_headers', 'N/A') +print(f'PUA residui: {pua} (era 10+ prima)') +print(f'Formula headers residui: {fhdr}') +" +``` + +Atteso: `pua_markers` ridotto rispetto al run precedente (era 10 nel report originale). 
+ +- [ ] **Step 4: Commit finale se tutto OK** + +```bash +git add conversione/analisi1/ +git commit -m "chore: rigenera output analisi1 con pipeline ottimizzata" +``` diff --git a/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md b/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md new file mode 100644 index 0000000..698a7cb --- /dev/null +++ b/docs/superpowers/specs/2026-04-30-pipeline-ottimizzazione-design.md @@ -0,0 +1,80 @@ +# Pipeline ottimizzazione — Design Spec +*2026-04-30* + +## Obiettivo +Eliminare la necessità di revisione manuale del `clean.md` per tutti i tipi di PDF (accademici/matematici, giuridici, tecnici) ottimizzando i parametri di opendataloader-pdf e aggiungendo trasformazioni mirate. + +## Scope +Nessun hybrid backend. Solo Java + trasformazioni Python. + +--- + +## 1. `converter.py` — Parametri adattivi + +### 1.1 Rilevamento PDF taggato +Funzione `_is_tagged_pdf(pdf_path) -> bool` usando PyMuPDF (`fitz`): +```python +doc = fitz.open(str(pdf_path)) +tagged = doc.xref_get_key(doc.pdf_catalog(), "StructTreeRoot")[0] != "null" +doc.close() +``` + +### 1.2 Nuovi parametri fissi (tutti i PDF) +- `table_method="cluster"` — tabelle senza bordi visibili +- `content_safety_off=["tiny", "hidden-ocg"]` — evita filtraggio di footnote e layer OCG + +### 1.3 Parametro condizionale +- `use_struct_tree=tagged` — attivo solo se il PDF è taggato + +Una sola conversione Java, zero overhead per PDF non taggati. + +--- + +## 2. `transforms.py` — Due aggiunte + +### 2.1 PUA bracket TeX (U+F8EB–F8FE) +Aggiunge al `_SYMBOL_PUA_MAP` i glifoni bracket di Computer Modern font che appaiono come PUA: +`U+F8EB, U+F8EC, U+F8ED, U+F8EE, U+F8EF, U+F8F0, U+F8F1, U+F8F2, U+F8F3, U+F8F4, U+F8F5, U+F8F6, U+F8F7, U+F8F8, U+F8F9, U+F8FA, U+F8FB, U+F8FC, U+F8FD, U+F8FE` +→ tutti mappati a `""` (pezzi di parentesi non ricostruibili come singolo glifo) + +Il `_SYMBOL_PUA_RE` si aggiorna automaticamente essendo costruito dalla mappa.
+ +### 2.2 Nuova trasformazione `_t_math_header_demotion` +Demota a testo semplice gli header `##`/`###` che sono in realtà enunciati di esercizi o formule lunghe. + +**Criteri di demozione** (tutti devono valere): +- Livello `##` o `###` +- Lunghezza testo > 100 caratteri +- Almeno uno tra: + - ≥ 3 simboli matematici (`=`, `+`, `∈`, `∀`, `∃`, `≤`, `≥`, `∞`, lettere greche Unicode, `lim`, `sup`, `inf`, `∑`, `∫`) + - Matcha pattern traccia esercizio: `(Si dimostri|Si calcoli|Si provi|Si trovi|Trovare|Find|Prove|Show|Compute|Calculate)\b` + +**Output**: rimuove `#+ ` iniziale. Se numerata (`N. testo`), converte in `**N.** testo`. Altrimenti testo plain. + +**Posizione in `_transforms`**: gruppo "Rifinitura", dopo `_t_remove_garbage_headers`. + +**Stat key**: `n_formula_headers_demotati` + +--- + +## 3. `report.py` — Nuova metrica residua + +`build_report()` aggiunge contatore `formula_headers_residui`: +- Conta header `##`/`###` nel `clean.md` finale che superano ancora i criteri math (sopra) +- Mostra fino a 3 esempi in `formula_headers_esempi` + +--- + +## 4. `validator.py` — Nuova penalità + +| Problema | Penalità | Cap | +|----------|----------|-----| +| Formula/esercizio come header residuo | −3/cad | −15 | + +--- + +## File modificati +1. `conversione/_pipeline/converter.py` — `_is_tagged_pdf()` + nuovi parametri +2. `conversione/_pipeline/transforms.py` — PUA map + `_t_math_header_demotion` +3. `conversione/_pipeline/report.py` — `formula_headers_residui` +4. `conversione/_pipeline/validator.py` — nuova penalità