Files
rag-from-scratch/conversione/_pipeline/_constants.py
T
davide ebd2a43f84 feat: integra pipeline PDF→Markdown a 9 stadi e test suite
Porta da main la riscrittura completa di conversione/_pipeline/ (9 stadi
PyMuPDF) e la suite tests/ senza modificare chunks/, step-8/, rag.py,
ollama/, retrieve.py, config.py.

requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb,
rimuove opendataloader-pdf e pymupdf4llm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 14:46:16 +02:00

170 lines
6.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Costanti di modulo condivise tra i moduli di trasformazione.
Tutte le regex compilate e le mappe statiche vivono qui.
"""
import re
# ─── Keyword sets ─────────────────────────────────────────────────────────────
# Lowercase spellings of "table of contents"-style headings in several
# languages (Italian, English, German, French, Spanish, Russian).
_TOC_KEYWORDS = frozenset({
    "indice",
    "index",
    "contents",
    "table of contents",
    "sommario",
    "inhaltsverzeichnis",
    "inhalt",
    "indice generale",
    "indice analitico",
    "indice dei contenuti",
    "elenco dei capitoli",
    "argomenti",
    "table des matières",
    "tabla de contenidos",
    "содержание",
})
# Upper-case Italian ordinal words (first through tenth) → Roman numerals.
_ORDINALS_IT: dict[str, str] = {
    "PRIMO": "I",
    "SECONDO": "II",
    "TERZO": "III",
    "QUARTO": "IV",
    "QUINTO": "V",
    "SESTO": "VI",
    "SETTIMO": "VII",
    "OTTAVO": "VIII",
    "NONO": "IX",
    "DECIMO": "X",
}
# Upper-case English number words (ONE through TEN) → decimal digit strings.
_ORDINALS_EN: dict[str, str] = {
    "ONE": "1",
    "TWO": "2",
    "THREE": "3",
    "FOUR": "4",
    "FIVE": "5",
    "SIX": "6",
    "SEVEN": "7",
    "EIGHT": "8",
    "NINE": "9",
    "TEN": "10",
}
# ─── PUA Symbol font map ──────────────────────────────────────────────────────
# Replacement table: each key is the source text to look for and each value is
# the text that replaces it (ASCII punctuation/digits, Greek letters, a few
# math operators, or the empty string to drop the glyph entirely).
# NOTE(review): every key below renders as an empty string in this copy of the
# file. Presumably each key is a single private-use code point that was lost
# in extraction — confirm against the repository version. If the keys really
# are empty, the character class built from them for _SYMBOL_PUA_RE would be
# "[]", which `re.compile` rejects.
_SYMBOL_PUA_MAP: dict[str, str] = {
"": " ",
"": "(",
"": ")",
"": "+",
"": "",
"": ".",
"": "/",
"": "0", "": "1", "": "2", "": "3", "": "4",
"": "5", "": "6", "": "7", "": "8", "": "9",
"": ":", "": ";", "": "<", "": "=", "": ">",
"": "",
"": "Α", "": "Β", "": "Χ", "": "Δ", "": "Ε",
"": "Φ", "": "Γ", "": "Η", "": "Ι", "": "ϑ",
"": "Κ", "": "Λ", "": "Μ", "": "Ν", "": "Ο",
"": "Π", "": "Θ", "": "Ρ", "": "Σ", "": "Τ",
"": "Υ", "": "ς", "": "Ω", "": "Ξ", "": "Ψ",
"": "Ζ",
"": "[",
"": "",
"": "]",
"": "",
"": "α", "": "β", "": "χ", "": "δ", "": "ε",
"": "φ", "": "γ", "": "η", "": "ι", "": "ϕ",
"": "κ", "": "λ", "": "μ", "": "ν", "": "ο",
"": "π", "": "θ", "": "ρ", "": "σ", "": "τ",
"": "υ", "": "ϖ", "": "ω", "": "ξ", "": "ψ",
"": "ζ",
"": "{",
"": "|",
"": "}",
"": "~",
"": "±",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "×",
"": "÷",
"": "×",
"": "",
"": "",
"": "",
"": "",
"": "*",
"": ",",
"": "",
"": "",
"": "",
"": "",
"": "÷",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
# TeX Computer Modern bracket/delimiter pieces (presumably U+F8EB–U+F8FE) → empty string
"": "", # TeX large paren left
"": "", # TeX large paren extension
"": "", # TeX large paren right
"": "", # TeX large paren right ext
"": "", # TeX large bracket left
"": "", # TeX large bracket ext
"": "", # TeX brace top-left
"": "", # TeX brace mid
"": "", # TeX brace mid-right
"": "", # TeX brace extension
"": "", # TeX brace right
"": "", # TeX bracket right large
"": "", # TeX bracket right ext
"": "", # TeX bracket right close
"": "", # TeX integral large
"": "", # TeX integral extension
"": "", # TeX integral top
"": "", # TeX radical top
"": "", # TeX radical extension
"": "", # TeX arrowhead
}
# Character class built from the regex-escaped keys of _SYMBOL_PUA_MAP, in the
# map's iteration order.
_SYMBOL_PUA_RE = re.compile(
    "[{}]".format("".join(map(re.escape, _SYMBOL_PUA_MAP)))
)
# ─── Shared compiled regexes ──────────────────────────────────────────────────
# A run of one or more Unicode superscript digits (¹ ² ³ ⁰ and ⁴ through ⁹).
_SUPERSCRIPT_RE = re.compile(r"[¹²³⁰⁴-⁹]+")
# Start-of-string footnote marker: either a superscript-digit run or a
# bracketed 1-3 digit number, in both cases followed by whitespace (group 1
# captures the marker together with that whitespace).
_FOOTNOTE_BODY_RE = re.compile(r"^([¹²³⁰⁴-⁹]+\s+|\[\d{1,3}\]\s+)")
# Markdown heading whose text starts with a dotted section number ending in a
# period, e.g. "## 1.2. Title". Groups: (1) the hashes, (2) the number without
# its trailing period, (3) the remaining title text. Matches per line.
_NUMBERED_HDR_RE = re.compile(
    r"^(#{1,6})\s+"
    r"(\d+(?:\.\d+)*)\.\s+"
    r"(.+)$",
    flags=re.MULTILINE,
)
# Bibliographic markers (case-insensitive). Three capturing groups, one per
# alternative family:
#   1. abbreviations/identifiers: p./pp., vol., "n." + digit, ed., edn.,
#      ISBN, DOI, arXiv;
#   2. the century prefix (19 or 20) of a standalone four-digit year;
#   3. back-reference formulas: ibidem, ibid(.), op. cit(.), cit., cfr.,
#      "ivi" followed by a comma, semicolon or whitespace.
# A closing \b is applied only to alternatives that end in a letter. After a
# trailing ".", digit, "," or ";" a \b holds only when a word character
# follows, which would wrongly reject ordinary text such as "pp. 12",
# "vol. 3", "n. 12", "cfr. Rossi" or "ivi, p. 5".
_BIB_MARKERS_RE = re.compile(
    r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|(?:ISBN|DOI|arXiv)\b)'
    r'|\b(19|20)\d{2}\b'
    # "ibidem" is listed before "ibid" so the longer word is captured whole.
    r'|\b(ibidem\b|ibid\b\.?|op\.\s*cit\b\.?|cit\.|cfr\.|ivi[,;\s])',
    re.IGNORECASE,
)
# Academic author pattern: one upper-case initial (not preceded by another
# upper-case ASCII letter), a period, whitespace, then a surname of at least
# three upper-case ASCII letters (e.g. "A. SMITH").
_FOOTNOTE_AUTHOR_RE = re.compile(
    r"(?<![A-Z])[A-Z]\.\s+[A-Z]{3,}"
)
# A line consisting solely of a watermark word or phrase (Italian or English),
# optionally followed by whitespace. Case-insensitive, matched per line;
# group 1 is the watermark text.
_WATERMARK_RE = re.compile(
    r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO"
    r"|PROVVISORIO|SAMPLE|SPECIMEN"
    r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE"
    r"|COPY|COPIA)\s*$",
    flags=re.IGNORECASE | re.MULTILINE,
)
# A line that is only an empty table row ("| |") or a lone "|---" / "|---|"
# separator, with optional trailing whitespace. Matched per line.
_TABSEP_RE = re.compile(
    r"(?m)^\|\s*\|\s*$"
    r"|^\|---\|?\s*$"
)
# A whole line containing a dot leader: at least three ". " pairs or at least
# four consecutive dots.
_DOTLEADER_RE = re.compile(
    r"^[^\n]*"
    r"(?:(?:\. ){3,}|\.{4,})"
    r"[^\n]*$",
    flags=re.MULTILINE,
)
# Case-insensitive cues: URLs, "www.", "@" followed by a letter, and a handful
# of Italian institutional / copyright words and phrases.
_FM_RE = re.compile(
    r"https?://|www\.|@[A-Za-z]"
    r"|\bUniversit[àa]\b|\bDipartimento\b"
    r"|\bCopyright\b|\bLicenza\b|\bEdizione\b"
    r"|protetto da|tutti i diritti",
    flags=re.IGNORECASE,
)
# A number sitting between a sentence-ending mark and the start of the next
# sentence. Groups: (1) one terminator character from the class
# (. ! ? » ' " plus whatever the middle literal contributes) and the
# whitespace after it, (2) the digits, (3) the whitespace after the digits.
# The lookahead requires the next character to be an ASCII letter, a letter in
# \xc0-\xd9 or \xe0-\xf9, or one of « “ ” ‟.
# NOTE(review): the middle "" literal is empty in this copy; it presumably
# held extra quote characters that were lost in extraction — confirm against
# the repository version.
_VERSE_NUM_RE = re.compile(
r"([.!?\xbb'\"" + "" + r"]\s+)(\d+)(\s+)(?=[A-Z\xc0-\xd9a-z\xe0-\xf9\xab“”‟])"
)
# Math header demotion
# Any single math-flavoured character: "=", "+", a set of logic / relation /
# calculus / arrow / set operators, or a Greek letter (lower or upper case).
_MATH_SYMBOLS_RE = re.compile(
    r"[=+∈∀∃≤≥∞∑∫∂→↔⊂⊃∩∪"
    r"αβγδεζηθικλμνξοπρστυφχψω"
    r"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]"
)
# Whole-word, case-insensitive exercise prompts in Italian and English
# ("Si dimostri", "Find", "Show that", ...); group 1 is the matched prompt.
_EXERCISE_TRIGGER_RE = re.compile(
    r"\b(Si dimostri|Si calcoli|Si provi|Si trovi"
    r"|Trovare|Find|Prove|Show that"
    r"|Compute|Calculate"
    r"|Dimostrare|Verificare)\b",
    flags=re.IGNORECASE,
)
# Level-2 or level-3 Markdown heading; groups: (1) the hashes, (2) the text.
# No MULTILINE flag, so it anchors on the whole string.
_MATH_HDR_RE = re.compile(
    r"^(#{2,3})\s+(.+)$"
)
# Text that begins with a dotted number closed by "." or ")"; groups: (1) the
# numeric prefix including its closer, (2) everything after the whitespace
# (DOTALL, so group 2 may span several lines).
_NUMBERED_PREFIX_RE = re.compile(
    r"^(\d+(?:\.\d+)*[.)])\s+(.+)$",
    flags=re.DOTALL,
)
# Orphan TOC: a table-of-contents entry without a dot leader
# (e.g. "3. Funzioni 174"): a dotted number, optional period, whitespace, a
# leading letter / apostrophe / "(", then 2 to 70 further characters on the
# same line.
_TOC_ITEM_RE = re.compile(
    r"^\d+(\.\d+)*\.?\s+"
    r"[A-Za-zÀ-ú\'\(]"
    r"[^\n]{2,70}$"
)
# Level 1-3 Markdown heading that starts with a number (optional period),
# carries 3 to 60 characters of text, and ends in a 1-4 digit number.
_TOC_HDR_WITH_PAGE_RE = re.compile(
    r"^#{1,3}\s+"
    r"\d+\.?\s+"
    r".{3,60}"
    r"\s+\d{1,4}$"
)
# PDF artifacts: page markers and separators
# A line holding only an HTML comment of the form "<!-- page: N -->".
_PAGE_MARKER_RE = re.compile(
    r"(?m)^<!-- page: \d+ -->\s*$"
)
# A line holding only a 1-3 digit number, optionally prefixed by "- ".
_STANDALONE_NUM_RE = re.compile(
    r"(?m)^(?:- )?\d{1,3}$"
)
# A line of four or more underscores, optionally followed by whitespace.
_UNDERSCORE_SEP_RE = re.compile(
    r"(?m)^_{4,}\s*$"
)