From 3f4689e8fd407dc70bdb658f29b2ff4db40a4bda Mon Sep 17 00:00:00 2001 From: Davide Grilli Date: Thu, 7 May 2026 16:12:50 +0200 Subject: [PATCH] feat: rileva note bibliografiche e raccolte multi-articolo in pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Risolve la conversione errata di note a piè di pagina accademiche in header Markdown nei testi giuridici (es. dirittopubblico: da 424 h2 errati → 27 h2 semanticamente corretti). - _BIB_MARKERS_RE: aggiunge ibid., cfr., op. cit., cit., ivi - _FOOTNOTE_AUTHOR_RE: nuovo pattern per "A. COGNOME" (es. G. GUZZETTA) - _num_repl / _aphorism_repl / _list_section_repl: usano entrambi i guard per non convertire note bibliografiche in sezioni - _t_promote_chapter_headers: usa max-count ≥ 3 per distinguere raccolte multi-articolo (non promuovere) da libri con capitoli sequenziali (promuovere); preserva il comportamento corretto su anatomia - _t_remove_page_markers / _t_remove_page_numbers / _t_remove_separators: nuove transform per page marker PDF, numeri isolati, separatori underscore Co-Authored-By: Claude Sonnet 4.6 --- conversione/_pipeline/_apply.py | 9 +++- conversione/_pipeline/_artifacts.py | 25 +++++++++ conversione/_pipeline/_constants.py | 9 +++- conversione/_pipeline/_structure.py | 78 ++++++++++++++++++++++++++--- 4 files changed, 112 insertions(+), 9 deletions(-) diff --git a/conversione/_pipeline/_apply.py b/conversione/_pipeline/_apply.py index 9bf8f21..79d75e3 100644 --- a/conversione/_pipeline/_apply.py +++ b/conversione/_pipeline/_apply.py @@ -10,6 +10,7 @@ from ._artifacts import ( _t_remove_images, _t_fix_br, _t_fix_tabsep, _t_remove_footnotes, _t_remove_formula_labels, _t_remove_dotleaders, _t_remove_recurring_lines, _t_fix_math_symbols, _t_remove_watermarks, _t_remove_urls, + _t_remove_page_markers, _t_remove_page_numbers, _t_remove_separators, ) from ._headers import ( _t_fix_header_concat, _t_extract_capitolo, @@ -18,7 +19,8 @@ from ._headers import ( ) from ._structure import ( _t_remove_toc, _t_remove_orphan_toc, _t_allcaps_to_headers, - _t_numbered_sections, _t_extract_math, _t_extract_articles, + _t_numbered_sections, _t_promote_chapter_headers, + _t_extract_math, _t_extract_articles, ) from ._text import ( _t_merge_paragraphs, _t_normalize_whitespace, _t_collapse_blank_lines, @@ -50,6 +52,8 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]: ("n_moltiplicazioni_corrette", _t_fix_multiplication, "simbolo moltiplicazione"), ("n_micro_corretti", _t_fix_micro, "simbolo micro SI"), # 2. Pulizia artefatti + ("n_page_markers_rimossi", _t_remove_page_markers, "rimozione page markers PDF"), + ("n_separatori_rimossi", _t_remove_separators, "rimozione separatori underscore"), ("n_immagini_rimosse", _t_remove_images, "rimozione immagini"), ("n_br_rimossi", _t_fix_br, "fix
inline"), ("n_tabsep_rimossi", _t_fix_tabsep, "fix separatori tabella"), @@ -58,6 +62,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]: ("n_formule_rimossi", _t_remove_formula_labels, "rimozione label formula"), ("n_dotleader_rimossi", _t_remove_dotleaders, "rimozione dot-leader TOC"), ("n_righe_ricorrenti_rimosse", _t_remove_recurring_lines, "rimozione righe ricorrenti"), + ("n_numeri_pagina_rimossi", _t_remove_page_numbers, "rimozione numeri pagina isolati"), # 3. Struttura header ("n_header_concat_fixati", _t_fix_header_concat, "fix header+corpo concatenati"), (None, _t_extract_capitolo, "estrazione Capitolo inline"), @@ -70,6 +75,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]: ("n_toc_orfani_rimossi", _t_remove_orphan_toc, "rimozione voci TOC orfane"), ("n_header_allcaps", _t_allcaps_to_headers, "ALL-CAPS → ##"), ("n_sezioni_numerate", partial(_t_numbered_sections, has_exercises=_has_ex), "sezioni numerate → ###"), + ("n_capitoli_promossi", _t_promote_chapter_headers, "promozione capitoli ### → ##"), ("n_ambienti_matematici", _t_extract_math, "estrazione ambienti matematici"), ("n_articoli_estratti", _t_extract_articles, "estrazione articoli → ###"), # 5. Testo @@ -80,6 +86,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]: ("n_header_verso_demotati", _t_demote_verse_headers, "demozione header-verso"), ("n_url_rimossi", _t_remove_urls, "rimozione URL"), # 6. Rifinitura + (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s+pag\.\s*\d{1,4}\s*$", r"\1", t), 0), "strip pag.N dagli header"), (None, _t_remove_empty_headers, "rimozione header vuoti"), ("n_titoli_uniti", _t_merge_title_headers, "merge titoli isolati"), (None, lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0), "fix header|pagina"), diff --git a/conversione/_pipeline/_artifacts.py b/conversione/_pipeline/_artifacts.py index a5333b1..275081b 100644 --- a/conversione/_pipeline/_artifacts.py +++ b/conversione/_pipeline/_artifacts.py @@ -4,6 +4,7 @@ from collections import Counter from ._constants import ( _WATERMARK_RE, _TABSEP_RE, _SUPERSCRIPT_RE, _FOOTNOTE_BODY_RE, _DOTLEADER_RE, + _PAGE_MARKER_RE, _STANDALONE_NUM_RE, _UNDERSCORE_SEP_RE, ) @@ -100,3 +101,27 @@ def _t_remove_urls(text: str) -> tuple[str, int]: n = len(re.findall(r"(?m)^(https?://|www\.)\S+\s*$", text)) text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text) return text, n + + +def _t_remove_page_markers(text: str) -> tuple[str, int]: + """Rimuove i marcatori e i separatori --- adiacenti.""" + n = len(_PAGE_MARKER_RE.findall(text)) + # Rimuovi ---\n come blocco unico (separatori di pagina PDF) + text = re.sub(r"(?m)^---\s*\n\s*\n?", "", text) + # Rimuovi eventuali rimasti senza --- + text = _PAGE_MARKER_RE.sub("", text) + return text, n + + +def _t_remove_page_numbers(text: str) -> tuple[str, int]: + """Rimuove numeri di pagina isolati (1-3 cifre su una riga solitaria).""" + n = len(_STANDALONE_NUM_RE.findall(text)) + text = _STANDALONE_NUM_RE.sub("", text) + return text, n + + +def _t_remove_separators(text: str) -> tuple[str, int]: + """Rimuove linee di separazione formate solo da underscore (___...).""" + n = len(_UNDERSCORE_SEP_RE.findall(text)) + text = _UNDERSCORE_SEP_RE.sub("", text) + return text, n diff --git a/conversione/_pipeline/_constants.py b/conversione/_pipeline/_constants.py index baa25a0..b9847ba 100644 --- a/conversione/_pipeline/_constants.py +++ b/conversione/_pipeline/_constants.py @@ -123,9 +123,12 @@ _NUMBERED_HDR_RE = re.compile( ) _BIB_MARKERS_RE = re.compile( r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b' - r'|\b(19|20)\d{2}\b', + r'|\b(19|20)\d{2}\b' + r'|\b(ibid\.?|ibidem|op\.\s*cit\.?|cit\.|cfr\.|ivi[,;\s])\b', re.IGNORECASE, ) +# Pattern autore accademico: iniziale maiuscola + cognome TUTTO-MAIUSCOLO (es. A. PAJNO, G. GUZZETTA) +_FOOTNOTE_AUTHOR_RE = re.compile(r'(?\s*$") +_STANDALONE_NUM_RE = re.compile(r"(?m)^(?:- )?\d{1,3}$") +_UNDERSCORE_SEP_RE = re.compile(r"(?m)^_{4,}\s*$") diff --git a/conversione/_pipeline/_structure.py b/conversione/_pipeline/_structure.py index 853c8bb..2e0d3b0 100644 --- a/conversione/_pipeline/_structure.py +++ b/conversione/_pipeline/_structure.py @@ -2,7 +2,7 @@ import re from ._constants import ( - _TOC_KEYWORDS, _BIB_MARKERS_RE, + _TOC_KEYWORDS, _BIB_MARKERS_RE, _FOOTNOTE_AUTHOR_RE, _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE, ) from ._helpers import ( @@ -28,6 +28,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]: continue if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): continue + # Righe brevi con riferimento pagina (es. "Prefazione pag. 4") + if re.match(r"^.{3,80}\s+pag\.\s*\d{1,4}\s*$", line.strip()): + continue if len(line.strip()) > 200: _in_toc = False new_lines.append(line) @@ -118,10 +121,23 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i content = m.group(2).strip() if content.endswith(".") and len(content) > 40: return m.group(0) - if _BIB_MARKERS_RE.search(content): + # Paragrafo lungo: non è un titolo di sezione + if len(content) > 130: + return m.group(0) + if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content): return m.group(0) count += 1 - return f"### {m.group(1)}.\n\n{content}" + # Prova a separare titolo dal corpo alla prima transizione minusc→Maiusc + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + content, + ) + if split and 3 <= split.start() and len(content) - split.end() >= 40: + title = content[: split.start()].strip() + body = content[split.end():].strip() + return f"### {m.group(1)}. {title}\n\n{body}" + return f"### {m.group(1)}. {content}" text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) @@ -136,13 +152,22 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i def _aphorism_repl(m: re.Match) -> str: nonlocal count content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): + if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content): return m.group(0) count += 1 - return f"\n\n### {m.group(1)}.\n\n{content}" + split = re.search( + r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+" + r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", + content, + ) + if split and 3 <= split.start() and len(content) - split.end() >= 40: + title = content[: split.start()].strip() + body = content[split.end():].strip() + return f"\n\n### {m.group(1)}. {title}\n\n{body}" + return f"\n\n### {m.group(1)}. {content}" text = re.sub( - r"^-\s+(\d{1,3})\.\s+(.{10,})$", + r"^-[ \t]+(\d{1,3})\.[ \t]+(.{10,})$", _aphorism_repl, text, flags=re.MULTILINE, @@ -152,7 +177,7 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i nonlocal count num = m.group(1) content = m.group(2).strip() - if _BIB_MARKERS_RE.search(content): + if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content): return m.group(0) count += 1 split = re.search( @@ -176,6 +201,45 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i return text, count +def _t_promote_chapter_headers(text: str) -> tuple[str, int]: + """ + Promuove ### N. Titolo → ## N. Titolo quando sembrano capitoli principali. + Condizioni: ≥3 headers ### con numero 1–50, nessun ## già presente, + numeri di capitolo sequenziali e NON duplicati. + Numeri duplicati indicano una raccolta multi-articolo: non promuovere. + """ + if re.search(r"^## \d", text, re.MULTILINE): + return text, 0 + + pattern = re.compile(r"^### (\d+)\. (.+)$", re.MULTILINE) + matches = list(pattern.finditer(text)) + chapter_matches = [m for m in matches if int(m.group(1)) <= 50] + + if len(chapter_matches) < 3: + return text, 0 + + chapter_nums_list = [int(m.group(1)) for m in chapter_matches] + + # Se qualche numero appare ≥3 volte è una raccolta multi-articolo: non promuovere + num_counter: dict[int, int] = {} + for n in chapter_nums_list: + num_counter[n] = num_counter.get(n, 0) + 1 + if max(num_counter.values()) >= 3: + return text, 0 + + chapter_nums = set(chapter_nums_list) + count = 0 + + def _repl(m: re.Match) -> str: + nonlocal count + if int(m.group(1)) in chapter_nums: + count += 1 + return f"## {m.group(1)}. {m.group(2)}" + return m.group(0) + + return pattern.sub(_repl, text), count + + def _t_extract_math(text: str) -> tuple[str, int]: return _extract_math_environments(text)