feat: rileva note bibliografiche e raccolte multi-articolo in pipeline

Risolve la conversione errata di note a piè di pagina accademiche in header Markdown nei testi giuridici (es. dirittopubblico: da 424 h2 errati a 27 h2 semanticamente corretti).

- _BIB_MARKERS_RE: aggiunge ibid., cfr., op. cit., cit., ivi
- _FOOTNOTE_AUTHOR_RE: nuovo pattern per "A. COGNOME" (es. G. GUZZETTA)
- _num_repl / _aphorism_repl / _list_section_repl: usano entrambi i guard per non convertire note bibliografiche in sezioni
- _t_promote_chapter_headers: usa max-count ≥ 3 per distinguere raccolte multi-articolo (non promuovere) da libri con capitoli sequenziali (promuovere); preserva il comportamento corretto su anatomia
- _t_remove_page_markers / _t_remove_page_numbers / _t_remove_separators: nuove transform per page marker PDF, numeri isolati, separatori underscore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 16:12:50 +02:00
parent 2c0b7a462e
commit 3f4689e8fd
4 changed files with 112 additions and 9 deletions
@@ -10,6 +10,7 @@ from ._artifacts import (
    _t_remove_images, _t_fix_br, _t_fix_tabsep, _t_remove_footnotes,
    _t_remove_formula_labels, _t_remove_dotleaders, _t_remove_recurring_lines,
    _t_fix_math_symbols, _t_remove_watermarks, _t_remove_urls,
+    _t_remove_page_markers, _t_remove_page_numbers, _t_remove_separators,
 )
 from ._headers   import (
    _t_fix_header_concat, _t_extract_capitolo,
@@ -18,7 +19,8 @@ from ._headers   import (
 )
 from ._structure import (
    _t_remove_toc, _t_remove_orphan_toc, _t_allcaps_to_headers,
-    _t_numbered_sections, _t_extract_math, _t_extract_articles,
+    _t_numbered_sections, _t_promote_chapter_headers,
+    _t_extract_math, _t_extract_articles,
 )
 from ._text      import (
    _t_merge_paragraphs, _t_normalize_whitespace, _t_collapse_blank_lines,
@@ -50,6 +52,8 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
        ("n_moltiplicazioni_corrette",     _t_fix_multiplication,           "simbolo moltiplicazione"),
        ("n_micro_corretti",               _t_fix_micro,                    "simbolo micro SI"),
        # 2. Pulizia artefatti
+        ("n_page_markers_rimossi",         _t_remove_page_markers,          "rimozione page markers PDF"),
+        ("n_separatori_rimossi",           _t_remove_separators,            "rimozione separatori underscore"),
        ("n_immagini_rimosse",             _t_remove_images,                "rimozione immagini"),
        ("n_br_rimossi",                   _t_fix_br,                       "fix <br> inline"),
        ("n_tabsep_rimossi",               _t_fix_tabsep,                   "fix separatori tabella"),
@@ -58,6 +62,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
        ("n_formule_rimossi",              _t_remove_formula_labels,        "rimozione label formula"),
        ("n_dotleader_rimossi",            _t_remove_dotleaders,            "rimozione dot-leader TOC"),
        ("n_righe_ricorrenti_rimosse",     _t_remove_recurring_lines,       "rimozione righe ricorrenti"),
+        ("n_numeri_pagina_rimossi",        _t_remove_page_numbers,          "rimozione numeri pagina isolati"),
        # 3. Struttura header
        ("n_header_concat_fixati",         _t_fix_header_concat,            "fix header+corpo concatenati"),
        (None,                             _t_extract_capitolo,             "estrazione Capitolo inline"),
@@ -70,6 +75,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
        ("n_toc_orfani_rimossi",           _t_remove_orphan_toc,            "rimozione voci TOC orfane"),
        ("n_header_allcaps",               _t_allcaps_to_headers,           "ALL-CAPS → ##"),
        ("n_sezioni_numerate",             partial(_t_numbered_sections, has_exercises=_has_ex), "sezioni numerate → ###"),
+        ("n_capitoli_promossi",            _t_promote_chapter_headers,      "promozione capitoli ### → ##"),
        ("n_ambienti_matematici",          _t_extract_math,                 "estrazione ambienti matematici"),
        ("n_articoli_estratti",            _t_extract_articles,             "estrazione articoli → ###"),
        # 5. Testo
@@ -80,6 +86,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
        ("n_header_verso_demotati",        _t_demote_verse_headers,         "demozione header-verso"),
        ("n_url_rimossi",                  _t_remove_urls,                  "rimozione URL"),
        # 6. Rifinitura
+        (None,                             lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s+pag\.\s*\d{1,4}\s*$", r"\1", t), 0), "strip pag.N dagli header"),
        (None,                             _t_remove_empty_headers,         "rimozione header vuoti"),
        ("n_titoli_uniti",                 _t_merge_title_headers,          "merge titoli isolati"),
        (None,                             lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0), "fix header|pagina"),
@@ -4,6 +4,7 @@ from collections import Counter

 from ._constants import (
    _WATERMARK_RE, _TABSEP_RE, _SUPERSCRIPT_RE, _FOOTNOTE_BODY_RE, _DOTLEADER_RE,
+    _PAGE_MARKER_RE, _STANDALONE_NUM_RE, _UNDERSCORE_SEP_RE,
 )


@@ -100,3 +101,27 @@ def _t_remove_urls(text: str) -> tuple[str, int]:
    n    = len(re.findall(r"(?m)^(https?://|www\.)\S+\s*$", text))
    text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
    return text, n
+
+
+def _t_remove_page_markers(text: str) -> tuple[str, int]:
+    """Strip ``<!-- page: N -->`` markers and a ``---`` rule that precedes them.
+
+    Returns the cleaned text and the number of page markers found in the
+    input (counted before anything is removed).
+    """
+    # Count first: the block removal below consumes some of the markers.
+    marker_count = sum(1 for _ in _PAGE_MARKER_RE.finditer(text))
+    # A "---" line followed (after optional whitespace) by a page marker is
+    # dropped as a single unit.
+    without_blocks = re.sub(
+        r"(?m)^---\s*\n<!-- page: \d+ -->\s*\n?", "", text
+    )
+    # Any marker still present had no "---" rule in front of it.
+    cleaned = _PAGE_MARKER_RE.sub("", without_blocks)
+    return cleaned, marker_count
+
+
+def _t_remove_page_numbers(text: str) -> tuple[str, int]:
+    """Blank out every match of ``_STANDALONE_NUM_RE`` (isolated page numbers).
+
+    Returns the text with each match replaced by the empty string, and the
+    number of matches replaced.
+    """
+    # subn substitutes and counts in one pass over the text.
+    cleaned, removed = _STANDALONE_NUM_RE.subn("", text)
+    return cleaned, removed
+
+
+def _t_remove_separators(text: str) -> tuple[str, int]:
+    """Blank out every match of ``_UNDERSCORE_SEP_RE`` (underscore-only rules).
+
+    Returns the text with each match replaced by the empty string, and the
+    number of matches replaced.
+    """
+    # subn substitutes and counts in one pass over the text.
+    cleaned, removed = _UNDERSCORE_SEP_RE.subn("", text)
+    return cleaned, removed
@@ -123,9 +123,12 @@ _NUMBERED_HDR_RE = re.compile(
 )
 _BIB_MARKERS_RE = re.compile(
     r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
-    r'|\b(19|20)\d{2}\b',
+    r'|\b(19|20)\d{2}\b'
+    # Latin/Italian citation markers. No trailing \b after the dotted forms:
+    # a \b placed right after "." only matches when a word character follows,
+    # so "cfr. " or "cit., " would never be detected. "ivi" uses a lookahead
+    # so that "ivi, p. 3" is recognised as well as "ivi p. 3".
+    # NOTE(review): the dotted abbreviations in the first alternative above
+    # (pp., vol., ed., edn.) still end in \b and have the same limitation.
+    r'|\b(ibid\b\.?|ibidem\b|op\.\s*cit\b\.?|cit\.|cfr\.|ivi(?=[,;\s]))',
     re.IGNORECASE,
 )
+# Academic author pattern: capital initial + ALL-CAPS surname (e.g. A. PAJNO, G. GUZZETTA)
+_FOOTNOTE_AUTHOR_RE = re.compile(r'(?<![A-Z])[A-Z]\.\s+[A-Z]{3,}')
 _WATERMARK_RE = re.compile(
    r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN"
    r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$",
@@ -160,3 +163,7 @@ _TOC_ITEM_RE = re.compile(
 _TOC_HDR_WITH_PAGE_RE = re.compile(
    r"^#{1,3}\s+\d+\.?\s+.{3,60}\s+\d{1,4}$"
 )
+# PDF artifacts: page markers, isolated page numbers and underscore rules
+# HTML-comment page marker alone on its line, e.g. "<!-- page: 12 -->"
+_PAGE_MARKER_RE = re.compile(r"(?m)^<!-- page: \d+ -->\s*$")
+# 1-3 digits alone on a line, optionally written as a "- N" list item;
+# trailing spaces/tabs are tolerated, consistent with the sibling patterns
+_STANDALONE_NUM_RE = re.compile(r"(?m)^(?:- )?\d{1,3}[ \t]*$")
+# A line consisting of four or more underscores
+_UNDERSCORE_SEP_RE = re.compile(r"(?m)^_{4,}\s*$")
@@ -2,7 +2,7 @@
 import re

 from ._constants import (
-    _TOC_KEYWORDS, _BIB_MARKERS_RE,
+    _TOC_KEYWORDS, _BIB_MARKERS_RE, _FOOTNOTE_AUTHOR_RE,
    _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
 )
 from ._helpers import (
@@ -28,6 +28,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
                continue
            if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
                continue
+            # Righe brevi con riferimento pagina (es. "Prefazione pag. 4")
+            if re.match(r"^.{3,80}\s+pag\.\s*\d{1,4}\s*$", line.strip()):
+                continue
            if len(line.strip()) > 200:
                _in_toc = False
                new_lines.append(line)
@@ -118,10 +121,23 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
        content = m.group(2).strip()
        if content.endswith(".") and len(content) > 40:
            return m.group(0)
-        if _BIB_MARKERS_RE.search(content):
+        # Paragrafo lungo: non è un titolo di sezione
+        if len(content) > 130:
+            return m.group(0)
+        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
            return m.group(0)
        count += 1
-        return f"### {m.group(1)}.\n\n{content}"
+        # Prova a separare titolo dal corpo alla prima transizione minusc→Maiusc
+        split = re.search(
+            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
+            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
+            content,
+        )
+        if split and 3 <= split.start() and len(content) - split.end() >= 40:
+            title = content[: split.start()].strip()
+            body  = content[split.end():].strip()
+            return f"### {m.group(1)}. {title}\n\n{body}"
+        return f"### {m.group(1)}. {content}"

    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)

@@ -136,13 +152,22 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
        def _aphorism_repl(m: re.Match) -> str:
            nonlocal count
            content = m.group(2).strip()
-            if _BIB_MARKERS_RE.search(content):
+            if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
                return m.group(0)
            count += 1
-            return f"\n\n### {m.group(1)}.\n\n{content}"
+            split = re.search(
+                r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
+                r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
+                content,
+            )
+            if split and 3 <= split.start() and len(content) - split.end() >= 40:
+                title = content[: split.start()].strip()
+                body  = content[split.end():].strip()
+                return f"\n\n### {m.group(1)}. {title}\n\n{body}"
+            return f"\n\n### {m.group(1)}. {content}"

        text = re.sub(
-            r"^-\s+(\d{1,3})\.\s+(.{10,})$",
+            r"^-[ \t]+(\d{1,3})\.[ \t]+(.{10,})$",
            _aphorism_repl,
            text,
            flags=re.MULTILINE,
@@ -152,7 +177,7 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
        nonlocal count
        num     = m.group(1)
        content = m.group(2).strip()
-        if _BIB_MARKERS_RE.search(content):
+        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
            return m.group(0)
        count += 1
        split = re.search(
@@ -176,6 +201,45 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
    return text, count


+def _t_promote_chapter_headers(text: str) -> tuple[str, int]:
+    """
+    Promote ``### N. Title`` headers to ``## N. Title`` when they look like chapters.
+
+    The text is returned unchanged (with a count of 0) when any of these holds:
+      * a line starting with ``## `` followed by a digit already exists;
+      * fewer than 3 ``### N. Title`` headers have N <= 50;
+      * some number N occurs 3 or more times among those headers, which is
+        treated as the sign of a multi-article collection where each article
+        restarts its own numbering.
+
+    Otherwise every ``### N. Title`` header with N <= 50 is promoted. The
+    numbers are not checked for being sequential, and a number may occur twice.
+
+    Returns the (possibly rewritten) text and the number of promoted headers.
+    """
+    from collections import Counter
+
+    max_chapter_num = 50   # larger numbers are not considered chapter numbers
+    min_chapters = 3       # fewer candidates than this: leave the text alone
+    collection_repeat = 3  # a number repeated this often => multi-article collection
+
+    if re.search(r"^## \d", text, re.MULTILINE):
+        return text, 0
+
+    pattern = re.compile(r"^### (\d+)\. (.+)$", re.MULTILINE)
+    numbers = (int(m.group(1)) for m in pattern.finditer(text))
+    occurrences = Counter(num for num in numbers if num <= max_chapter_num)
+
+    if sum(occurrences.values()) < min_chapters:
+        return text, 0
+    if max(occurrences.values()) >= collection_repeat:
+        return text, 0
+
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        # Headers numbered above the chapter range stay at level 3.
+        if int(m.group(1)) > max_chapter_num:
+            return m.group(0)
+        count += 1
+        return f"## {m.group(1)}. {m.group(2)}"
+
+    return pattern.sub(_repl, text), count
+
+
 def _t_extract_math(text: str) -> tuple[str, int]:
    return _extract_math_environments(text)