From 3f4689e8fd407dc70bdb658f29b2ff4db40a4bda Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 7 May 2026 16:12:50 +0200
Subject: [PATCH] feat: rileva note bibliografiche e raccolte multi-articolo in
 pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Risolve la conversione errata di note a piè di pagina accademiche in
header Markdown nei testi giuridici (es. dirittopubblico: da 424 h2
errati → 27 h2 semanticamente corretti).

- _BIB_MARKERS_RE: aggiunge ibid., cfr., op. cit., cit., ivi
- _FOOTNOTE_AUTHOR_RE: nuovo pattern per "A. COGNOME" (es. G. GUZZETTA)
- _num_repl / _aphorism_repl / _list_section_repl: usano entrambi i
  guard per non convertire note bibliografiche in sezioni
- _t_promote_chapter_headers: usa max-count ≥ 3 per distinguere
  raccolte multi-articolo (non promuovere) da libri con capitoli
  sequenziali (promuovere); preserva il comportamento corretto su anatomia
- _t_remove_page_markers / _t_remove_page_numbers / _t_remove_separators:
  nuove transform per page marker PDF, numeri isolati, separatori underscore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 conversione/_pipeline/_apply.py     |  9 +++-
 conversione/_pipeline/_artifacts.py | 25 +++++++++
 conversione/_pipeline/_constants.py |  9 +++-
 conversione/_pipeline/_structure.py | 78 ++++++++++++++++++++++++++---
 4 files changed, 112 insertions(+), 9 deletions(-)
diff --git a/conversione/_pipeline/_apply.py b/conversione/_pipeline/_apply.py
index 9bf8f21..79d75e3 100644
--- a/conversione/_pipeline/_apply.py
+++ b/conversione/_pipeline/_apply.py
@@ -10,6 +10,7 @@ from ._artifacts import (
     _t_remove_images, _t_fix_br, _t_fix_tabsep, _t_remove_footnotes,
     _t_remove_formula_labels, _t_remove_dotleaders, _t_remove_recurring_lines,
     _t_fix_math_symbols, _t_remove_watermarks, _t_remove_urls,
+    _t_remove_page_markers, _t_remove_page_numbers, _t_remove_separators,
 )
 from ._headers   import (
     _t_fix_header_concat, _t_extract_capitolo,
@@ -18,7 +19,8 @@ from ._headers   import (
 )
 from ._structure import (
     _t_remove_toc, _t_remove_orphan_toc, _t_allcaps_to_headers,
-    _t_numbered_sections, _t_extract_math, _t_extract_articles,
+    _t_numbered_sections, _t_promote_chapter_headers,
+    _t_extract_math, _t_extract_articles,
 )
 from ._text      import (
     _t_merge_paragraphs, _t_normalize_whitespace, _t_collapse_blank_lines,
@@ -50,6 +52,8 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
         ("n_moltiplicazioni_corrette",     _t_fix_multiplication,           "simbolo moltiplicazione"),
         ("n_micro_corretti",               _t_fix_micro,                    "simbolo micro SI"),
         # 2. Pulizia artefatti
+        ("n_page_markers_rimossi",         _t_remove_page_markers,          "rimozione page markers PDF"),
+        ("n_separatori_rimossi",           _t_remove_separators,            "rimozione separatori underscore"),
         ("n_immagini_rimosse",             _t_remove_images,                "rimozione immagini"),
         ("n_br_rimossi",                   _t_fix_br,                       "fix <br> inline"),
         ("n_tabsep_rimossi",               _t_fix_tabsep,                   "fix separatori tabella"),
@@ -58,6 +62,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
         ("n_formule_rimossi",              _t_remove_formula_labels,        "rimozione label formula"),
         ("n_dotleader_rimossi",            _t_remove_dotleaders,            "rimozione dot-leader TOC"),
         ("n_righe_ricorrenti_rimosse",     _t_remove_recurring_lines,       "rimozione righe ricorrenti"),
+        ("n_numeri_pagina_rimossi",        _t_remove_page_numbers,          "rimozione numeri pagina isolati"),
         # 3. Struttura header
         ("n_header_concat_fixati",         _t_fix_header_concat,            "fix header+corpo concatenati"),
         (None,                             _t_extract_capitolo,             "estrazione Capitolo inline"),
@@ -70,6 +75,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
         ("n_toc_orfani_rimossi",           _t_remove_orphan_toc,            "rimozione voci TOC orfane"),
         ("n_header_allcaps",               _t_allcaps_to_headers,           "ALL-CAPS → ##"),
         ("n_sezioni_numerate",             partial(_t_numbered_sections, has_exercises=_has_ex), "sezioni numerate → ###"),
+        ("n_capitoli_promossi",            _t_promote_chapter_headers,      "promozione capitoli ### → ##"),
         ("n_ambienti_matematici",          _t_extract_math,                 "estrazione ambienti matematici"),
         ("n_articoli_estratti",            _t_extract_articles,             "estrazione articoli → ###"),
         # 5. Testo
@@ -80,6 +86,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
         ("n_header_verso_demotati",        _t_demote_verse_headers,         "demozione header-verso"),
         ("n_url_rimossi",                  _t_remove_urls,                  "rimozione URL"),
         # 6. Rifinitura
+        (None,                             lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s+pag\.\s*\d{1,4}\s*$", r"\1", t), 0), "strip pag.N dagli header"),
         (None,                             _t_remove_empty_headers,         "rimozione header vuoti"),
         ("n_titoli_uniti",                 _t_merge_title_headers,          "merge titoli isolati"),
         (None,                             lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0), "fix header|pagina"),
diff --git a/conversione/_pipeline/_artifacts.py b/conversione/_pipeline/_artifacts.py
index a5333b1..275081b 100644
--- a/conversione/_pipeline/_artifacts.py
+++ b/conversione/_pipeline/_artifacts.py
@@ -4,6 +4,7 @@ from collections import Counter
 
 from ._constants import (
     _WATERMARK_RE, _TABSEP_RE, _SUPERSCRIPT_RE, _FOOTNOTE_BODY_RE, _DOTLEADER_RE,
+    _PAGE_MARKER_RE, _STANDALONE_NUM_RE, _UNDERSCORE_SEP_RE,
 )
 
 
@@ -100,3 +101,27 @@ def _t_remove_urls(text: str) -> tuple[str, int]:
     n    = len(re.findall(r"(?m)^(https?://|www\.)\S+\s*$", text))
     text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
     return text, n
+
+
+def _t_remove_page_markers(text: str) -> tuple[str, int]:
+    """Remove ``<!-- page: N -->`` markers and the ``---`` rule right before them."""
+    n = len(_PAGE_MARKER_RE.findall(text))  # counted up front, before either removal pass
+    # Drop "---" followed by "<!-- page: N -->" as a single unit (PDF page separators)
+    text = re.sub(r"(?m)^---\s*\n<!-- page: \d+ -->\s*\n?", "", text)
+    # Drop any "<!-- page: N -->" still left, i.e. those without a preceding "---"
+    text = _PAGE_MARKER_RE.sub("", text)
+    return text, n
+
+
+def _t_remove_page_numbers(text: str) -> tuple[str, int]:
+    """Blank out lines holding only a 1-3 digit number (optionally after "- ")."""
+    # subn performs the substitution and reports the match count in one pass
+    cleaned, removed = _STANDALONE_NUM_RE.subn("", text)
+    return cleaned, removed
+
+
+def _t_remove_separators(text: str) -> tuple[str, int]:
+    """Blank out separator lines made only of underscores (``____...``)."""
+    # subn performs the substitution and reports the match count in one pass
+    cleaned, removed = _UNDERSCORE_SEP_RE.subn("", text)
+    return cleaned, removed
diff --git a/conversione/_pipeline/_constants.py b/conversione/_pipeline/_constants.py
index baa25a0..b9847ba 100644
--- a/conversione/_pipeline/_constants.py
+++ b/conversione/_pipeline/_constants.py
@@ -123,9 +123,12 @@ _NUMBERED_HDR_RE = re.compile(
 )
 _BIB_MARKERS_RE = re.compile(
     r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
-    r'|\b(19|20)\d{2}\b',
+    r'|\b(19|20)\d{2}\b'
+    r'|\b(ibid\.?|ibidem|op\.\s*cit\.?|cit\.|cfr\.|ivi(?=[,;\s]))',  # no trailing \b: it cannot match between "." and a space
     re.IGNORECASE,
 )
+# Academic author pattern: one capital initial + ALL-CAPS ASCII surname of 3+ letters (e.g. A. PAJNO, G. GUZZETTA)
+_FOOTNOTE_AUTHOR_RE = re.compile(r'(?<![A-Z])[A-Z]\.\s+[A-Z]{3,}')
 _WATERMARK_RE = re.compile(
     r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN"
     r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$",
@@ -160,3 +163,7 @@ _TOC_ITEM_RE = re.compile(
 _TOC_HDR_WITH_PAGE_RE = re.compile(
     r"^#{1,3}\s+\d+\.?\s+.{3,60}\s+\d{1,4}$"
 )
+# PDF artifacts: page markers, standalone page numbers and underscore separators
+_PAGE_MARKER_RE = re.compile(r"(?m)^<!-- page: \d+ -->\s*$")
+_STANDALONE_NUM_RE = re.compile(r"(?m)^(?:- )?\d{1,3}$")  # whole line is 1-3 digits, optional "- " prefix
+_UNDERSCORE_SEP_RE = re.compile(r"(?m)^_{4,}\s*$")  # line starting with 4+ underscores and nothing else
diff --git a/conversione/_pipeline/_structure.py b/conversione/_pipeline/_structure.py
index 853c8bb..2e0d3b0 100644
--- a/conversione/_pipeline/_structure.py
+++ b/conversione/_pipeline/_structure.py
@@ -2,7 +2,7 @@
 import re
 
 from ._constants import (
-    _TOC_KEYWORDS, _BIB_MARKERS_RE,
+    _TOC_KEYWORDS, _BIB_MARKERS_RE, _FOOTNOTE_AUTHOR_RE,
     _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
 )
 from ._helpers import (
@@ -28,6 +28,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
                 continue
             if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
                 continue
+            # Righe brevi con riferimento pagina (es. "Prefazione pag. 4")
+            if re.match(r"^.{3,80}\s+pag\.\s*\d{1,4}\s*$", line.strip()):
+                continue
             if len(line.strip()) > 200:
                 _in_toc = False
                 new_lines.append(line)
@@ -118,10 +121,23 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
         content = m.group(2).strip()
         if content.endswith(".") and len(content) > 40:
             return m.group(0)
-        if _BIB_MARKERS_RE.search(content):
+        # Paragrafo lungo: non è un titolo di sezione
+        if len(content) > 130:
+            return m.group(0)
+        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
             return m.group(0)
         count += 1
-        return f"### {m.group(1)}.\n\n{content}"
+        # Prova a separare titolo dal corpo alla prima transizione minusc→Maiusc
+        split = re.search(
+            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
+            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
+            content,
+        )
+        if split and 3 <= split.start() and len(content) - split.end() >= 40:
+            title = content[: split.start()].strip()
+            body  = content[split.end():].strip()
+            return f"### {m.group(1)}. {title}\n\n{body}"
+        return f"### {m.group(1)}. {content}"
 
     text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)
 
@@ -136,13 +152,22 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
         def _aphorism_repl(m: re.Match) -> str:
             nonlocal count
             content = m.group(2).strip()
-            if _BIB_MARKERS_RE.search(content):
+            if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
                 return m.group(0)
             count += 1
-            return f"\n\n### {m.group(1)}.\n\n{content}"
+            split = re.search(
+                r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
+                r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
+                content,
+            )
+            if split and 3 <= split.start() and len(content) - split.end() >= 40:
+                title = content[: split.start()].strip()
+                body  = content[split.end():].strip()
+                return f"\n\n### {m.group(1)}. {title}\n\n{body}"
+            return f"\n\n### {m.group(1)}. {content}"
 
         text = re.sub(
-            r"^-\s+(\d{1,3})\.\s+(.{10,})$",
+            r"^-[ \t]+(\d{1,3})\.[ \t]+(.{10,})$",
             _aphorism_repl,
             text,
             flags=re.MULTILINE,
@@ -152,7 +177,7 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
         nonlocal count
         num     = m.group(1)
         content = m.group(2).strip()
-        if _BIB_MARKERS_RE.search(content):
+        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
             return m.group(0)
         count += 1
         split = re.search(
@@ -176,6 +201,45 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
     return text, count
 
 
+def _t_promote_chapter_headers(text: str) -> tuple[str, int]:
+    """Promote ``### N. Title`` headers to ``## N. Title`` for chapter-like books.
+
+    Nothing is promoted when the text already has a ``## <digit>`` header,
+    when fewer than 3 ``### N.`` headers with N <= 50 exist, or when any
+    number occurs 3+ times (taken as a multi-article collection whose
+    articles each restart their numbering). Headers with N > 50 stay as is.
+
+    Returns the transformed text and the number of promoted headers.
+    """
+    if re.search(r"^## \d", text, re.MULTILINE):
+        return text, 0
+
+    pattern = re.compile(r"^### (\d+)\. (.+)$", re.MULTILINE)
+    chapter_nums_list = [
+        int(m.group(1)) for m in pattern.finditer(text) if int(m.group(1)) <= 50
+    ]
+    if len(chapter_nums_list) < 3:
+        return text, 0
+
+    # A number repeated 3+ times means per-article numbering: do not promote.
+    num_counter: dict[int, int] = {}
+    for n in chapter_nums_list:
+        num_counter[n] = num_counter.get(n, 0) + 1
+    if max(num_counter.values()) >= 3:
+        return text, 0
+
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        if int(m.group(1)) > 50:
+            return m.group(0)
+        count += 1
+        return f"## {m.group(1)}. {m.group(2)}"
+
+    return pattern.sub(_repl, text), count
+
+
 def _t_extract_math(text: str) -> tuple[str, int]:
     return _extract_math_environments(text)