feat: demota #→## quando il documento usa h1 per sezioni principali

Aggiunge _t_demote_h1 in _headers.py: se il documento contiene ≥5 header # con contenuto testuale (cioè il cui testo inizia con una lettera), tutti i # vengono demotati a ##, creando la gerarchia ## (parti) → ### (sezioni) invece di # → ###. Utile per manuali strutturati in parti principali (h1) con sezioni (h3) senza livello intermedio h2. La soglia di 5 evita falsi positivi su documenti con un solo titolo h1 o con h1 derivanti da artefatti di encoding.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 16:21:02 +02:00
parent 3f4689e8fd
commit 444942dc8f
2 changed files with 19 additions and 1 deletions
@@ -15,7 +15,7 @@ from ._artifacts import (
 from ._headers   import (
    _t_fix_header_concat, _t_extract_capitolo,
    _t_normalize_numbered_headings, _t_normalize_header_levels,
-    _t_remove_header_bold, _t_normalize_allcaps_headers,
+    _t_remove_header_bold, _t_normalize_allcaps_headers, _t_demote_h1,
 )
 from ._structure import (
    _t_remove_toc, _t_remove_orphan_toc, _t_allcaps_to_headers,
@@ -68,6 +68,7 @@ def apply_transforms(text: str, on_step=None) -> tuple[str, dict]:
        (None,                             _t_extract_capitolo,             "estrazione Capitolo inline"),
        ("n_header_numerati_normalizzati", _t_normalize_numbered_headings,  "normalizzazione livelli numerati"),
        (None,                             _t_normalize_header_levels,      "normalizzazione livelli ####→###"),
+        (None,                             _t_demote_h1,                    "demozione #→## (sezioni principali)"),
        (None,                             _t_remove_header_bold,           "rimozione bold negli header"),
        (None,                             _t_normalize_allcaps_headers,    "normalizzazione ALL-CAPS header"),
        # 4. Costruzione struttura
@@ -98,6 +98,23 @@ def _t_remove_header_bold(text: str) -> tuple[str, int]:
    return text, 0


+def _t_demote_h1(text: str) -> tuple[str, int]:
+    """
+    Demote level-1 headers (``# ``) to level 2 (``## ``) when the document
+    uses h1 for its main sections rather than for a single title.
+
+    The transform only fires when at least 5 lines start with ``# ``
+    followed by an ASCII letter or a character in the range U+00C0-U+00FF;
+    below that threshold the text is returned untouched. When it fires,
+    every line of the form ``# <text>`` is rewritten as ``## <text>``, so
+    the hierarchy becomes ## -> ### instead of # -> ###.
+
+    Args:
+        text: Document text whose headers are marked with leading ``#``.
+
+    Returns:
+        A ``(text, count)`` tuple: the (possibly rewritten) text and the
+        number of lines that were demoted (0 when the threshold is not met).
+
+    NOTE(review): the threshold counts only h1 lines whose text starts with
+    a letter, while the substitution rewrites every ``# <text>`` line,
+    including ones starting with a digit and any such line that is not a
+    real header (e.g. a ``# comment`` line inside a code fence). Confirm
+    that this asymmetry is intended.
+    """
+    h1_count = len(re.findall(r"^# [A-Za-z\xc0-\xff]", text, re.MULTILINE))
+    if h1_count < 5:
+        return text, 0
+    count = 0
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        count += 1
+        return f"## {m.group(1)}"
+    text = re.sub(r"^# (.+)$", _repl, text, flags=re.MULTILINE)
+    return text, count
+
+
 def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    def _norm(m: re.Match) -> str:
        hashes, content = m.group(1), m.group(2).strip()