Files
rag-from-scratch/conversione/_pipeline/_headers.py
T
davide 444942dc8f feat: demota #→## quando il documento usa h1 per sezioni principali
Aggiunge _t_demote_h1 in _headers.py: se il documento contiene ≥5
header # con contenuto testuale (lettere iniziali), i # vengono
demotati a ## creando la gerarchia ## (parti) → ### (sezioni)
invece di # → ###.

Utile per manuali strutturati in parti principali (h1) con sezioni
(h3) senza livello intermedio h2. La soglia di 5 evita falsi positivi
su documenti con un solo titolo h1 o h1 da artefatti di encoding.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 16:21:02 +02:00

128 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold."""
import re
from ._constants import _NUMBERED_HDR_RE
from ._helpers import _sentence_case
def _t_fix_header_concat(text: str) -> tuple[str, int]:
count = 0
def _fix(m: re.Match) -> str:
nonlocal count
hashes = m.group(1)
full = m.group(2).strip()
if len(full) < 60:
return m.group(0)
skip = min(10, len(full) // 3)
split = re.search(
r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])"
r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
full[skip:],
)
if split:
pos = skip + split.start()
title = full[:pos].strip()
body = full[pos:].strip()
if len(title) >= 5 and len(body) >= 15:
count += 1
return f"{hashes} {title}\n\n{body}"
return m.group(0)
text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
return text, count
def _t_extract_capitolo(text: str) -> tuple[str, int]:
    """Turn inline "Capitolo N TITLE" occurrences into level-2 headers.

    A match is the literal word ``Capitolo`` (case-sensitive), a chapter
    number, a colon or whitespace separator, and a title of 6 to 81
    characters: one uppercase letter or apostrophe, then 5 to 80 characters
    drawn from uppercase letters, whitespace, apostrophes, periods, commas and
    parentheses. The title must be followed by a hyphen plus digit, a newline,
    or the end of the text. Each match is replaced by
    ``## Capitolo N: <title>`` surrounded by blank lines, with the title passed
    through ``_sentence_case``.

    Args:
        text: Markdown document to process.

    Returns:
        A tuple ``(new_text, 0)``.
        NOTE(review): the count is always 0 here; substitutions are not
        tallied by this transformation.
    """
    chapter_re = re.compile(
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-]\s*\d|\s*\n|\s*$)"
    )

    def _as_header(match: re.Match) -> str:
        number, raw_title = match.groups()
        # Drop surrounding whitespace and any trailing hyphen/space run left
        # over from a "TITLE - 12" style suffix.
        cleaned = raw_title.strip().rstrip("- ").strip()
        return f"\n\n## Capitolo {number}: {_sentence_case(cleaned)}\n\n"

    return chapter_re.sub(_as_header, text), 0
def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
    """Align the level of numbered headers with the depth of their numbering.

    Every match of ``_NUMBERED_HDR_RE`` is read as (hashes, number, title).
    The depth of a header is the number of dots in its number plus one. The
    shallowest depth found is anchored to the lowest header level currently
    used at that depth; each deeper numbering step adds one level, capped at
    level 6. Nothing is changed when there are no numbered headers or when
    all of them share the same depth.

    Args:
        text: Markdown document to process.

    Returns:
        A tuple ``(new_text, count)`` where ``count`` is the number of headers
        whose level actually changed.
    """
    found = list(_NUMBERED_HDR_RE.finditer(text))
    if not found:
        return text, 0

    def _depth(number: str) -> int:
        return number.count(".") + 1

    # One (numbering depth, current header level) pair per numbered header.
    observed = [(_depth(m.group(2)), len(m.group(1))) for m in found]
    shallowest = min(depth for depth, _ in observed)
    deepest = max(depth for depth, _ in observed)
    if shallowest == deepest:
        return text, 0

    anchor_level = min(
        level for depth, level in observed if depth == shallowest
    )
    changed = 0

    def _relevel(m: re.Match) -> str:
        nonlocal changed
        marks, number, title = m.group(1, 2, 3)
        target = min(anchor_level + (_depth(number) - shallowest), 6)
        if target == len(marks):
            return m.group(0)
        changed += 1
        return "#" * target + f" {number}. {title}"

    releveled = _NUMBERED_HDR_RE.sub(_relevel, text)
    return releveled, changed
def _t_normalize_header_levels(text: str) -> tuple[str, int]:
text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
text = re.sub(
r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
lambda m: f"### {m.group(2)}. {m.group(3)}",
text,
flags=re.MULTILINE,
)
text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE)
return text, 0
def _t_remove_header_bold(text: str) -> tuple[str, int]:
text = re.sub(
r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
r"\1 \2",
text, flags=re.MULTILINE,
)
return text, 0
def _t_demote_h1(text: str) -> tuple[str, int]:
"""
Demota # → ## quando il documento usa # per sezioni principali (≥5 h1
con contenuto testuale). Crea gerarchia ## → ### invece di # → ###.
"""
h1_count = len(re.findall(r"^# [A-Za-z\xc0-\xff]", text, re.MULTILINE))
if h1_count < 5:
return text, 0
count = 0
def _repl(m: re.Match) -> str:
nonlocal count
count += 1
return f"## {m.group(1)}"
text = re.sub(r"^# (.+)$", _repl, text, flags=re.MULTILINE)
return text, count
def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    """Rewrite headers written entirely in capitals via ``_sentence_case``.

    A header (levels 1-6, single space after the hashes) is rewritten only
    when it contains at least one alphabetic character and every alphabetic
    character in it is uppercase. Other headers are left exactly as they are.

    Args:
        text: Markdown document to process.

    Returns:
        A tuple ``(new_text, 0)``; this transformation does not report a
        change count.
    """
    header_re = re.compile(r"^(#{1,6}) (.+)$", flags=re.MULTILINE)

    def _recase(match: re.Match) -> str:
        marks = match.group(1)
        title = match.group(2).strip()
        alphabetic = [ch for ch in title if ch.isalpha()]
        # Headers with no letters at all, or with any non-uppercase letter,
        # are kept verbatim.
        if not alphabetic or not all(ch.isupper() for ch in alphabetic):
            return match.group(0)
        return f"{marks} {_sentence_case(title)}"

    return header_re.sub(_recase, text), 0