444942dc8f
Aggiunge _t_demote_h1 in _headers.py: se il documento contiene ≥5 header # con contenuto testuale (lettere iniziali), i # vengono demotati a ## creando la gerarchia ## (parti) → ### (sezioni) invece di # → ###. Utile per manuali strutturati in parti principali (h1) con sezioni (h3) senza livello intermedio h2. La soglia di 5 evita falsi positivi su documenti con un solo titolo h1 o h1 da artefatti di encoding. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
128 lines
4.0 KiB
Python
128 lines
4.0 KiB
Python
"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold."""
|
||
import re
|
||
|
||
from ._constants import _NUMBERED_HDR_RE
|
||
from ._helpers import _sentence_case
|
||
|
||
|
||
def _t_fix_header_concat(text: str) -> tuple[str, int]:
    """Split headings that have paragraph text glued onto the end of the title.

    Only heading lines of level 2-6 with at least 40 characters of content
    are inspected, and only when the stripped content is 60+ characters
    long.  The content is searched, skipping its first 10 characters, for
    the first spot where a lowercase letter is immediately followed by an
    uppercase one (e.g. ``sistemaIl``).  The heading is cut there: the left
    part stays the heading and the right part becomes a paragraph separated
    by a blank line.  The cut is applied only if the title keeps at least 5
    characters and the body at least 15.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, count)`` where ``count`` is the number of headings split.
    """
    count = 0

    def _fix(m: re.Match) -> str:
        nonlocal count
        hashes = m.group(1)
        full = m.group(2).strip()
        if len(full) < 60:
            return m.group(0)
        # Never cut inside the first characters of the title.
        skip = min(10, len(full) // 3)
        # Zero-width boundary: lowercase (incl. accented) directly before uppercase.
        split = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            full[skip:],
        )
        if split is None:
            return m.group(0)
        pos = skip + split.start()
        title = full[:pos].strip()
        body = full[pos:].strip()
        if len(title) < 5 or len(body) < 15:
            return m.group(0)
        count += 1
        return f"{hashes} {title}\n\n{body}"

    # ``[^\S\n]+`` is "whitespace except newline": with a plain ``\s+`` an
    # empty ``##`` line followed by a long paragraph line was matched as a
    # single heading and the paragraph got split into a bogus title.
    text = re.sub(
        r"^(#{2,6})[^\S\n]+(.{40,})$", _fix, text, flags=re.MULTILINE
    )
    return text, count
|
||
|
||
|
||
def _t_extract_capitolo(text: str) -> tuple[str, int]:
    """Turn inline ``Capitolo N <TITLE>`` occurrences into level-2 headings.

    A match is the word ``Capitolo``, a number, an optional colon, and an
    upper-case title (6 to 81 characters drawn from capitals, whitespace and
    ``' . , ( )``) that runs up to a dash followed by a digit, a newline, or
    the end of the text.  Each match is replaced by
    ``## Capitolo N: <title>`` surrounded by blank lines, with the title
    stripped of surrounding whitespace and trailing dashes and passed through
    ``_sentence_case`` (presumably sentence-casing it; see ``_helpers``).

    Args:
        text: Document to process.

    Returns:
        ``(new_text, 0)``; this transform does not report a change count.
    """
    chapter_re = re.compile(
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)"
    )

    def _as_heading(match: re.Match) -> str:
        number, raw_title = match.group(1, 2)
        # Drop padding and any dash left dangling at the end of the title.
        cleaned = raw_title.strip().rstrip("- ").strip()
        return f"\n\n## Capitolo {number}: {_sentence_case(cleaned)}\n\n"

    return chapter_re.sub(_as_heading, text), 0
|
||
|
||
|
||
def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
    """Align the level of numbered headings with the depth of their number.

    Every match of ``_NUMBERED_HDR_RE`` is read as (hashes, number, title).
    The depth of a heading is the count of dots in its number plus one.
    The shallowest depth found is anchored to the smallest heading level
    used at that depth, and every numbered heading is moved to
    ``anchor + (depth - shallowest depth)``, capped at level 6.  Headings
    already at their target level are left exactly as written; the others
    are rewritten as ``<hashes> <number>. <title>``.

    Nothing is changed when there are no numbered headings or when they all
    share the same depth.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, count)`` where ``count`` is the number of headings whose
        level was changed.
    """
    # (heading level, numbering depth) for each numbered heading in the text.
    found = [
        (len(hit.group(1)), hit.group(2).count(".") + 1)
        for hit in _NUMBERED_HDR_RE.finditer(text)
    ]
    if not found:
        return text, 0

    shallowest = min(depth for _, depth in found)
    deepest = max(depth for _, depth in found)
    if shallowest == deepest:
        # A single numbering depth gives no hierarchy to rebuild.
        return text, 0

    anchor = min(level for level, depth in found if depth == shallowest)
    changed = 0

    def _relevel(hit: re.Match) -> str:
        nonlocal changed
        marks, number, heading = hit.group(1, 2, 3)
        offset = number.count(".") + 1 - shallowest
        target = min(anchor + offset, 6)
        if target == len(marks):
            return hit.group(0)
        changed += 1
        return f"{'#' * target} {number}. {heading}"

    rewritten = _NUMBERED_HDR_RE.sub(_relevel, text)
    return rewritten, changed
|
||
|
||
|
||
def _t_normalize_header_levels(text: str) -> tuple[str, int]:
    """Flatten deep headings to level 3 and tidy numbered ones.

    Three passes are applied in order:

    1. Heading lines of level 3-6 with no content are removed.
    2. Level 3-6 headings of the form ``<1-3 digits> <title>`` become
       ``### <number>. <title>``.
    3. Any remaining level 4-6 heading becomes a level-3 heading.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, 0)``; this transform does not report a change count.
    """
    text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
    # ``[^\S\n]+`` is "whitespace except newline".  With ``\s+`` a heading
    # holding only a number (``#### 5``) matched across the line break and
    # absorbed the following line as its title.
    text = re.sub(
        r"^(#{3,6})[^\S\n]+(\d{1,3})[^\S\n]+(.+)$",
        lambda m: f"### {m.group(2)}. {m.group(3)}",
        text,
        flags=re.MULTILINE,
    )
    text = re.sub(
        r"^#{4,6}[^\S\n]+(.+)$", r"### \1", text, flags=re.MULTILINE
    )
    return text, 0
|
||
|
||
|
||
def _t_remove_header_bold(text: str) -> tuple[str, int]:
    """Remove ``**`` markers that wrap the whole text of a heading.

    ``## **Title**`` becomes ``## Title`` for heading levels 1-6; trailing
    spaces on the heading line are dropped.  Headings where the bold span
    does not start right after the hashes and end at the end of the line are
    left untouched.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, 0)``; this transform does not report a change count.
    """
    # ``[^\S\n]`` is "whitespace except newline".  With ``\s`` the trailing
    # ``\s*$`` consumed the line break after the heading (deleting the blank
    # line that separates it from the body) and the leading ``\s+`` merged an
    # empty heading line with a bold line below it.
    text = re.sub(
        r"^(#{1,6})[^\S\n]+\*\*(.+?)\*\*[^\S\n]*$",
        r"\1 \2",
        text,
        flags=re.MULTILINE,
    )
    return text, 0
|
||
|
||
|
||
def _t_demote_h1(text: str) -> tuple[str, int]:
    """Demote ``#`` headings to ``##`` when ``#`` is used for main sections.

    If the document has at least 5 level-1 headings whose text starts with a
    letter, every level-1 heading is rewritten as level 2, giving a
    ``##`` -> ``###`` hierarchy instead of ``#`` -> ``###``.  Below that
    threshold the text is returned unchanged.

    Lines inside fenced code blocks (``` or ~~~) are neither counted nor
    rewritten: a ``# comment`` in a code sample is not a heading, and
    rewriting it would corrupt the sample.  An unclosed fence runs to the
    end of the document.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, count)`` where ``count`` is the number of headings
        demoted.
    """
    lines = text.split("\n")
    h1_rows: list[int] = []
    textual_h1 = 0
    open_fence = ""  # fence run that opened the current code block; "" outside

    for row, line in enumerate(lines):
        fence = re.match(r" {0,3}(`{3,}|~{3,})(.*)", line)
        if open_fence:
            # A closing fence uses the same character, is at least as long
            # as the opening one and carries no other text.
            closes = (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            )
            if closes:
                open_fence = ""
            continue
        if fence is not None:
            run, rest = fence.group(1), fence.group(2)
            # A backtick run followed by more backticks is inline code.
            if not (run[0] == "`" and "`" in rest):
                open_fence = run
                continue
        # Same lines as the pattern ``^# (.+)$``.
        if line.startswith("# ") and len(line) > 2:
            h1_rows.append(row)
            if re.match(r"# [A-Za-z\xc0-\xff]", line):
                textual_h1 += 1

    if textual_h1 < 5:
        return text, 0

    for row in h1_rows:
        lines[row] = "#" + lines[row]
    return "\n".join(lines), len(h1_rows)
|
||
|
||
|
||
def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    """Rewrite headings written entirely in capitals via ``_sentence_case``.

    A heading (levels 1-6, hashes followed by a single space) is rewritten
    only when it contains at least one letter and every letter is upper
    case; its stripped text is then passed through ``_sentence_case``
    (presumably sentence-casing it; see ``_helpers``).  All other headings
    are left exactly as they are.

    Args:
        text: Markdown document to process.

    Returns:
        ``(new_text, 0)``; this transform does not report a change count.
    """
    heading_re = re.compile(r"^(#{1,6}) (.+)$", re.MULTILINE)

    def _soften(match: re.Match) -> str:
        marks = match.group(1)
        body = match.group(2).strip()
        alpha = [ch for ch in body if ch.isalpha()]
        # Leave headings without letters, or with any non-uppercase letter.
        if not alpha or any(not ch.isupper() for ch in alpha):
            return match.group(0)
        return f"{marks} {_sentence_case(body)}"

    return heading_re.sub(_soften, text), 0
|