"""Header structure transformations: level normalization, concat splitting, bold removal."""

import re

from ._constants import _NUMBERED_HDR_RE
from ._helpers import _sentence_case


def _t_fix_header_concat(text: str) -> tuple[str, int]:
    """Split header lines whose title has body text glued directly onto it.

    Only level 2-6 header lines with at least 40 characters of content are
    inspected, and only those whose stripped content is 60+ characters long.
    The split point is the first lowercase -> uppercase boundary (ASCII plus
    the accented letters listed in the pattern) found after skipping the
    first ``min(10, len(content) // 3)`` characters.

    Returns:
        The rewritten text and the number of headers that were split.
    """
    fixed = 0

    def _split_header(match: re.Match) -> str:
        nonlocal fixed
        untouched = match.group(0)
        marker = match.group(1)
        content = match.group(2).strip()
        if len(content) < 60:
            return untouched

        # Ignore case boundaries at the very start of the title.
        offset = min(10, len(content) // 3)
        boundary = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            content[offset:],
        )
        if boundary is None:
            return untouched

        cut = offset + boundary.start()
        heading = content[:cut].strip()
        remainder = content[cut:].strip()
        # Reject splits that would leave a tiny title or a tiny body.
        if len(heading) < 5 or len(remainder) < 15:
            return untouched

        fixed += 1
        return f"{marker} {heading}\n\n{remainder}"

    result = re.sub(
        r"^(#{2,6})\s+(.{40,})$",
        _split_header,
        text,
        flags=re.MULTILINE,
    )
    return result, fixed


def _t_extract_capitolo(text: str) -> tuple[str, int]:
    """Turn inline ``Capitolo N <UPPERCASE TITLE>`` runs into level-2 headers.

    The title must consist of uppercase letters (plus the punctuation listed
    in the pattern) and be followed by a dash and a digit, a newline, or the
    end of the text. Trailing dashes/spaces are trimmed from the title, which
    is then passed through ``_sentence_case``.

    Returns:
        The rewritten text and a count that is always 0.
        NOTE(review): the count is hard-coded even when substitutions
        happen -- confirm callers do not rely on a real number here.
    """
    chapter_pattern = (
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)"
    )

    def _as_header(match: re.Match) -> str:
        number = match.group(1)
        cleaned = match.group(2).strip().rstrip("- ").strip()
        heading = _sentence_case(cleaned)
        return f"\n\n## Capitolo {number}: {heading}\n\n"

    return re.sub(chapter_pattern, _as_header, text), 0


def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
    """Align the ``#`` level of numbered headers with their numbering depth.

    Depth is the number of dots in the header number plus one. The shallowest
    depth keeps the smallest ``#`` level already used at that depth; deeper
    numbers get one extra ``#`` per additional depth step, capped at 6.
    Nothing is changed when there are no numbered headers or when they all
    share the same depth.

    Relies on ``_NUMBERED_HDR_RE`` from ``._constants``; presumably group 1 is
    the ``#`` marker, group 2 the dotted number and group 3 the title --
    confirm against that definition.

    Returns:
        The rewritten text and the number of headers whose level changed.
    """
    found = list(_NUMBERED_HDR_RE.finditer(text))
    if not found:
        return text, 0

    # (numbering depth, current '#' count) for every numbered header.
    observed = [
        (hit.group(2).count(".") + 1, len(hit.group(1))) for hit in found
    ]
    shallowest = min(depth for depth, _ in observed)
    deepest = max(depth for depth, _ in observed)
    if shallowest == deepest:
        return text, 0

    anchor_level = min(
        level for depth, level in observed if depth == shallowest
    )
    changed = 0

    def _relevel(hit: re.Match) -> str:
        nonlocal changed
        marker, number, title = hit.group(1), hit.group(2), hit.group(3)
        steps_down = (number.count(".") + 1) - shallowest
        target = min(anchor_level + steps_down, 6)
        if target == len(marker):
            return hit.group(0)
        changed += 1
        return f"{'#' * target} {number}. {title}"

    return _NUMBERED_HDR_RE.sub(_relevel, text), changed


def _t_normalize_header_levels(text: str) -> tuple[str, int]:
    """Flatten deep headers to level 3 and drop empty header markers.

    Three passes run in order over the text:

    1. lines holding only a level 3-6 marker are removed;
    2. level 3-6 headers starting with a 1-3 digit number become
       ``### N. title``;
    3. remaining level 4-6 headers are rewritten as level 3.

    Returns:
        The rewritten text and a count that is always 0.
        NOTE(review): the count is hard-coded even when substitutions
        happen -- confirm this is intentional.
    """

    def _numbered(match: re.Match) -> str:
        return f"### {match.group(2)}. {match.group(3)}"

    passes = (
        (r"^#{3,6}\s*$", ""),
        (r"^(#{3,6})\s+(\d{1,3})\s+(.+)$", _numbered),
        (r"^#{4,6}\s+(.+)$", r"### \1"),
    )
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
    return text, 0


def _t_remove_header_bold(text: str) -> tuple[str, int]:
    """Strip ``**bold**`` markers that wrap the whole title of a header line.

    Returns:
        The rewritten text and a count that is always 0.
        NOTE(review): the count is hard-coded even when substitutions
        happen -- confirm this is intentional.
    """
    bold_header = re.compile(r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$", re.MULTILINE)
    return bold_header.sub(r"\1 \2", text), 0


def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    """Rewrite headers written entirely in capitals via ``_sentence_case``.

    A header qualifies when it contains at least one alphabetic character and
    every alphabetic character is uppercase; all other headers are left
    exactly as they were.

    Returns:
        The rewritten text and a count that is always 0.
        NOTE(review): the count is hard-coded even when substitutions
        happen -- confirm this is intentional.
    """

    def _soften(match: re.Match) -> str:
        marker = match.group(1)
        title = match.group(2).strip()
        alphabetic = [ch for ch in title if ch.isalpha()]
        if not alphabetic or not all(ch.isupper() for ch in alphabetic):
            return match.group(0)
        return f"{marker} {_sentence_case(title)}"

    header_line = re.compile(r"^(#{1,6}) (.+)$", re.MULTILINE)
    return header_line.sub(_soften, text), 0