rag-from-scratch/conversione/_pipeline/_headers.py

"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold."""
import re

from ._constants import _NUMBERED_HDR_RE
from ._helpers import _sentence_case


def _t_fix_header_concat(text: str) -> tuple[str, int]:
    count = 0

    def _fix(m: re.Match) -> str:
        nonlocal count
        hashes = m.group(1)
        full   = m.group(2).strip()
        if len(full) < 60:
            return m.group(0)
        skip  = min(10, len(full) // 3)
        split = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            full[skip:],
        )
        if split:
            pos   = skip + split.start()
            title = full[:pos].strip()
            body  = full[pos:].strip()
            if len(title) >= 5 and len(body) >= 15:
                count += 1
                return f"{hashes} {title}\n\n{body}"
        return m.group(0)

    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
    return text, count


def _t_extract_capitolo(text: str) -> tuple[str, int]:
    def _repl(m: re.Match) -> str:
        num    = m.group(1)
        titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
        return f"\n\n## Capitolo {num}: {titolo}\n\n"

    text = re.sub(
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
        _repl,
        text,
    )
    return text, 0


def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
    """Make the level of numbered headings follow their numbering depth.

    Headings are located with ``_NUMBERED_HDR_RE``. This function reads
    group 1 for its length (used as the current heading level), group 2 for
    its dot count (numbering depth = dots + 1) and group 3 as the title.
    NOTE(review): the pattern lives in ``_constants``; the exact shape of
    the groups should be confirmed there.

    If every numbered heading has the same depth the text is returned
    untouched. Otherwise the shallowest depth is anchored to the smallest
    level used by headings of that depth, and each deeper heading is placed
    one level lower per extra depth step, capped at level 6.

    Returns:
        The rewritten text and the number of headings whose level changed.
    """
    # (numbering depth, current heading level) for every numbered heading.
    found = [
        (hit.group(2).count(".") + 1, len(hit.group(1)))
        for hit in _NUMBERED_HDR_RE.finditer(text)
    ]
    if not found:
        return text, 0

    shallowest = min(depth for depth, _ in found)
    deepest = max(depth for depth, _ in found)
    if shallowest == deepest:
        # A flat numbering scheme carries no hierarchy to enforce.
        return text, 0

    anchor_level = min(level for depth, level in found if depth == shallowest)
    changed = 0

    def _relevel(hit: re.Match) -> str:
        nonlocal changed
        marks, number, heading = hit.group(1, 2, 3)
        steps_down = number.count(".") + 1 - shallowest
        target = min(anchor_level + steps_down, 6)
        if target == len(marks):
            # Already at the right level: keep the line byte-identical.
            return hit.group(0)
        changed += 1
        return f"{'#' * target} {number}. {heading}"

    return _NUMBERED_HDR_RE.sub(_relevel, text), changed


def _t_normalize_header_levels(text: str) -> tuple[str, int]:
    text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(
        r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
        lambda m: f"### {m.group(2)}. {m.group(3)}",
        text,
        flags=re.MULTILINE,
    )
    text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE)
    return text, 0


def _t_remove_header_bold(text: str) -> tuple[str, int]:
    text = re.sub(
        r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
        r"\1 \2",
        text, flags=re.MULTILINE,
    )
    return text, 0


def _t_demote_h1(text: str) -> tuple[str, int]:
    """
    Demota # → ## quando il documento usa # per sezioni principali (≥5 h1
    con contenuto testuale). Crea gerarchia ## → ### invece di # → ###.
    """
    h1_count = len(re.findall(r"^# [A-Za-z\xc0-\xff]", text, re.MULTILINE))
    if h1_count < 5:
        return text, 0
    count = 0
    def _repl(m: re.Match) -> str:
        nonlocal count
        count += 1
        return f"## {m.group(1)}"
    text = re.sub(r"^# (.+)$", _repl, text, flags=re.MULTILINE)
    return text, count


def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    def _norm(m: re.Match) -> str:
        hashes, content = m.group(1), m.group(2).strip()
        letters = [c for c in content if c.isalpha()]
        if letters and all(c.isupper() for c in letters):
            return f"{hashes} {_sentence_case(content)}"
        return m.group(0)

    text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE)
    return text, 0