rag-from-scratch/conversione/_pipeline/_finish.py

"""Trasformazioni di rifinitura: header vuoti, garbage, demozione formula-header, frontmatter."""
import re

from ._constants import (
    _FM_RE, _MATH_HDR_RE, _MATH_SYMBOLS_RE,
    _EXERCISE_TRIGGER_RE, _NUMBERED_PREFIX_RE,
)
from ._helpers import _merge_title_headers


def _t_remove_empty_headers(text: str) -> tuple[str, int]:
    """Drop single-line Markdown headers that introduce no content.

    The text is split into blocks on runs of two or more newlines. A block
    that is a lone header line (``#`` .. ``######`` followed by a space) is
    removed when:

    * it is the last block, or the following block is empty, or
    * the following block also starts with a header marker, unless that
      following block is longer than 80 characters (a "long header" is
      treated as content, so the current header is kept).

    Args:
        text: Markdown text to clean.

    Returns:
        A ``(cleaned_text, removed)`` tuple, where ``removed`` is the number
        of header blocks dropped. Runs of three or more newlines in the
        result are collapsed to a single blank line.
    """
    # Compiled once: the same pattern is tested against every block and its successor.
    header_re = re.compile(r"^#{1,6} ")
    blocks = re.split(r"\n{2,}", text)
    kept = []
    removed = 0
    for i, block in enumerate(blocks):
        stripped = block.strip()
        if header_re.match(stripped) and "\n" not in stripped:
            # Look at the ORIGINAL successor, even if it will itself be removed,
            # so a chain of empty headers collapses down to the last one.
            next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
            next_is_hdr = header_re.match(next_stripped) is not None
            next_is_long_hdr = next_is_hdr and len(next_stripped) > 80
            if not next_stripped or (next_is_hdr and not next_is_long_hdr):
                # Report the removal instead of the previous hard-coded 0.
                removed += 1
                continue
        kept.append(block)
    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), removed


def _t_merge_title_headers(text: str) -> tuple[str, int]:
    """Pipeline-step wrapper around ``_merge_title_headers`` from ``._helpers``.

    Passes ``text`` to the helper and returns its result as-is. The merging
    rules live in the helper and are not visible in this module.

    NOTE(review): the annotated return is presumably
    ``(new_text, number_of_changes)`` like the sibling steps; confirm
    against ``_helpers._merge_title_headers``.
    """
    return _merge_title_headers(text)


def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
    """Delete Markdown header lines whose title looks like extraction noise.

    Each line of the form ``#``..``######`` + space + title is checked; the
    whole line is dropped when the title matches any of the heuristics in
    ``_looks_like_garbage``. Runs of three or more newlines left behind are
    collapsed to one blank line.

    Args:
        text: Markdown text to clean.

    Returns:
        ``(cleaned_text, dropped)`` where ``dropped`` is the number of header
        lines removed.
    """

    def _looks_like_garbage(title: str) -> bool:
        # The rules are independent; a title is garbage if ANY of them fires.
        trimmed = title.strip()
        leading_letter = next((ch for ch in title if ch.isalpha()), None)
        return bool(
            # Starts with an ellipsis.
            title.lstrip().startswith("...")
            # No run of at least two letters (Latin, Latin-1 accented, Greek).
            or not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", title)
            # Only 1-4 ASCII letters, optionally parenthesised.
            or re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", trimmed)
            # Long title containing !/%/# glued to a word, or a hyphen-joined word.
            or (
                len(title) > 60
                and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", title)
            )
            # Long title whose first letter is lowercase.
            or (
                leading_letter
                and leading_letter.islower()
                and len(title) > 40
            )
            # Short identifier followed by a relational sign (formula fragment).
            or re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", trimmed)
            # Figure/table caption promoted to a header.
            or re.match(
                r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d",
                trimmed, re.IGNORECASE,
            )
        )

    kept = []
    dropped = 0
    for row in text.split("\n"):
        hdr = re.match(r"^#{1,6} (.+)$", row)
        if hdr is not None and _looks_like_garbage(hdr.group(1)):
            dropped += 1
        else:
            kept.append(row)
    collapsed = re.sub(r"\n{3,}", "\n\n", "\n".join(kept))
    return collapsed, dropped


def _t_math_header_demotion(text: str) -> tuple[str, int]:
    """Turn over-long header lines that are really formulas/exercises into body text.

    A line is demoted when it matches ``_MATH_HDR_RE``, its captured body
    (group 2) is longer than 100 characters, and the body either contains at
    least three ``_MATH_SYMBOLS_RE`` matches or matches
    ``_EXERCISE_TRIGGER_RE``. The demoted line is the body alone, or, when
    ``_NUMBERED_PREFIX_RE`` matches it, ``**<group 1>** <group 2>``.

    Args:
        text: Markdown text to process.

    Returns:
        ``(new_text, demoted)`` where ``demoted`` is the number of lines rewritten.
    """
    out = []
    demoted = 0
    for row in text.split("\n"):
        hdr = _MATH_HDR_RE.match(row)
        # Not a candidate header, or short enough to be a genuine title.
        if hdr is None or len(hdr.group(2)) <= 100:
            out.append(row)
            continue

        payload = hdr.group(2)
        symbol_hits = _MATH_SYMBOLS_RE.findall(payload)
        if len(symbol_hits) < 3 and _EXERCISE_TRIGGER_RE.search(payload) is None:
            # Long, but neither math-heavy nor an exercise statement: keep as header.
            out.append(row)
            continue

        numbered = _NUMBERED_PREFIX_RE.match(payload)
        # Keep a leading number visible by bolding it in front of the remaining text.
        out.append(
            f"**{numbered.group(1)}** {numbered.group(2)}" if numbered else payload
        )
        demoted += 1
    return "\n".join(out), demoted


def _t_remove_frontmatter(text: str) -> tuple[str, int]:
    """Remove front-matter ``###`` header blocks from the start of the document.

    The text is split into blocks on runs of two or more newlines. Only the
    first ``cutoff`` blocks are inspected, where ``cutoff`` is 20% of the
    block count clamped to the range 5..15. Within that window a block that
    starts with ``### `` (but not ``### <digit>``) is dropped when
    ``_FM_RE`` matches the block itself, or matches the following block and
    that block is shorter than 250 characters. Only the header block is
    dropped; the following block is kept.

    Args:
        text: Markdown text to clean.

    Returns:
        ``(cleaned_text, removed)`` where ``removed`` is the number of blocks
        dropped. Runs of three or more newlines are collapsed to one blank line.
    """
    parts = re.split(r"\n{2,}", text)
    n_parts = len(parts)
    window = max(5, min(15, int(n_parts * 0.20)))

    kept = []
    removed = 0
    for idx, part in enumerate(parts):
        head = part.strip()
        # Candidate: inside the leading window, an un-numbered level-3 header.
        candidate = (
            idx < window
            and re.match(r"^### ", head) is not None
            and re.match(r"^### \d", head) is None
        )
        if candidate:
            following = parts[idx + 1].strip() if idx + 1 < n_parts else ""
            short_fm_body = len(following) < 250 and _FM_RE.search(following)
            if short_fm_body or _FM_RE.search(head):
                removed += 1
                continue
        kept.append(part)

    rebuilt = "\n\n".join(kept)
    return re.sub(r"\n{3,}", "\n\n", rebuilt), removed