# rag-from-scratch/conversione/_pipeline/_structure.py

"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli."""
import re

from ._constants import (
    _TOC_KEYWORDS, _BIB_MARKERS_RE,
    _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
)
from ._helpers import (
    _is_allcaps_line, _allcaps_to_header,
    _extract_math_environments, _extract_article_headers,
)


def _t_remove_toc(text: str) -> tuple[str, int]:
    """Strip table-of-contents heading lines and the entry lines that follow them.

    A line counts as a TOC heading when, after removing leading ``#`` markers,
    the text before its first ``.`` (lower-cased, stripped) is a member of
    ``_TOC_KEYWORDS``. Following such a heading, blank lines and bullet lines
    that look like TOC entries are dropped; the first line of any other kind
    ends the TOC and is kept.

    Returns:
        The filtered text and ``1`` if at least one TOC heading was found,
        otherwise ``0``.
    """
    # Lines swallowed while inside a TOC: blank lines, bullets that start
    # with a digit, and bullets that end with a 1-3 digit page number.
    entry_patterns = (
        r"^\s*$",
        r"^\s*[-*+]\s+\d",
        r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$",
    )

    kept: list[str] = []
    inside_toc = False
    found_toc = False

    for raw in text.split("\n"):
        heading = re.sub(r"^#+\s*", "", raw.strip())
        leading = heading.partition(".")[0].strip().lower()

        if leading in _TOC_KEYWORDS:
            found_toc = inside_toc = True
            continue

        if inside_toc:
            if any(re.match(pattern, raw) for pattern in entry_patterns):
                continue
            # Anything that is not a TOC entry closes the TOC region;
            # the line itself is regular content and is preserved.
            inside_toc = False

        kept.append(raw)

    return "\n".join(kept), int(found_toc)


def _t_remove_orphan_toc(text: str) -> tuple[str, int]:
    """Remove TOC entries without dot leaders that ``_t_remove_toc`` misses.

    The text is split into blocks on runs of two or more newlines. Only the
    leading blocks are scanned: 25% of the block count, clamped to the range
    10..40. Two shapes are detected:

    (a) a run of 3 or more consecutive blocks matching ``_TOC_ITEM_RE``
        (at most 60 blocks per run); the block just before the run is removed
        too when it matches ``_TOC_HDR_WITH_PAGE_RE``;
    (b) a block matching ``_TOC_HDR_WITH_PAGE_RE`` whose following block is
        shorter than 300 characters and contains at least two
        "number + word" occurrences.

    Returns:
        The text with the flagged blocks removed (remaining blocks re-joined
        with blank lines) and the number of removed blocks. When nothing is
        flagged the input is returned untouched with a count of ``0``.
    """
    blocks = re.split(r"\n{2,}", text)
    total = len(blocks)
    cutoff = max(10, min(40, int(total * 0.25)))
    scan_end = min(cutoff, total)
    doomed: set[int] = set()

    pos = 0
    while pos < scan_end:
        current = blocks[pos].strip()

        # (a) run of consecutive TOC-item blocks
        if _TOC_ITEM_RE.match(current):
            run_cap = min(cutoff, pos + 60, total)
            run_end = pos
            while run_end < run_cap and _TOC_ITEM_RE.match(blocks[run_end].strip()):
                run_end += 1
            if run_end - pos >= 3:
                doomed.update(range(pos, run_end))
                # The heading that introduced the run goes too if it carries
                # a page number.
                if pos > 0 and _TOC_HDR_WITH_PAGE_RE.match(blocks[pos - 1].strip()):
                    doomed.add(pos - 1)
                pos = run_end
                continue

        # (b) heading with page number followed by a short list of numbered entries
        if _TOC_HDR_WITH_PAGE_RE.match(current):
            following = pos + 1
            body = blocks[following].strip() if following < total else ""
            numbered_entries = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", body)
            if len(numbered_entries) >= 2 and len(body) < 300:
                doomed.add(pos)
                if following < total:
                    doomed.add(following)
                pos += 2
                continue

        pos += 1

    if not doomed:
        return text, 0

    survivors = [blk for n, blk in enumerate(blocks) if n not in doomed]
    rejoined = "\n\n".join(survivors)
    return re.sub(r"\n{3,}", "\n\n", rejoined), len(doomed)


def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
    """Turn all-caps lines into headers via ``_allcaps_to_header``.

    The text is processed paragraph by paragraph (split on a blank line).
    A paragraph that is a single line and passes ``_is_allcaps_line`` is
    replaced as a whole. In every other paragraph each line is checked
    individually and converted only when it also has more than 3
    non-whitespace-trimmed characters.

    Returns:
        The rewritten text and the number of lines that were converted.
    """
    converted_total = 0
    rebuilt: list[str] = []

    for paragraph in text.split("\n\n"):
        lone = paragraph.strip()

        # Stand-alone all-caps paragraph: convert the stripped form directly.
        if "\n" not in lone and _is_allcaps_line(lone):
            rebuilt.append(_allcaps_to_header(lone))
            converted_total += 1
            continue

        # Otherwise inspect the paragraph line by line.
        out_lines: list[str] = []
        for row in paragraph.split("\n"):
            if _is_allcaps_line(row) and len(row.strip()) > 3:
                out_lines.append(_allcaps_to_header(row))
                converted_total += 1
            else:
                out_lines.append(row)
        rebuilt.append("\n".join(out_lines))

    return "\n\n".join(rebuilt), converted_total


def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
    count = 0

    def _num_repl(m: re.Match) -> str:
        nonlocal count
        content = m.group(2).strip()
        if content.endswith(".") and len(content) > 40:
            return m.group(0)
        if _BIB_MARKERS_RE.search(content):
            return m.group(0)
        count += 1
        return f"### {m.group(1)}.\n\n{content}"

    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)

    def _num_letter_repl(m: re.Match) -> str:
        nonlocal count
        count += 1
        return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"

    text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)

    if not has_exercises:
        def _aphorism_repl(m: re.Match) -> str:
            nonlocal count
            content = m.group(2).strip()
            if _BIB_MARKERS_RE.search(content):
                return m.group(0)
            count += 1
            return f"\n\n### {m.group(1)}.\n\n{content}"

        text = re.sub(
            r"^-\s+(\d{1,3})\.\s+(.{10,})$",
            _aphorism_repl,
            text,
            flags=re.MULTILINE,
        )

    def _list_section_repl(m: re.Match) -> str:
        nonlocal count
        num     = m.group(1)
        content = m.group(2).strip()
        if _BIB_MARKERS_RE.search(content):
            return m.group(0)
        count += 1
        split = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            content,
        )
        if split and split.start() >= 3:
            title = content[: split.start()].strip()
            body  = content[split.end():].strip()
            if len(body) >= 20:
                return f"\n\n### {num}. {title}\n\n{body}"
        return f"\n\n### {num}. {content}"

    text = re.sub(
        r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",
        _list_section_repl,
        text,
        flags=re.MULTILINE,
    )
    return text, count


def _t_extract_math(text: str) -> tuple[str, int]:
    """Pipeline step that delegates unchanged to ``_extract_math_environments``.

    The helper lives in ``._helpers`` and is not visible here; per the return
    annotation it yields a (text, count) pair — presumably the transformed
    text and the number of changes, to be confirmed against the helper.
    """
    return _extract_math_environments(text)


def _t_extract_articles(text: str) -> tuple[str, int]:
    """Pipeline step that delegates unchanged to ``_extract_article_headers``.

    The helper lives in ``._helpers`` and is not visible here; per the return
    annotation it yields a (text, count) pair — presumably the transformed
    text and the number of changes, to be confirmed against the helper.
    """
    return _extract_article_headers(text)