# rag-from-scratch/conversione/_pipeline/_text.py

"""Trasformazioni sul testo: merge paragrafi, whitespace, poesia, versi."""
import re

from ._constants import _VERSE_NUM_RE


def _t_merge_paragraphs(text: str) -> tuple[str, int]:
    """Join paragraphs that were split in the middle of a sentence.

    The text is cut on blank lines ("\\n\\n"). A paragraph that does not end
    with closing punctuation, and is neither a heading ("#") nor a table row
    ("|"), absorbs the following paragraph (joined with a single space) as
    long as that follower is not empty, a heading, a table row, a numbered
    list item or a bullet item. Afterwards every line starting with "|---|"
    is removed together with the whitespace that follows it.

    Args:
        text: Document text with paragraphs separated by blank lines.

    Returns:
        A ``(text, count)`` tuple where ``count`` is the number of joins made.
    """
    closers = frozenset(".?!\xbb)\"'")

    def wants_continuation(chunk: str) -> bool:
        # Only non-empty prose that stops without closing punctuation is
        # considered an unfinished sentence.
        return (
            bool(chunk)
            and not chunk.startswith(("#", "|"))
            and chunk[-1] not in closers
        )

    def can_be_absorbed(chunk: str) -> bool:
        # Headings, table rows and list items must stay in their own block.
        if not chunk or chunk.startswith(("#", "|")):
            return False
        return not (re.match(r"^\d+\.", chunk) or re.match(r"^[-*+]\s", chunk))

    paragraphs = text.split("\n\n")
    last = len(paragraphs) - 1
    out = []
    merges = 0
    pos = 0
    while pos <= last:
        # An untouched paragraph is emitted verbatim (surrounding whitespace
        # included); a merged one is built from the stripped pieces.
        current = paragraphs[pos]
        core = current.strip()
        while pos < last and wants_continuation(core):
            follower = paragraphs[pos + 1].strip()
            if not can_be_absorbed(follower):
                break
            current = core = f"{core} {follower}"
            merges += 1
            pos += 1
        out.append(current)
        pos += 1

    joined = "\n\n".join(out)
    return re.sub(r"(?m)^\|---\|\s*", "", joined), merges


def _t_normalize_whitespace(text: str) -> tuple[str, int]:
    """Collapse runs of two or more spaces into one, line by line.

    Lines that contain only whitespace are left exactly as they are. Only the
    space character is affected; tabs are not touched.

    Args:
        text: Text to clean.

    Returns:
        A ``(text, count)`` tuple; the count is always 0 for this step.
    """
    cleaned = []
    for row in text.split("\n"):
        if row.strip():
            row = re.sub(r"  +", " ", row)
        cleaned.append(row)
    return "\n".join(cleaned), 0


def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
    """Squeeze every run of three or more newlines down to exactly two.

    Args:
        text: Text to clean.

    Returns:
        A ``(text, count)`` tuple; the count is always 0 for this step.
    """
    squeezed = re.sub(r"\n{3,}", "\n\n", text)
    return squeezed, 0


def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
    """Re-insert line breaks in blocks that carry inline verse numbers.

    Each blank-line-separated block is scanned with ``_VERSE_NUM_RE``. A block
    is rewritten only when it is not empty, is not a heading, contains at
    least two matches, the differences between consecutive numbers take at
    most two distinct values, and the first difference is between 1 and 5.
    In a rewritten block every match is replaced by its first group
    (right-stripped) followed by a newline, or by a blank line when the verse
    number is a multiple of three times the first difference.

    Args:
        text: Document text with blocks separated by blank lines.

    Returns:
        A ``(text, count)`` tuple where ``count`` adds up the number of
        matches in every block that was actually changed.
    """
    restored = 0
    rebuilt = []

    for chunk in text.split("\n\n"):
        body = chunk.strip()
        if body and not body.startswith("#"):
            hits = list(_VERSE_NUM_RE.finditer(body))
        else:
            hits = []

        # Fewer than two numbers: nothing to infer a numbering step from.
        if len(hits) < 2:
            rebuilt.append(chunk)
            continue

        numbers = [int(hit.group(2)) for hit in hits]
        gaps = [later - earlier for earlier, later in zip(numbers, numbers[1:])]
        stride = gaps[0]
        if len(set(gaps)) > 2 or stride < 1 or stride > 5:
            rebuilt.append(chunk)
            continue

        stanza_period = stride * 3

        def _break_after_verse(hit: re.Match) -> str:
            # A blank line is emitted on every third numbered verse.
            number = int(hit.group(2))
            if number % stanza_period == 0:
                return hit.group(1).rstrip() + "\n\n"
            return hit.group(1).rstrip() + "\n"

        reflowed = _VERSE_NUM_RE.sub(_break_after_verse, body)
        if reflowed != body:
            restored += len(hits)
        rebuilt.append(reflowed)

    return "\n\n".join(rebuilt), restored


def _t_demote_verse_headers(text: str) -> tuple[str, int]:
    """Turn heading lines that are really numbered verses back into plain text.

    A line is demoted when all of the following hold:

    * it starts with 1-6 ``#`` characters, followed by same-line whitespace
      and at least 20 more characters;
    * its content ends with whitespace and a 1-4 digit number;
    * once that number is removed, the content still contains sentence
      punctuation followed by one or more spaces and a letter or opening
      quote (i.e. it reads like running text, not like a title).

    A demoted line loses both its ``#`` prefix and the trailing number; every
    other line is left untouched.

    Args:
        text: Document text, scanned line by line.

    Returns:
        A ``(text, count)`` tuple where ``count`` is the number of demoted
        lines.
    """
    trailing_number = re.compile(r"\s\d{1,4}\s*$")
    # NOTE(review): the doubled '"' in the last character class is redundant;
    # it may originally have held typographic quotes - confirm with the author.
    inner_punctuation = re.compile(
        r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab""]'
    )
    count = 0

    def _demote(m: re.Match) -> str:
        nonlocal count
        content = m.group(2).strip()
        number = trailing_number.search(content)
        if number is None:
            return m.group(0)
        # The number match is anchored at the end of the (stripped) content,
        # so everything before it is the verse text itself.
        inner = content[: number.start()]
        if not inner_punctuation.search(inner):
            return m.group(0)
        count += 1
        return inner

    # The separator after the hashes is limited to same-line whitespace:
    # a plain ``\s+`` also matches newlines, which made an empty "#" line
    # swallow the following line and treat it as the heading text.
    text = re.sub(
        r"^(#{1,6})[^\S\r\n]+(.{20,})$", _demote, text, flags=re.MULTILINE
    )
    return text, count