# rag-from-scratch/conversione/_pipeline/structure.py

import re
from pathlib import Path

# ─── Rilevamento lingua ───────────────────────────────────────────────────────

_IT_WORDS = frozenset([
    "il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
    "con", "da", "del", "della", "dei", "in", "ma", "se", "lo", "le",
    "gli", "al", "alla", "ai", "alle", "sono", "ha", "hanno", "era",
    "erano", "nel", "nella", "nei", "nelle", "questo", "questa", "così",
])
_EN_WORDS = frozenset([
    "the", "of", "and", "to", "in", "is", "that", "it", "was", "for",
    "on", "are", "as", "with", "his", "they", "at", "be", "this", "have",
    "from", "or", "an", "but", "not", "by", "he", "she", "we", "you",
    "which", "their", "been", "has", "would", "there", "when", "will",
])
_FR_WORDS = frozenset([
    "le", "les", "de", "du", "des", "et", "un", "une", "est", "que",
    "pour", "dans", "sur", "avec", "qui", "par", "pas", "plus", "au",
    "ce", "se", "ou", "mais", "comme", "aussi",
])
_DE_WORDS = frozenset([
    "der", "die", "das", "und", "in", "von", "zu", "den", "mit", "ist",
    "auf", "eine", "als", "dem", "des", "sich", "nicht", "auch", "werden",
    "bei", "nach", "oder", "wenn", "wird", "war",
])
_ES_WORDS = frozenset([
    "el", "los", "las", "de", "en", "un", "una", "es", "que", "por",
    "con", "del", "para", "como", "pero", "sus", "son", "los", "hay",
    "todo", "esta", "este", "ser", "más", "ya",
])


def _detect_language(text: str) -> str:
    words  = re.findall(r"\b[a-zA-Z]{2,}\b", text.lower())
    sample = words[:2000]
    scores = {
        "it": sum(1 for w in sample if w in _IT_WORDS),
        "en": sum(1 for w in sample if w in _EN_WORDS),
        "fr": sum(1 for w in sample if w in _FR_WORDS),
        "de": sum(1 for w in sample if w in _DE_WORDS),
        "es": sum(1 for w in sample if w in _ES_WORDS),
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "unknown"


# ─── Analisi struttura ────────────────────────────────────────────────────────

def _count_headers(text: str, level: int) -> int:
    prefix = "#" * level + " "
    return len(re.findall(rf"(?m)^{re.escape(prefix)}", text))


def _count_paragraphs(text: str) -> int:
    blocks = re.split(r"\n{2,}", text)
    return sum(1 for b in blocks if b.strip() and not re.match(r"^#+\s", b.strip()))


def _split_sections(text: str, level: int) -> list[str]:
    prefix = "#" * level + " "
    parts  = re.split(rf"(?m)^{re.escape(prefix)}.+", text)
    return [p for p in parts[1:] if p.strip()]


def _parse_sections_with_body(text: str, level: int = 3) -> list[tuple[str, str]]:
    """Restituisce lista di (header_line, body_text) per tutti gli header al livello dato."""
    prefix   = "#" * level + " "
    lines    = text.split("\n")
    sections: list[tuple[str, str]] = []
    cur_hdr:  str | None = None
    cur_body: list[str]  = []
    for line in lines:
        if line.startswith(prefix):
            if cur_hdr is not None:
                sections.append((cur_hdr, "\n".join(cur_body).strip()))
            cur_hdr  = line
            cur_body = []
        elif cur_hdr is not None:
            cur_body.append(line)
    if cur_hdr is not None:
        sections.append((cur_hdr, "\n".join(cur_body).strip()))
    return sections


def analyze(md_path: Path) -> dict:
    """Inspect a Markdown file and describe its structure.

    The file is read as UTF-8. The section boundary is chosen as follows:

    * 5 or more ``###`` headers: split on h3. If there are also at least 3
      ``##`` headers, the average h3 body exceeds 5000 characters and the
      average h2 body is under 70% of the h3 average, split on h2 instead.
    * otherwise, 3 or more ``##`` headers: split on h2.
    * otherwise, at least one h1/h2/h3 header or at least 3 paragraphs:
      split on blank-line separated blocks.
    * otherwise: the whole text is a single section (sliding window).

    Args:
        md_path: Path of the Markdown file to analyse.

    Returns:
        A dict with the header counts, paragraph count, chosen structure
        level, primary boundary, detected language, average section length
        (truncated to int), chunking strategy name and a list of warnings
        about sections shorter than 200 or longer than 800 characters.
    """
    content = md_path.read_text(encoding="utf-8")
    h1_total = _count_headers(content, 1)
    h2_total = _count_headers(content, 2)
    h3_total = _count_headers(content, 3)
    paragraph_total = _count_paragraphs(content)

    if h3_total >= 5:
        plan = (3, "h3", "h3_aware")
        bodies = _split_sections(content, 3)
        if h2_total >= 3:
            # When h3 bodies are huge and h2 bodies shorter, h2 is the
            # better boundary.
            h2_chunks = _split_sections(content, 2)
            mean_h3 = sum(map(len, bodies)) / len(bodies) if bodies else 0
            mean_h2 = sum(map(len, h2_chunks)) / len(h2_chunks) if h2_chunks else 0
            if mean_h3 > 5000 and mean_h2 < mean_h3 * 0.7:
                plan = (2, "h2", "h2_paragraph_split")
                bodies = h2_chunks
    elif h2_total >= 3:
        plan = (2, "h2", "h2_paragraph_split")
        bodies = _split_sections(content, 2)
    elif h1_total + h2_total + h3_total >= 1 or paragraph_total >= 3:
        # Too few headers to rely on: fall back to blank-line blocks.
        plan = (1, "paragrafo", "paragraph")
        bodies = [blk for blk in re.split(r"\n{2,}", content) if blk.strip()]
    else:
        plan = (0, "nessuno", "sliding_window")
        bodies = [content] if content.strip() else []

    depth, boundary_name, strategy = plan

    sizes = [len(body) for body in bodies if body.strip()]
    mean_size = int(sum(sizes) / len(sizes)) if sizes else 0
    language = _detect_language(content)

    too_short = len([size for size in sizes if size < 200])
    too_long = len([size for size in sizes if size > 800])
    warnings = []
    if too_short:
        warnings.append(f"{too_short} sezioni sotto i 200 caratteri — verranno accorpate")
    if too_long:
        warnings.append(f"{too_long} sezioni sopra i 800 caratteri — verranno divise")

    return {
        "livello_struttura":     depth,
        "n_h1":                  h1_total,
        "n_h2":                  h2_total,
        "n_h3":                  h3_total,
        "n_paragrafi":           paragraph_total,
        "boundary_primario":     boundary_name,
        "lingua_rilevata":       language,
        "lunghezza_media_sezione": mean_size,
        "strategia_chunking":    strategy,
        "avvertenze":            warnings,
    }