import re
from pathlib import Path

# ─── Language detection ───────────────────────────────────────────────────────

_IT_WORDS = frozenset([
    "il", "la", "di", "e", "che", "non", "per", "un", "una", "si", "con", "da",
    "del", "della", "dei", "in", "ma", "se", "lo", "le", "gli", "al", "alla",
    "ai", "alle", "sono", "ha", "hanno", "era", "erano", "nel", "nella", "nei",
    "nelle", "questo", "questa", "così",
])
_EN_WORDS = frozenset([
    "the", "of", "and", "to", "in", "is", "that", "it", "was", "for", "on",
    "are", "as", "with", "his", "they", "at", "be", "this", "have", "from",
    "or", "an", "but", "not", "by", "he", "she", "we", "you", "which", "their",
    "been", "has", "would", "there", "when", "will",
])
_FR_WORDS = frozenset([
    "le", "les", "de", "du", "des", "et", "un", "une", "est", "que", "pour",
    "dans", "sur", "avec", "qui", "par", "pas", "plus", "au", "ce", "se", "ou",
    "mais", "comme", "aussi",
])
_DE_WORDS = frozenset([
    "der", "die", "das", "und", "in", "von", "zu", "den", "mit", "ist", "auf",
    "eine", "als", "dem", "des", "sich", "nicht", "auch", "werden", "bei",
    "nach", "oder", "wenn", "wird", "war",
])
_ES_WORDS = frozenset([
    "el", "los", "las", "de", "en", "un", "una", "es", "que", "por", "con",
    "del", "para", "como", "pero", "sus", "son", "hay", "todo", "esta", "este",
    "ser", "más", "ya",
])

# Language code -> stop-word set. The insertion order is also the tie-break
# order used by _detect_language (max() keeps the first highest score).
_LANGUAGE_WORDS = {
    "it": _IT_WORDS,
    "en": _EN_WORDS,
    "fr": _FR_WORDS,
    "de": _DE_WORDS,
    "es": _ES_WORDS,
}

# A token is a run of two or more letters delimited by word boundaries.
# `[^\W\d_]` is "a word character that is neither a digit nor an underscore",
# i.e. a letter in any script; an ASCII-only class would discard every word
# containing an accented letter ("così", "más") before it could be scored.
# NOTE: because tokens are at least two letters long, single-letter entries
# in the word sets (e.g. "e") never match.
_WORD_RE = re.compile(r"\b[^\W\d_]{2,}\b")

# Only the first _SAMPLE_SIZE tokens of the text are scored.
_SAMPLE_SIZE = 2000


def _detect_language(text: str) -> str:
    """Guess the language of *text* by counting known stop words.

    The text is lower-cased and tokenized, and the first ``_SAMPLE_SIZE``
    tokens are checked against each language's stop-word set.

    Args:
        text: The text to classify.

    Returns:
        One of ``"it"``, ``"en"``, ``"fr"``, ``"de"``, ``"es"`` for the
        language with the most stop-word hits (ties go to the language listed
        first in ``_LANGUAGE_WORDS``), or ``"unknown"`` when no token matches
        any set.
    """
    sample = _WORD_RE.findall(text.lower())[:_SAMPLE_SIZE]
    scores = {
        lang: sum(1 for word in sample if word in vocabulary)
        for lang, vocabulary in _LANGUAGE_WORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "unknown"


# ─── Structure analysis ───────────────────────────────────────────────────────

def _count_headers(text: str, level: int) -> int:
    """Count lines of *text* that start with exactly *level* ``#`` and a space.

    The required trailing space means deeper headers are not counted
    ("## x" does not start with "# ").
    """
    prefix = "#" * level + " "
    return len(re.findall(rf"(?m)^{re.escape(prefix)}", text))


def _count_paragraphs(text: str) -> int:
    """Count blank-line-separated blocks that are non-empty and not headers.

    A block is treated as a header when, once stripped, it starts with one or
    more ``#`` followed by whitespace.
    """
    blocks = re.split(r"\n{2,}", text)
    return sum(
        1
        for block in blocks
        if block.strip() and not re.match(r"^#+\s", block.strip())
    )


def _split_sections(text: str, level: int) -> list[str]:
    """Return the bodies that follow each header line of the given *level*.

    The text is split on whole header lines; whatever precedes the first
    header is discarded, as are bodies containing only whitespace. Bodies are
    returned unstripped.
    """
    prefix = "#" * level + " "
    parts = re.split(rf"(?m)^{re.escape(prefix)}.+", text)
    return [part for part in parts[1:] if part.strip()]


def _parse_sections_with_body(text: str, level: int = 3) -> list[tuple[str, str]]:
    """Return ``(header_line, body_text)`` pairs for every header at *level*.

    Lines before the first matching header are ignored. Each body is the
    newline-joined run of lines up to the next matching header (or the end of
    the text), stripped of surrounding whitespace.
    """
    prefix = "#" * level + " "
    sections: list[tuple[str, str]] = []
    cur_hdr: str | None = None
    cur_body: list[str] = []
    for line in text.split("\n"):
        if line.startswith(prefix):
            # A new header closes the section collected so far.
            if cur_hdr is not None:
                sections.append((cur_hdr, "\n".join(cur_body).strip()))
            cur_hdr = line
            cur_body = []
        elif cur_hdr is not None:
            cur_body.append(line)
    if cur_hdr is not None:
        sections.append((cur_hdr, "\n".join(cur_body).strip()))
    return sections


def analyze(md_path: Path) -> dict:
    """Inspect a Markdown file and pick a chunking strategy for it.

    The file is read as UTF-8. The strategy is chosen from header and
    paragraph counts:

    * at least 5 h3 headers: split on h3 (``"h3_aware"``), unless there are
      also at least 3 h2 headers, the average h3 body exceeds 5000 characters
      and the average h2 body is below 70% of the average h3 body, in which
      case h2 is used instead;
    * otherwise at least 3 h2 headers: split on h2 (``"h2_paragraph_split"``);
    * otherwise any h1/h2/h3 header, or at least 3 paragraphs: split on blank
      lines (``"paragraph"``);
    * otherwise: the whole text is one section (``"sliding_window"``).

    Args:
        md_path: Path of the Markdown file to analyze.

    Returns:
        A dict with the keys ``livello_struttura``, ``n_h1``, ``n_h2``,
        ``n_h3``, ``n_paragrafi``, ``boundary_primario``, ``lingua_rilevata``,
        ``lunghezza_media_sezione``, ``strategia_chunking`` and ``avvertenze``
        (a list of warning strings about sections shorter than 200 or longer
        than 800 characters).
    """
    text = md_path.read_text(encoding="utf-8")

    n_h1 = _count_headers(text, 1)
    n_h2 = _count_headers(text, 2)
    n_h3 = _count_headers(text, 3)
    n_paragrafi = _count_paragraphs(text)

    if n_h3 >= 5:
        livello, boundary, strategia = 3, "h3", "h3_aware"
        section_bodies = _split_sections(text, 3)
        # If the h3 sections are huge and the h2 sections are shorter,
        # h2 is the right boundary.
        if n_h2 >= 3:
            h2_bodies = _split_sections(text, 2)
            avg_h3 = (
                sum(len(b) for b in section_bodies) / len(section_bodies)
                if section_bodies
                else 0
            )
            avg_h2 = (
                sum(len(b) for b in h2_bodies) / len(h2_bodies)
                if h2_bodies
                else 0
            )
            if avg_h3 > 5000 and avg_h2 < avg_h3 * 0.7:
                livello, boundary, strategia = 2, "h2", "h2_paragraph_split"
                section_bodies = h2_bodies
    elif n_h2 >= 3:
        livello, boundary, strategia = 2, "h2", "h2_paragraph_split"
        section_bodies = _split_sections(text, 2)
    elif n_h1 + n_h2 + n_h3 >= 1 or n_paragrafi >= 3:
        # Too few headers to drive the split: fall back to blank-line blocks.
        livello, boundary, strategia = 1, "paragrafo", "paragraph"
        section_bodies = [b for b in re.split(r"\n{2,}", text) if b.strip()]
    else:
        livello, boundary, strategia = 0, "nessuno", "sliding_window"
        section_bodies = [text] if text.strip() else []

    lengths = [len(b) for b in section_bodies if b.strip()]
    lunghezza_media = int(sum(lengths) / len(lengths)) if lengths else 0
    lingua = _detect_language(text)

    avvertenze = []
    short = sum(1 for n in lengths if n < 200)
    long_ = sum(1 for n in lengths if n > 800)
    if short:
        avvertenze.append(f"{short} sezioni sotto i 200 caratteri — verranno accorpate")
    if long_:
        avvertenze.append(f"{long_} sezioni sopra i 800 caratteri — verranno divise")

    return {
        "livello_struttura": livello,
        "n_h1": n_h1,
        "n_h2": n_h2,
        "n_h3": n_h3,
        "n_paragrafi": n_paragrafi,
        "boundary_primario": boundary,
        "lingua_rilevata": lingua,
        "lunghezza_media_sezione": lunghezza_media,
        "strategia_chunking": strategia,
        "avvertenze": avvertenze,
    }