ebd2a43f84
Porta da main la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare chunks/, step-8/, rag.py, ollama/, retrieve.py, config.py. requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
142 lines
5.8 KiB
Python
142 lines
5.8 KiB
Python
import re
|
|
from pathlib import Path
|
|
|
|
# ─── Rilevamento lingua ───────────────────────────────────────────────────────
|
|
|
|
_IT_WORDS = frozenset([
|
|
"il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
|
|
"con", "da", "del", "della", "dei", "in", "ma", "se", "lo", "le",
|
|
"gli", "al", "alla", "ai", "alle", "sono", "ha", "hanno", "era",
|
|
"erano", "nel", "nella", "nei", "nelle", "questo", "questa", "così",
|
|
])
|
|
_EN_WORDS = frozenset([
|
|
"the", "of", "and", "to", "in", "is", "that", "it", "was", "for",
|
|
"on", "are", "as", "with", "his", "they", "at", "be", "this", "have",
|
|
"from", "or", "an", "but", "not", "by", "he", "she", "we", "you",
|
|
"which", "their", "been", "has", "would", "there", "when", "will",
|
|
])
|
|
_FR_WORDS = frozenset([
|
|
"le", "les", "de", "du", "des", "et", "un", "une", "est", "que",
|
|
"pour", "dans", "sur", "avec", "qui", "par", "pas", "plus", "au",
|
|
"ce", "se", "ou", "mais", "comme", "aussi",
|
|
])
|
|
_DE_WORDS = frozenset([
|
|
"der", "die", "das", "und", "in", "von", "zu", "den", "mit", "ist",
|
|
"auf", "eine", "als", "dem", "des", "sich", "nicht", "auch", "werden",
|
|
"bei", "nach", "oder", "wenn", "wird", "war",
|
|
])
|
|
_ES_WORDS = frozenset([
|
|
"el", "los", "las", "de", "en", "un", "una", "es", "que", "por",
|
|
"con", "del", "para", "como", "pero", "sus", "son", "los", "hay",
|
|
"todo", "esta", "este", "ser", "más", "ya",
|
|
])
|
|
|
|
|
|
def _detect_language(text: str) -> str:
    """Guess the language of *text* by counting common function words.

    The text is lower-cased and tokenised into runs of at least two letters;
    only the first 2000 tokens are scored. Every token found in a language's
    marker set adds one point to that language.

    Args:
        text: The text to classify.

    Returns:
        One of "it", "en", "fr", "de", "es" for the best-scoring language
        (ties go to the first language in that order), or "unknown" when no
        marker word occurs in the sample.
    """
    # [^\W\d_] means "any Unicode letter". An ASCII-only class would drop
    # every word containing an accented letter, so markers such as "così"
    # or "más" could never contribute to a score.
    # NOTE(review): the two-letter minimum means one-letter markers (the
    # "e" entry in _IT_WORDS) still never match — confirm that is intended.
    words = re.findall(r"\b[^\W\d_]{2,}\b", text.lower())
    sample = words[:2000]

    # Insertion order matters: max() returns the first best key on a tie.
    markers_by_lang = {
        "it": _IT_WORDS,
        "en": _EN_WORDS,
        "fr": _FR_WORDS,
        "de": _DE_WORDS,
        "es": _ES_WORDS,
    }
    scores = {
        lang: sum(1 for word in sample if word in markers)
        for lang, markers in markers_by_lang.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "unknown"
|
|
|
|
|
|
# ─── Analisi struttura ────────────────────────────────────────────────────────
|
|
|
|
def _count_headers(text: str, level: int) -> int:
|
|
prefix = "#" * level + " "
|
|
return len(re.findall(rf"(?m)^{re.escape(prefix)}", text))
|
|
|
|
|
|
def _count_paragraphs(text: str) -> int:
|
|
blocks = re.split(r"\n{2,}", text)
|
|
return sum(1 for b in blocks if b.strip() and not re.match(r"^#+\s", b.strip()))
|
|
|
|
|
|
def _split_sections(text: str, level: int) -> list[str]:
|
|
prefix = "#" * level + " "
|
|
parts = re.split(rf"(?m)^{re.escape(prefix)}.+", text)
|
|
return [p for p in parts[1:] if p.strip()]
|
|
|
|
|
|
def _parse_sections_with_body(text: str, level: int = 3) -> list[tuple[str, str]]:
|
|
"""Restituisce lista di (header_line, body_text) per tutti gli header al livello dato."""
|
|
prefix = "#" * level + " "
|
|
lines = text.split("\n")
|
|
sections: list[tuple[str, str]] = []
|
|
cur_hdr: str | None = None
|
|
cur_body: list[str] = []
|
|
for line in lines:
|
|
if line.startswith(prefix):
|
|
if cur_hdr is not None:
|
|
sections.append((cur_hdr, "\n".join(cur_body).strip()))
|
|
cur_hdr = line
|
|
cur_body = []
|
|
elif cur_hdr is not None:
|
|
cur_body.append(line)
|
|
if cur_hdr is not None:
|
|
sections.append((cur_hdr, "\n".join(cur_body).strip()))
|
|
return sections
|
|
|
|
|
|
def analyze(md_path: Path) -> dict:
    """Inspect a Markdown file and pick a chunking strategy for it.

    The file is read as UTF-8; read errors propagate unchanged. Header counts
    (levels 1-3) and the paragraph count drive the choice, checked in order:

    1. at least 5 h3 headers: split on h3 ("h3_aware"). If there are also at
       least 3 h2 headers, h3 sections average more than 5000 characters and
       h2 sections average under 70% of that, h2 is used instead.
    2. at least 3 h2 headers: split on h2 ("h2_paragraph_split").
    3. any header at all, or at least 3 paragraphs: blank-line separated
       blocks ("paragraph").
    4. otherwise: the whole text as a single unit ("sliding_window").

    Args:
        md_path: Path of the Markdown file to analyse.

    Returns:
        A dict with the keys ``livello_struttura``, ``n_h1``, ``n_h2``,
        ``n_h3``, ``n_paragrafi``, ``boundary_primario``, ``lingua_rilevata``,
        ``lunghezza_media_sezione`` (mean section length in characters,
        truncated to an int), ``strategia_chunking`` and ``avvertenze`` (a
        list of warnings about sections under 200 or over 800 characters).
    """
    text = md_path.read_text(encoding="utf-8")
    n_h1, n_h2, n_h3 = (_count_headers(text, depth) for depth in (1, 2, 3))
    n_paragrafi = _count_paragraphs(text)

    def mean_len(bodies: list[str]) -> float:
        # Average length in characters; 0 for an empty list.
        return sum(len(body) for body in bodies) / len(bodies) if bodies else 0

    # Each plan is (structure level, primary boundary, chunking strategy).
    h2_plan = (2, "h2", "h2_paragraph_split")
    paragraph_plan = (1, "paragrafo", "paragraph")

    if n_h3 >= 5:
        plan = (3, "h3", "h3_aware")
        section_bodies = _split_sections(text, 3)
        # When h3 sections are huge and h2 sections are clearly shorter,
        # h2 is the better boundary.
        if n_h2 >= 3:
            h2_bodies = _split_sections(text, 2)
            avg_h3 = mean_len(section_bodies)
            avg_h2 = mean_len(h2_bodies)
            if avg_h3 > 5000 and avg_h2 < avg_h3 * 0.7:
                plan = h2_plan
                section_bodies = h2_bodies
    elif n_h2 >= 3:
        plan = h2_plan
        section_bodies = _split_sections(text, 2)
    elif n_h1 + n_h2 + n_h3 >= 1 or n_paragrafi >= 3:
        # Too few headers to structure on: fall back to paragraph blocks.
        plan = paragraph_plan
        section_bodies = [blk for blk in re.split(r"\n{2,}", text) if blk.strip()]
    else:
        plan = (0, "nessuno", "sliding_window")
        section_bodies = [text] if text.strip() else []

    livello, boundary, strategia = plan

    lengths = [len(body) for body in section_bodies if body.strip()]
    lunghezza_media = int(sum(lengths) / len(lengths)) if lengths else 0
    lingua = _detect_language(text)

    short = sum(1 for size in lengths if size < 200)
    long_ = sum(1 for size in lengths if size > 800)
    avvertenze = []
    if short:
        avvertenze.append(f"{short} sezioni sotto i 200 caratteri — verranno accorpate")
    if long_:
        avvertenze.append(f"{long_} sezioni sopra i 800 caratteri — verranno divise")

    return {
        "livello_struttura": livello,
        "n_h1": n_h1,
        "n_h2": n_h2,
        "n_h3": n_h3,
        "n_paragrafi": n_paragrafi,
        "boundary_primario": boundary,
        "lingua_rilevata": lingua,
        "lunghezza_media_sezione": lunghezza_media,
        "strategia_chunking": strategia,
        "avvertenze": avvertenze,
    }
|