Files
rag-from-scratch/chunks/chunker.py
T
davide e78c404211 feat(chunks): pipeline unificata Stage 1+2 con md_optimizer
chunker.py ora esegue in sequenza:
  - Stage 1 (md_optimizer.py): _content_list_v2.json + _model.json → _clean.md
    con pulizia TOC, frontespizio, sommari interni, merge titoli capitolo
  - Stage 2: _clean.md → chunks.json (paragraph-overlap, atomici tabelle/liste)

config.py esteso con CHAPTER_PREFIX_PATTERNS, SOMMARIO_PATTERNS,
MODEL_SKIP_LABELS, MODEL_ABSTRACT_LABELS, MIN_CONTENT_CHARS.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 16:07:40 +02:00

308 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Pipeline di chunking unificata (Stage 1 + Stage 2)
Stage 1 — Ottimizzazione Markdown (md_optimizer):
Legge _content_list_v2.json + _model.json di MinerU e produce _clean.md
con gerarchia H1/H2/H3 pulita (TOC, frontespizi e sommari rimossi).
Stage 2 — Chunking semantico:
Divide il _clean.md in chunk semantici:
- un chunk per paragrafo (mai due paragrafi nello stesso chunk)
- split a confine di frase se il paragrafo supera MAX_CHARS
- overlap di OVERLAP_SENTENCES frasi tra chunk consecutivi
- tabelle e liste sono blocchi atomici (non si spezzano)
Input: sources/<stem>/auto/<stem>_content_list_v2.json
sources/<stem>/auto/<stem>_model.json (opzionale)
Output: sources/<stem>/auto/<stem>_clean.md
chunks/<stem>/chunks.json
chunks/<stem>/meta.json
Uso:
python chunks/chunker.py --stem <stem>
python chunks/chunker.py # tutti gli stem in sources/
python chunks/chunker.py --stem <stem> --force
python chunks/chunker.py --stem <stem> --skip-optimize # salta Stage 1
"""
import argparse
import json
import re
import sys
from pathlib import Path
_HERE = Path(__file__).resolve().parent
if str(_HERE) not in sys.path:
sys.path.insert(0, str(_HERE))
import config as cfg
from md_optimizer import optimize as _optimize_md
# ─── Utilità ──────────────────────────────────────────────────────────────────
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences using the project-configured pattern.

    The input is trimmed and split with ``re.split`` on
    ``cfg.SENTENCE_SPLIT_RE``; each fragment is trimmed again and fragments
    that end up empty are dropped.

    Returns:
        The non-empty, whitespace-trimmed fragments in their original order.
    """
    sentences: list[str] = []
    for fragment in re.split(cfg.SENTENCE_SPLIT_RE, text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def context_to_meta(context: str) -> tuple[str, str]:
    """Split an ``'H1 > H2 > H3'`` heading path into ``(sezione, titolo)``.

    The path is split on the literal separator ``" > "``; empty or
    whitespace-only levels are ignored.

    Returns:
        * two or more levels: every level but the last re-joined with
          ``" > "``, and the last level;
        * exactly one level: that level and an empty title;
        * no levels: two empty strings.
    """
    levels: list[str] = []
    for piece in context.split(" > "):
        piece = piece.strip()
        if piece:
            levels.append(piece)

    if not levels:
        return "", ""

    *ancestors, leaf = levels
    if not ancestors:
        # A lone heading is reported as the section, with no title.
        return leaf, ""
    return " > ".join(ancestors), leaf
# ─── Parser Markdown ──────────────────────────────────────────────────────────
def parse_paragraphs(text: str) -> list[dict]:
    """Extract blocks from Markdown text together with their heading context.

    Each returned item has the shape
    ``{"context": "H1 > H2 > H3", "text": "...", "kind": "text|table|list"}``.

    Line handling:
      * lines starting with ``"# "``, ``"## "`` or ``"### "`` close the
        current block, update the heading path and clear deeper levels;
      * lines whose stripped form starts with ``"|"`` accumulate into a
        table block, those starting with ``"- "`` into a list block;
      * a blank line closes the current block;
      * any other line belongs to a plain-text block (closing a table or
        list block first if one is open).

    Blocks emitted before any heading get the context ``"documento"``.
    """
    heading_marks = ("# ", "## ", "### ")
    titles = ["", "", ""]          # current H1 / H2 / H3 titles
    blocks: list[dict] = []
    pending: list[str] = []        # raw lines of the block being built
    kind = "text"

    def close_block(block_kind: str) -> None:
        # Emit the pending lines as one block (if non-blank) and reset.
        content = "\n".join(pending).strip()
        pending.clear()
        if content:
            trail = " > ".join(title for title in titles if title)
            blocks.append({
                "context": trail or "documento",
                "text": content,
                "kind": block_kind,
            })

    for raw in text.splitlines():
        # Headings are detected on the raw line: an indented '#' is not one.
        depth = next(
            (i for i, mark in enumerate(heading_marks) if raw.startswith(mark)),
            None,
        )
        if depth is not None:
            close_block(kind)
            titles[depth] = raw[len(heading_marks[depth]):].strip()
            for deeper in range(depth + 1, len(titles)):
                titles[deeper] = ""
            kind = "text"
            continue

        bare = raw.strip()
        if bare.startswith("|"):
            atomic_kind = "table"
        elif bare.startswith("- "):
            atomic_kind = "list"
        else:
            atomic_kind = None

        if atomic_kind is not None:
            # Switching into (or between) table/list closes what came before.
            if kind != atomic_kind:
                close_block(kind)
                kind = atomic_kind
            pending.append(raw)
            continue

        if not bare:
            close_block(kind)
            kind = "text"
            continue

        # Ordinary text line: it cannot extend an open table or list.
        if kind != "text":
            close_block(kind)
            kind = "text"
        pending.append(raw)

    close_block(kind)
    return blocks
# ─── Chunking ─────────────────────────────────────────────────────────────────
def make_chunks(paragraphs: list[dict]) -> list[dict]:
    """Build chunk records from the output of ``parse_paragraphs``.

    Rules:
      * one chunk per paragraph, or several when the paragraph's sentences
        do not fit within ``cfg.MAX_CHARS``;
      * splits happen only at sentence boundaries; the first sentence of a
        chunk is always accepted, so a single over-long sentence is emitted
        whole;
      * the last ``cfg.OVERLAP_SENTENCES`` sentences of a chunk are prepended
        to the next chunk (no overlap when that setting is falsy);
      * ``table`` and ``list`` paragraphs are atomic and never split.

    Args:
        paragraphs: dicts with ``"text"``, ``"context"`` and ``"kind"`` keys.

    Returns:
        A list of dicts with keys ``chunk_id`` (``"c0"``, ``"c1"``, ...),
        ``text`` (``"[<context>]\\n<body>"``), ``sezione``, ``titolo``,
        ``context`` and ``n_chars`` (length of ``text``, header included).
    """
    # Loop-invariant configuration, read once.
    max_chars = cfg.MAX_CHARS
    overlap_n = cfg.OVERLAP_SENTENCES

    chunks: list[dict] = []

    def emit(context: str, sezione: str, titolo: str, body: str) -> None:
        # Append one chunk record; ids are sequential, so the next index is
        # always the current number of chunks.
        chunk_text = f"[{context}]\n{body}"
        chunks.append({
            "chunk_id": f"c{len(chunks)}",
            "text": chunk_text,
            "sezione": sezione,
            "titolo": titolo,
            "context": context,
            "n_chars": len(chunk_text),
        })

    def tail(sentences: list[str]) -> list[str]:
        # Sentences to carry into the next chunk as overlap.
        return sentences[-overlap_n:] if overlap_n else []

    # NOTE(review): the overlap is never reset between paragraphs, so it also
    # crosses heading/section boundaries — confirm this is intended.
    overlap_tail: list[str] = []

    for para in paragraphs:
        text = para["text"]
        context = para["context"]
        sezione, titolo = context_to_meta(context)

        # Atomic blocks (tables, lists): emitted whole, preceded by the overlap.
        if para["kind"] in ("table", "list"):
            body = " ".join([*overlap_tail, text]).strip()
            emit(context, sezione, titolo, body)
            # NOTE(review): the next overlap is taken from the table/list text
            # itself; if the sentence pattern finds no boundary in it, the
            # whole block is repeated in the following chunk — verify.
            overlap_tail = tail(split_sentences(text))
            continue

        # Text paragraph: pack sentences greedily up to max_chars.
        sents = split_sentences(text)
        if not sents:
            continue

        first, *rest = sents
        # The first sentence is accepted unconditionally so every chunk
        # carries new content even when the overlap alone is already long.
        current = [*overlap_tail, first]
        for sent in rest:
            # The size check covers the body only, not the "[context]" header.
            if len(" ".join([*current, sent])) <= max_chars:
                current.append(sent)
                continue
            emit(context, sezione, titolo, " ".join(current))
            overlap_tail = tail(current)
            current = [*overlap_tail, sent]

        emit(context, sezione, titolo, " ".join(current))
        overlap_tail = tail(current)

    return chunks
# ─── Pipeline per documento ───────────────────────────────────────────────────
def process_stem(stem: str, project_root: Path,
                 force: bool, skip_optimize: bool) -> bool:
    """Run Stage 1 (Markdown optimisation) and Stage 2 (chunking) for one document.

    Stage 1 delegates to ``_optimize_md`` unless ``skip_optimize`` is set.
    Stage 2 reads ``sources/<stem>/auto/<stem>_clean.md`` and writes
    ``chunks/<stem>/chunks.json`` plus ``chunks/<stem>/meta.json``, then
    prints a short size report.

    Args:
        stem: document name (sub-folder of ``sources/``).
        project_root: directory containing ``sources/`` and ``chunks/``.
        force: passed to Stage 1 and, in Stage 2, overwrite an existing
            ``chunks.json`` instead of skipping.
        skip_optimize: skip Stage 1 and use the ``_clean.md`` already on disk.

    Returns:
        ``True`` when chunks were written or an existing ``chunks.json`` was
        kept; ``False`` when Stage 1 fails, the clean Markdown is missing, or
        no paragraphs/chunks could be produced.
    """
    # ── Stage 1: Markdown optimisation ────────────────────────────────────────
    if skip_optimize:
        print(f"\n[Stage 1] skip (--skip-optimize)")
    elif not _optimize_md(stem, project_root, force=force):
        return False

    # ── Stage 2: chunking ─────────────────────────────────────────────────────
    md_path = project_root / "sources" / stem / "auto" / f"{stem}_clean.md"
    target_dir = project_root / "chunks" / stem
    chunks_path = target_dir / "chunks.json"

    print(f"[Stage 2] Chunking: {stem}")

    if not md_path.exists():
        print(f"{stem}_clean.md non trovato")
        return False

    if chunks_path.exists() and not force:
        print(f" ↩ chunks.json già presente — skip chunking")
        return True

    paragraphs = parse_paragraphs(md_path.read_text(encoding="utf-8"))
    if not paragraphs:
        print(f" ✗ Nessun paragrafo estratto da {md_path.name}")
        return False

    chunks = make_chunks(paragraphs)
    if not chunks:
        print(f" ✗ Nessun chunk generato")
        return False

    target_dir.mkdir(parents=True, exist_ok=True)
    chunks_payload = json.dumps(chunks, ensure_ascii=False, indent=2)
    chunks_path.write_text(chunks_payload, encoding="utf-8")

    # Record the chunking parameters next to the chunks themselves.
    meta = {
        "min_chars": cfg.MIN_CHARS,
        "max_chars": cfg.MAX_CHARS,
        "target_chars": cfg.MAX_CHARS,
        "overlap": cfg.OVERLAP_SENTENCES,
        "strategy": "paragraph_overlap",
    }
    meta_path = target_dir / "meta.json"
    meta_path.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")

    # Size report; n_chars includes the "[context]" header of each chunk.
    sizes = [chunk["n_chars"] for chunk in chunks]
    n_over = len([size for size in sizes if size > cfg.MAX_CHARS])
    n_under = len([size for size in sizes if size < cfg.MIN_CHARS])
    mean_size = int(sum(sizes) / len(sizes))

    print(f"{len(chunks)} chunk | media {mean_size} char | max {max(sizes)} char")
    if n_over:
        print(f" ⚠️ {n_over} chunk superano MAX_CHARS={cfg.MAX_CHARS}")
    if n_under:
        print(f" {n_under} chunk sotto MIN_CHARS={cfg.MIN_CHARS}")
    print(f" → chunks/{stem}/chunks.json")
    return True
# ─── Entry point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Command-line entry point: process one document (--stem) or every
    # document found under <project_root>/sources/, then exit with 0 only
    # if all of them succeeded.
    project_root = Path(__file__).parent.parent
    parser = argparse.ArgumentParser(
        description="Pipeline unificata MinerU → _clean.md → chunks.json"
    )
    parser.add_argument("--stem", help="Nome documento (sottocartella di sources/)")
    parser.add_argument("--force", action="store_true",
                        help="Rigenera _clean.md e chunks.json anche se esistono")
    parser.add_argument("--skip-optimize", action="store_true",
                        help="Salta Stage 1 (usa _clean.md già presente)")
    args = parser.parse_args()

    if args.stem:
        stems = [args.stem]
    else:
        sources_dir = project_root / "sources"
        # iterdir() raises when sources/ does not exist; treat a missing
        # directory like an empty one so the user gets the error message
        # below instead of a traceback.
        candidates = sources_dir.iterdir() if sources_dir.is_dir() else ()
        # A document qualifies when its MinerU content list is present.
        stems = sorted(
            p.name for p in candidates
            if p.is_dir()
            and (p / "auto" / f"{p.name}_content_list_v2.json").exists()
        )
        if not stems:
            print("Errore: nessun documento MinerU trovato in sources/")
            sys.exit(1)

    results = [
        process_stem(s, project_root, args.force, args.skip_optimize)
        for s in stems
    ]
    ok = sum(results)
    print(f"\n{'' if all(results) else '⚠️ '} {ok}/{len(results)} documenti processati")
    sys.exit(0 if all(results) else 1)