e78c404211
chunker.py ora esegue in sequenza:
- Stage 1 (md_optimizer.py): _content_list_v2.json + _model.json → _clean.md
con pulizia TOC, frontespizio, sommari interni, merge titoli capitolo
- Stage 2: _clean.md → chunks.json (paragraph-overlap, atomici tabelle/liste)
config.py esteso con CHAPTER_PREFIX_PATTERNS, SOMMARIO_PATTERNS,
MODEL_SKIP_LABELS, MODEL_ABSTRACT_LABELS, MIN_CONTENT_CHARS.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
308 lines
11 KiB
Python
308 lines
11 KiB
Python
#!/usr/bin/env python3
"""
Pipeline di chunking unificata (Stage 1 + Stage 2)

Stage 1 — Ottimizzazione Markdown (md_optimizer):
  Legge _content_list_v2.json + _model.json di MinerU e produce _clean.md
  con gerarchia H1/H2/H3 pulita (TOC, frontespizi e sommari rimossi).

Stage 2 — Chunking semantico:
  Divide il _clean.md in chunk semantici:
  - un chunk per paragrafo (mai due paragrafi nello stesso chunk)
  - split a confine di frase se il paragrafo supera MAX_CHARS
  - overlap di OVERLAP_SENTENCES frasi tra chunk consecutivi
  - tabelle e liste sono blocchi atomici (non si spezzano)

Input:  sources/<stem>/auto/<stem>_content_list_v2.json
        sources/<stem>/auto/<stem>_model.json  (opzionale)
Output: sources/<stem>/auto/<stem>_clean.md
        chunks/<stem>/chunks.json
        chunks/<stem>/meta.json

Uso:
  python chunks/chunker.py --stem <stem>
  python chunks/chunker.py                        # tutti gli stem in sources/
  python chunks/chunker.py --stem <stem> --force
  python chunks/chunker.py --stem <stem> --skip-optimize  # salta Stage 1
"""
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
_HERE = Path(__file__).resolve().parent
|
||
if str(_HERE) not in sys.path:
|
||
sys.path.insert(0, str(_HERE))
|
||
import config as cfg
|
||
from md_optimizer import optimize as _optimize_md
|
||
|
||
|
||
# ─── Utilità ──────────────────────────────────────────────────────────────────
|
||
|
||
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences using ``cfg.SENTENCE_SPLIT_RE``.

    The input is stripped before splitting; every resulting piece is
    stripped as well and empty pieces are discarded.
    """
    pieces = re.split(cfg.SENTENCE_SPLIT_RE, text.strip())
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
|
||
|
||
|
||
def context_to_meta(context: str) -> tuple[str, str]:
    """Split a heading path ``'H1 > H2 > H3'`` into ``(section, title)``.

    The last path element becomes the title and everything before it,
    re-joined with ``' > '``, becomes the section. A single-element path
    yields ``(element, "")``; an empty path yields ``("", "")``.
    """
    segments = [segment.strip() for segment in context.split(" > ")]
    segments = [segment for segment in segments if segment]

    if not segments:
        return "", ""
    if len(segments) == 1:
        return segments[0], ""

    *ancestors, leaf = segments
    return " > ".join(ancestors), leaf
|
||
|
||
|
||
# ─── Parser Markdown ──────────────────────────────────────────────────────────
|
||
|
||
def parse_paragraphs(text: str) -> list[dict]:
    """Extract blocks from the cleaned Markdown together with their heading path.

    Returns a list of ``{"context": "H1 > H2 > H3", "text": "...",
    "kind": "text" | "table" | "list"}`` dicts, in document order.

    Rules:
    - ``#``, ``##`` and ``###`` headings update the heading path and close
      the block being accumulated; a heading resets all deeper levels.
    - A blank line closes the current block.
    - Consecutive lines starting with ``|`` form one table block and
      consecutive lines starting with ``- `` form one list block (leading
      whitespace is ignored for this test).
    - Blocks found before any heading get the context ``"documento"``.
    """
    headings = ["", "", ""]  # current H1, H2, H3 titles
    blocks: list[dict] = []
    pending: list[str] = []
    kind = "text"

    def emit() -> None:
        # Close the block being accumulated; whitespace-only blocks are dropped.
        body = "\n".join(pending).strip()
        pending.clear()
        if not body:
            return
        trail = " > ".join(title for title in headings if title)
        blocks.append({"context": trail or "documento", "text": body, "kind": kind})

    for raw in text.splitlines():
        # Heading lines: "# ", "## " or "### " at column 0 (deeper levels are text).
        level = next(
            (depth for depth in (1, 2, 3) if raw.startswith("#" * depth + " ")), 0
        )
        if level:
            emit()
            headings[level - 1] = raw[level + 1:].strip()
            for deeper in range(level, len(headings)):
                headings[deeper] = ""
            kind = "text"
            continue

        stripped = raw.strip()
        if not stripped:
            emit()
            kind = "text"
            continue

        if stripped.startswith("|"):
            line_kind = "table"
        elif stripped.startswith("- "):
            line_kind = "list"
        else:
            line_kind = "text"

        # A change of kind closes the previous block before the line is stored.
        if line_kind != kind:
            emit()
            kind = line_kind
        pending.append(raw)

    emit()
    return blocks
|
||
|
||
|
||
# ─── Chunking ─────────────────────────────────────────────────────────────────
|
||
|
||
def make_chunks(paragraphs: list[dict]) -> list[dict]:
    """Turn the blocks produced by ``parse_paragraphs`` into chunk records.

    Rules:
    - one chunk per paragraph, or several when the paragraph body would
      exceed ``cfg.MAX_CHARS``;
    - text is split only at sentence boundaries; a single sentence longer
      than ``cfg.MAX_CHARS`` is emitted whole;
    - the last ``cfg.OVERLAP_SENTENCES`` sentences of a chunk are prepended
      to the following chunk (overlap);
    - tables and lists are atomic blocks and are never split.

    ``cfg.MAX_CHARS`` is checked against the body only; the stored
    ``n_chars`` also counts the ``[context]`` header line.

    Each record has the keys ``chunk_id``, ``text``, ``sezione``,
    ``titolo``, ``context`` and ``n_chars``.
    """
    emitted: list[dict] = []
    carry: list[str] = []  # sentences to prepend to the next chunk (overlap)

    def emit(context: str, sezione: str, titolo: str, body: str) -> None:
        # Every chunk starts with its heading path on a dedicated line.
        chunk_text = f"[{context}]\n{body}"
        emitted.append({
            "chunk_id": f"c{len(emitted)}",
            "text": chunk_text,
            "sezione": sezione,
            "titolo": titolo,
            "context": context,
            "n_chars": len(chunk_text),
        })

    def overlap_of(sentences: list[str]) -> list[str]:
        # An overlap of 0 must yield nothing (a plain [-0:] slice would keep all).
        keep = cfg.OVERLAP_SENTENCES
        return sentences[-keep:] if keep else []

    for block in paragraphs:
        raw = block["text"]
        heading_path = block["context"]
        is_atomic = block["kind"] in ("table", "list")
        sezione, titolo = context_to_meta(heading_path)

        # Atomic blocks (tables, lists): emitted whole, overlap prepended.
        if is_atomic:
            emit(heading_path, sezione, titolo, " ".join(carry + [raw]).strip())
            carry = overlap_of(split_sentences(raw))
            continue

        # Text paragraph: split at sentence boundaries.
        sentences = split_sentences(raw)
        if not sentences:
            continue

        # The first sentence is always accepted, whatever its length.
        first, *rest = sentences
        window = carry + [first]

        for sentence in rest:
            if len(" ".join(window + [sentence])) <= cfg.MAX_CHARS:
                window.append(sentence)
                continue
            # Window is full: close it and start the next one from its tail.
            emit(heading_path, sezione, titolo, " ".join(window))
            carry = overlap_of(window)
            window = carry + [sentence]

        emit(heading_path, sezione, titolo, " ".join(window))
        carry = overlap_of(window)

    return emitted
|
||
|
||
|
||
# ─── Pipeline per documento ───────────────────────────────────────────────────
|
||
|
||
def process_stem(stem: str, project_root: Path,
                 force: bool, skip_optimize: bool) -> bool:
    """Run Stage 1 (Markdown optimization) and Stage 2 (chunking) for one document.

    Stage 1 is delegated to ``_optimize_md`` unless *skip_optimize* is set.
    Stage 2 reads ``sources/<stem>/auto/<stem>_clean.md`` and writes
    ``chunks/<stem>/chunks.json`` and ``chunks/<stem>/meta.json``.

    Returns ``True`` on success, or when ``chunks.json`` already exists and
    *force* is false. Returns ``False`` when Stage 1 fails, the cleaned
    Markdown is missing, or no paragraph/chunk could be produced.
    """
    # Stage 1: Markdown optimization.
    if skip_optimize:
        print(f"\n[Stage 1] skip (--skip-optimize)")
    elif not _optimize_md(stem, project_root, force=force):
        return False

    # Stage 2: chunking.
    clean_md = project_root / "sources" / stem / "auto" / f"{stem}_clean.md"
    out_dir = project_root / "chunks" / stem
    out_file = out_dir / "chunks.json"

    print(f"[Stage 2] Chunking: {stem}")

    if not clean_md.exists():
        print(f" ✗ {stem}_clean.md non trovato")
        return False

    already_done = out_file.exists() and not force
    if already_done:
        print(f" ↩ chunks.json già presente — skip chunking")
        return True

    paragraphs = parse_paragraphs(clean_md.read_text(encoding="utf-8"))
    if not paragraphs:
        print(f" ✗ Nessun paragrafo estratto da {clean_md.name}")
        return False

    chunks = make_chunks(paragraphs)
    if not chunks:
        print(f" ✗ Nessun chunk generato")
        return False

    out_dir.mkdir(parents=True, exist_ok=True)
    chunks_json = json.dumps(chunks, ensure_ascii=False, indent=2)
    out_file.write_text(chunks_json, encoding="utf-8")

    # Record the parameters used, next to the chunks.
    meta = {
        "min_chars": cfg.MIN_CHARS,
        "max_chars": cfg.MAX_CHARS,
        "target_chars": cfg.MAX_CHARS,
        "overlap": cfg.OVERLAP_SENTENCES,
        "strategy": "paragraph_overlap",
    }
    meta_file = out_dir / "meta.json"
    meta_file.write_text(json.dumps(meta, ensure_ascii=False), encoding="utf-8")

    # Summary statistics; n_chars includes the "[context]" header line.
    lengths = [chunk["n_chars"] for chunk in chunks]
    over_max = len([size for size in lengths if size > cfg.MAX_CHARS])
    under_min = len([size for size in lengths if size < cfg.MIN_CHARS])
    avg = int(sum(lengths) / len(lengths))

    print(f" ✅ {len(chunks)} chunk | media {avg} char | max {max(lengths)} char")
    if over_max:
        print(f" ⚠️ {over_max} chunk superano MAX_CHARS={cfg.MAX_CHARS}")
    if under_min:
        print(f" ℹ️ {under_min} chunk sotto MIN_CHARS={cfg.MIN_CHARS}")
    print(f" → chunks/{stem}/chunks.json")
    return True
|
||
|
||
|
||
# ─── Entry point ──────────────────────────────────────────────────────────────
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: process one stem (--stem) or every MinerU
    # document found under sources/, then exit 0 only if all succeeded.
    project_root = Path(__file__).parent.parent

    parser = argparse.ArgumentParser(
        description="Pipeline unificata MinerU → _clean.md → chunks.json"
    )
    parser.add_argument("--stem", help="Nome documento (sottocartella di sources/)")
    parser.add_argument("--force", action="store_true",
                        help="Rigenera _clean.md e chunks.json anche se esistono")
    parser.add_argument("--skip-optimize", action="store_true",
                        help="Salta Stage 1 (usa _clean.md già presente)")
    args = parser.parse_args()

    if args.stem:
        stems = [args.stem]
    else:
        sources_dir = project_root / "sources"
        # A missing sources/ directory is reported like an empty one instead
        # of letting iterdir() abort with an unhandled FileNotFoundError.
        candidates = sources_dir.iterdir() if sources_dir.is_dir() else ()
        # A document qualifies when its MinerU content list is present.
        stems = sorted(
            p.name for p in candidates
            if p.is_dir()
            and (p / "auto" / f"{p.name}_content_list_v2.json").exists()
        )
        if not stems:
            print("Errore: nessun documento MinerU trovato in sources/")
            sys.exit(1)

    # Every stem is processed even if an earlier one fails.
    results = [
        process_stem(s, project_root, args.force, args.skip_optimize)
        for s in stems
    ]
    ok = sum(results)
    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{len(results)} documenti processati")
    sys.exit(0 if all(results) else 1)
|