#!/usr/bin/env python3
"""Unified chunking pipeline (Stage 1 + Stage 2).

Stage 1 - Markdown optimisation (md_optimizer):
    Reads MinerU's _content_list_v2.json + _model.json and produces a
    _clean.md with a clean H1/H2/H3 hierarchy (TOC, title pages and
    summaries removed).

Stage 2 - Semantic chunking:
    Splits the _clean.md into semantic chunks:
    - one chunk per paragraph (never two paragraphs in the same chunk)
    - split at a sentence boundary when the paragraph exceeds MAX_CHARS
    - overlap of OVERLAP_SENTENCES sentences between consecutive chunks
    - tables and lists are atomic blocks (never split)

Input:
    sources/STEM/auto/STEM_content_list_v2.json
    sources/STEM/auto/STEM_model.json   (optional)

Output:
    sources/STEM/auto/STEM_clean.md
    chunks/STEM/chunks.json
    chunks/STEM/meta.json

Usage:
    python chunks/chunker.py --stem STEM
    python chunks/chunker.py                        # every stem in sources/
    python chunks/chunker.py --stem STEM --force
    python chunks/chunker.py --stem STEM --skip-optimize   # skip Stage 1
"""

import argparse
import json
import re
import sys
from pathlib import Path

# Make the sibling modules (config, md_optimizer) importable regardless of
# the directory the script is launched from.
_HERE = Path(__file__).resolve().parent
if str(_HERE) not in sys.path:
    sys.path.insert(0, str(_HERE))

import config as cfg
from md_optimizer import optimize as _optimize_md


# ─── Utilities ────────────────────────────────────────────────────────────────

def split_sentences(text: str) -> list[str]:
    """Split *text* on ``cfg.SENTENCE_SPLIT_RE``.

    Returns the resulting pieces stripped of surrounding whitespace, with
    empty pieces discarded.
    """
    parts = re.split(cfg.SENTENCE_SPLIT_RE, text.strip())
    return [p.strip() for p in parts if p.strip()]


def context_to_meta(context: str) -> tuple[str, str]:
    """Split an ``'H1 > H2 > H3'`` context into ``(sezione, titolo)``.

    With two or more levels, ``titolo`` is the last level and ``sezione`` is
    the remaining levels re-joined with ``' > '``. With a single level the
    result is ``(level, "")``; with none it is ``("", "")``.
    """
    parts = [p.strip() for p in context.split(" > ") if p.strip()]
    if len(parts) >= 2:
        return " > ".join(parts[:-1]), parts[-1]
    return (parts[0] if parts else ""), ""


# ─── Markdown parser ──────────────────────────────────────────────────────────

def parse_paragraphs(text: str) -> list[dict]:
    """Extract blocks from the _clean.md text together with their heading context.

    Returns a list of dicts shaped like
    ``{"context": "H1 > H2 > H3", "text": "...", "kind": "text|table|list"}``.

    Every blank line closes the current paragraph. Consecutive table rows
    (stripped line starts with ``|``) and consecutive list items (stripped
    line starts with ``- ``) are accumulated as single blocks. Blocks that
    appear before any heading get the context ``"documento"``.
    """
    h1 = h2 = h3 = ""
    result: list[dict] = []
    buf: list[str] = []
    cur_kind = "text"

    def flush() -> None:
        # Emit the buffered lines (if any) under the headings and kind that
        # are current at the time of the call, then reset the buffer.
        body = "\n".join(buf).strip()
        if body:
            parts = [p for p in [h1, h2, h3] if p]
            context = " > ".join(parts) if parts else "documento"
            result.append({"context": context, "text": body, "kind": cur_kind})
        buf.clear()

    for line in text.splitlines():
        stripped = line.strip()
        # Headings are matched on the raw line (no leading whitespace
        # allowed); a new heading resets every deeper level.
        if line.startswith("# "):
            flush()
            h1, h2, h3 = line[2:].strip(), "", ""
            cur_kind = "text"
        elif line.startswith("## "):
            flush()
            h2, h3 = line[3:].strip(), ""
            cur_kind = "text"
        elif line.startswith("### "):
            flush()
            h3 = line[4:].strip()
            cur_kind = "text"
        elif stripped.startswith("|"):
            # flush() must run before cur_kind changes so the pending block
            # is emitted with its own kind.
            if cur_kind != "table":
                flush()
                cur_kind = "table"
            buf.append(line)
        elif stripped.startswith("- "):
            if cur_kind != "list":
                flush()
                cur_kind = "list"
            buf.append(line)
        elif stripped == "":
            flush()
            cur_kind = "text"
        else:
            # Plain text directly after a table/list starts a new block.
            if cur_kind in ("table", "list"):
                flush()
                cur_kind = "text"
            buf.append(line)

    flush()
    return result


# ─── Chunking ─────────────────────────────────────────────────────────────────

def _build_chunk(index: int, context: str, sezione: str, titolo: str,
                 body: str) -> dict:
    """Build one chunk record; ``n_chars`` counts the ``[context]`` header too."""
    chunk_text = f"[{context}]\n{body}"
    return {
        "chunk_id": f"c{index}",
        "text": chunk_text,
        "sezione": sezione,
        "titolo": titolo,
        "context": context,
        "n_chars": len(chunk_text),
    }


def _overlap_tail(sentences: list[str]) -> list[str]:
    """Return the trailing ``cfg.OVERLAP_SENTENCES`` sentences ([] when it is 0)."""
    return sentences[-cfg.OVERLAP_SENTENCES:] if cfg.OVERLAP_SENTENCES else []


def make_chunks(paragraphs: list[dict]) -> list[dict]:
    """Generate chunks from the output of :func:`parse_paragraphs`.

    Rules:
    - one chunk = one paragraph (or a sub-part of it if longer than MAX_CHARS)
    - splits happen only at sentence boundaries; a single sentence longer
      than MAX_CHARS is emitted whole
    - the last OVERLAP_SENTENCES sentences of chunk N are prepended to
      chunk N+1 (overlap), also across paragraph boundaries
    - tables and lists are atomic blocks (never split)

    Each chunk is a dict with the keys ``chunk_id`` (``c0``, ``c1``, ...),
    ``text`` (``"[context]\\nbody"``), ``sezione``, ``titolo``, ``context``
    and ``n_chars`` (length of ``text``).
    """
    chunks: list[dict] = []
    overlap_tail: list[str] = []

    for para in paragraphs:
        text = para["text"]
        context = para["context"]
        kind = para["kind"]
        sezione, titolo = context_to_meta(context)

        # ── Atomic blocks (tables, lists) ─────────────────────────────────────
        if kind in ("table", "list"):
            prefix = " ".join(overlap_tail) + " " if overlap_tail else ""
            body = (prefix + text).strip()
            chunks.append(
                _build_chunk(len(chunks), context, sezione, titolo, body)
            )
            # The overlap handed to the next chunk comes from the block's own
            # text only, not from the prefix that was prepended to it.
            overlap_tail = _overlap_tail(split_sentences(text))
            continue

        # ── Text paragraph: split at sentence boundaries ──────────────────────
        sents = split_sentences(text)
        if not sents:
            continue

        current: list[str] = list(overlap_tail)
        for pos, sent in enumerate(sents):
            fits = len(" ".join(current + [sent])) <= cfg.MAX_CHARS
            # The first sentence of the paragraph is always accepted so that
            # a chunk never consists of overlap sentences alone.
            if pos == 0 or fits:
                current.append(sent)
                continue
            chunks.append(
                _build_chunk(len(chunks), context, sezione, titolo,
                             " ".join(current))
            )
            overlap_tail = _overlap_tail(current)
            current = list(overlap_tail) + [sent]

        # `sents` is non-empty, so `current` always holds at least one
        # sentence of this paragraph at this point.
        chunks.append(
            _build_chunk(len(chunks), context, sezione, titolo,
                         " ".join(current))
        )
        overlap_tail = _overlap_tail(current)

    return chunks


# ─── Per-document pipeline ────────────────────────────────────────────────────

def process_stem(stem: str, project_root: Path, force: bool,
                 skip_optimize: bool) -> bool:
    """Run Stage 1 (Markdown optimisation) + Stage 2 (chunking) for one document.

    Args:
        stem: Document name (sub-folder of ``sources/``).
        project_root: Directory containing ``sources/`` and ``chunks/``.
        force: Passed to Stage 1 as ``force``; in Stage 2, regenerate
            ``chunks.json`` even if it already exists.
        skip_optimize: Skip Stage 1 and use the ``_clean.md`` already on disk.

    Returns:
        True when the chunks are available (freshly written or already
        present), False when Stage 1 fails, the ``_clean.md`` is missing, or
        no paragraph/chunk could be produced.
    """
    # ── Stage 1: Markdown optimisation ────────────────────────────────────────
    if not skip_optimize:
        ok = _optimize_md(stem, project_root, force=force)
        if not ok:
            return False
    else:
        print("\n[Stage 1] skip (--skip-optimize)")

    # ── Stage 2: chunking ─────────────────────────────────────────────────────
    clean_md = project_root / "sources" / stem / "auto" / f"{stem}_clean.md"
    out_dir = project_root / "chunks" / stem
    out_file = out_dir / "chunks.json"

    print(f"[Stage 2] Chunking: {stem}")

    if not clean_md.exists():
        print(f" ✗ {stem}_clean.md non trovato")
        return False

    if out_file.exists() and not force:
        print(" ↩ chunks.json già presente — skip chunking")
        return True

    text = clean_md.read_text(encoding="utf-8")
    paragraphs = parse_paragraphs(text)
    if not paragraphs:
        print(f" ✗ Nessun paragrafo estratto da {clean_md.name}")
        return False

    chunks = make_chunks(paragraphs)
    if not chunks:
        print(" ✗ Nessun chunk generato")
        return False

    out_dir.mkdir(parents=True, exist_ok=True)
    out_file.write_text(
        json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    (out_dir / "meta.json").write_text(
        json.dumps({
            "min_chars": cfg.MIN_CHARS,
            "max_chars": cfg.MAX_CHARS,
            "target_chars": cfg.MAX_CHARS,
            "overlap": cfg.OVERLAP_SENTENCES,
            "strategy": "paragraph_overlap",
        }, ensure_ascii=False),
        encoding="utf-8",
    )

    # Summary statistics; n_chars includes the "[context]" header line.
    lengths = [c["n_chars"] for c in chunks]
    over_max = sum(1 for n in lengths if n > cfg.MAX_CHARS)
    under_min = sum(1 for n in lengths if n < cfg.MIN_CHARS)
    avg = int(sum(lengths) / len(lengths))

    print(f" ✅ {len(chunks)} chunk | media {avg} char | max {max(lengths)} char")
    if over_max:
        print(f" ⚠️ {over_max} chunk superano MAX_CHARS={cfg.MAX_CHARS}")
    if under_min:
        print(f" ℹ️ {under_min} chunk sotto MIN_CHARS={cfg.MIN_CHARS}")
    print(f" → chunks/{stem}/chunks.json")
    return True


# ─── Entry point ──────────────────────────────────────────────────────────────

def main() -> int:
    """Parse the command line, process the requested stems, return the exit code.

    Returns 0 when every document was processed successfully, 1 otherwise
    (including when no MinerU document is found under ``sources/``).
    """
    project_root = Path(__file__).parent.parent

    parser = argparse.ArgumentParser(
        description="Pipeline unificata MinerU → _clean.md → chunks.json"
    )
    parser.add_argument("--stem", help="Nome documento (sottocartella di sources/)")
    parser.add_argument("--force", action="store_true",
                        help="Rigenera _clean.md e chunks.json anche se esistono")
    parser.add_argument("--skip-optimize", action="store_true",
                        help="Salta Stage 1 (usa _clean.md già presente)")
    args = parser.parse_args()

    if args.stem:
        stems = [args.stem]
    else:
        sources_dir = project_root / "sources"
        # A missing sources/ directory is reported like an empty one instead
        # of crashing with FileNotFoundError from iterdir().
        candidates = sources_dir.iterdir() if sources_dir.is_dir() else ()
        stems = sorted(
            p.name for p in candidates
            if p.is_dir()
            and (p / "auto" / f"{p.name}_content_list_v2.json").exists()
        )
        if not stems:
            print("Errore: nessun documento MinerU trovato in sources/")
            return 1

    # Process every stem even if an earlier one fails.
    results = [
        process_stem(s, project_root, args.force, args.skip_optimize)
        for s in stems
    ]
    ok = sum(results)
    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{len(results)} documenti processati")
    return 0 if all(results) else 1


if __name__ == "__main__":
    sys.exit(main())