# rag-from-scratch/chunks/chunker.py

#!/usr/bin/env python3
"""
Pipeline di chunking unificata (Stage 1 + Stage 2)

Stage 1 — Ottimizzazione Markdown (md_optimizer):
  Legge _content_list_v2.json + _model.json di MinerU e produce _clean.md
  con gerarchia H1/H2/H3 pulita (TOC, frontespizi e sommari rimossi).

Stage 2 — Chunking semantico:
  Divide il _clean.md in chunk semantici:
  - un chunk per paragrafo (mai due paragrafi nello stesso chunk)
  - split a confine di frase se il paragrafo supera MAX_CHARS
  - overlap di OVERLAP_SENTENCES frasi tra chunk consecutivi
  - tabelle e liste sono blocchi atomici (non si spezzano)

Input:  sources/<stem>/auto/<stem>_content_list_v2.json
        sources/<stem>/auto/<stem>_model.json  (opzionale)
Output: sources/<stem>/auto/<stem>_clean.md
        chunks/<stem>/chunks.json
        chunks/<stem>/meta.json

Uso:
    python chunks/chunker.py --stem <stem>
    python chunks/chunker.py                     # tutti gli stem in sources/
    python chunks/chunker.py --stem <stem> --force
    python chunks/chunker.py --stem <stem> --skip-optimize  # salta Stage 1
"""

import argparse
import json
import re
import sys
from pathlib import Path

_HERE = Path(__file__).resolve().parent
if str(_HERE) not in sys.path:
    sys.path.insert(0, str(_HERE))
import config as cfg
from md_optimizer import optimize as _optimize_md


# ─── Utilità ──────────────────────────────────────────────────────────────────

def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences using the configured pattern.

    The text is stripped, split with ``cfg.SENTENCE_SPLIT_RE``, and every
    resulting piece is stripped again; pieces that end up empty are dropped.
    """
    sentences: list[str] = []
    for piece in re.split(cfg.SENTENCE_SPLIT_RE, text.strip()):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences


def context_to_meta(context: str) -> tuple[str, str]:
    """Split a heading path ``'H1 > H2 > H3'`` into ``(sezione, titolo)``.

    The last non-empty level becomes the title and everything before it
    (re-joined with ``' > '``) becomes the section. With a single level the
    section is that level and the title is empty; with no levels both are
    empty strings.
    """
    levels: list[str] = []
    for piece in context.split(" > "):
        piece = piece.strip()
        if piece:
            levels.append(piece)

    if not levels:
        return "", ""
    if len(levels) == 1:
        return levels[0], ""

    *ancestors, leaf = levels
    return " > ".join(ancestors), leaf


# ─── Parser Markdown ──────────────────────────────────────────────────────────

def parse_paragraphs(text: str) -> list[dict]:
    """Extract blocks from the cleaned Markdown together with their heading path.

    Returns a list of ``{"context": "H1 > H2 > H3", "text": "...",
    "kind": "text" | "table" | "list"}`` dictionaries. ``context`` falls back
    to ``"documento"`` when no heading has been seen yet.

    A blank line or a heading (``#``, ``##``, ``###`` followed by a space)
    closes the current block. Consecutive lines starting with ``|`` form one
    table block and consecutive lines starting with ``- `` form one list
    block; any change of line kind also closes the current block.
    """
    heading_re = re.compile(r"^(#{1,3}) ")
    headings = ["", "", ""]          # current H1, H2, H3 titles
    blocks:  list[dict] = []
    pending: list[str]  = []
    kind = "text"

    def emit() -> None:
        # Close the pending block (if it has content) under the current headings.
        body = "\n".join(pending).strip()
        pending.clear()
        if not body:
            return
        trail = [title for title in headings if title]
        blocks.append({
            "context": " > ".join(trail) if trail else "documento",
            "text":    body,
            "kind":    kind,
        })

    for line in text.splitlines():
        heading = heading_re.match(line)
        if heading:
            # Flush first so the pending block keeps the *previous* heading path.
            emit()
            depth = len(heading.group(1))
            headings[depth - 1] = line[depth + 1:].strip()
            for deeper in range(depth, len(headings)):
                headings[deeper] = ""
            kind = "text"
            continue

        stripped = line.strip()
        if stripped.startswith("|"):
            line_kind = "table"
        elif stripped.startswith("- "):
            line_kind = "list"
        elif not stripped:
            emit()
            kind = "text"
            continue
        else:
            line_kind = "text"

        # Switching between text / table / list starts a new block.
        if kind != line_kind:
            emit()
            kind = line_kind
        pending.append(line)

    emit()
    return blocks


# ─── Chunking ─────────────────────────────────────────────────────────────────

def make_chunks(paragraphs: list[dict]) -> list[dict]:
    """Build chunk records from the output of ``parse_paragraphs``.

    Rules:
      - one chunk per paragraph, or several when the paragraph body would
        exceed ``cfg.MAX_CHARS``;
      - splits happen only at sentence boundaries; the first sentence of a
        paragraph is always accepted, so a chunk may exceed the limit;
      - the last ``cfg.OVERLAP_SENTENCES`` sentences of each emitted chunk are
        prepended to the next one, including across paragraphs and onto
        table/list blocks;
      - table and list blocks are atomic and never split.

    Each record has the keys ``chunk_id`` (``c0``, ``c1``, ...), ``text``
    (``"[context]\\n" + body``), ``sezione``, ``titolo``, ``context`` and
    ``n_chars``. ``n_chars`` is the length of ``text`` including the context
    prefix, while the ``MAX_CHARS`` comparison is made on the body alone.
    """
    produced: list[dict] = []
    carry:    list[str]  = []   # sentences to prepend to the next chunk

    def tail_of(sentences: list[str]) -> list[str]:
        # Overlap to carry forward; empty when overlap is disabled.
        keep = cfg.OVERLAP_SENTENCES
        return sentences[-keep:] if keep else []

    def record(context: str, sezione: str, titolo: str, body: str) -> None:
        # Append one chunk; ids are sequential, so the next id is len(produced).
        rendered = f"[{context}]\n{body}"
        produced.append({
            "chunk_id": f"c{len(produced)}",
            "text":     rendered,
            "sezione":  sezione,
            "titolo":   titolo,
            "context":  context,
            "n_chars":  len(rendered),
        })

    for block in paragraphs:
        raw          = block["text"]
        heading_path = block["context"]
        block_kind   = block["kind"]
        section, title = context_to_meta(heading_path)

        # Atomic blocks (tables, lists): emitted whole, overlap glued in front.
        if block_kind in ("table", "list"):
            record(heading_path, section, title, " ".join([*carry, raw]).strip())
            carry = tail_of(split_sentences(raw))
            continue

        # Text paragraph: pack sentences greedily up to MAX_CHARS.
        sentences = split_sentences(raw)
        if not sentences:
            continue

        first, *rest = sentences
        window = [*carry, first]    # the first sentence is accepted regardless of size
        for sentence in rest:
            if len(" ".join([*window, sentence])) <= cfg.MAX_CHARS:
                window.append(sentence)
                continue
            # Adding the sentence would overflow: close the chunk and start a
            # new one seeded with the overlap plus the overflowing sentence.
            record(heading_path, section, title, " ".join(window))
            carry  = tail_of(window)
            window = [*carry, sentence]

        record(heading_path, section, title, " ".join(window))
        carry = tail_of(window)

    return produced


# ─── Pipeline per documento ───────────────────────────────────────────────────

def process_stem(stem: str, project_root: Path,
                 force: bool, skip_optimize: bool) -> bool:
    """Run Stage 1 (Markdown optimisation) and Stage 2 (chunking) for one document.

    Stage 1 is delegated to ``_optimize_md`` unless ``skip_optimize`` is set.
    Stage 2 reads ``sources/<stem>/auto/<stem>_clean.md`` under
    ``project_root`` and writes ``chunks/<stem>/chunks.json`` plus
    ``chunks/<stem>/meta.json``. Progress and summary statistics are printed
    to stdout.

    Returns:
        True when the chunks were written, or when ``chunks.json`` already
        exists and ``force`` is false. False when Stage 1 reports failure,
        the clean Markdown file is missing, or no paragraphs / chunks were
        produced.
    """
    # Stage 1: Markdown optimisation (optional).
    if skip_optimize:
        print("\n[Stage 1] skip (--skip-optimize)")
    elif not _optimize_md(stem, project_root, force=force):
        return False

    # Stage 2: chunking.
    markdown_path = project_root / "sources" / stem / "auto" / f"{stem}_clean.md"
    chunk_dir     = project_root / "chunks" / stem
    chunks_path   = chunk_dir / "chunks.json"

    print(f"[Stage 2] Chunking: {stem}")

    if not markdown_path.exists():
        print(f"  ✗ {stem}_clean.md non trovato")
        return False

    # Existing output is reused unless a rebuild was explicitly requested.
    if chunks_path.exists() and not force:
        print("  ↩  chunks.json già presente — skip chunking")
        return True

    paragraphs = parse_paragraphs(markdown_path.read_text(encoding="utf-8"))
    if not paragraphs:
        print(f"  ✗ Nessun paragrafo estratto da {markdown_path.name}")
        return False

    chunks = make_chunks(paragraphs)
    if not chunks:
        print("  ✗ Nessun chunk generato")
        return False

    chunk_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(chunks, ensure_ascii=False, indent=2)
    chunks_path.write_text(serialized, encoding="utf-8")

    # Record the chunking parameters next to the chunks themselves.
    run_meta = {
        "min_chars":    cfg.MIN_CHARS,
        "max_chars":    cfg.MAX_CHARS,
        "target_chars": cfg.MAX_CHARS,
        "overlap":      cfg.OVERLAP_SENTENCES,
        "strategy":     "paragraph_overlap",
    }
    (chunk_dir / "meta.json").write_text(
        json.dumps(run_meta, ensure_ascii=False), encoding="utf-8"
    )

    # Summary statistics over the chunk sizes (context prefix included).
    sizes     = [entry["n_chars"] for entry in chunks]
    too_long  = len([n for n in sizes if n > cfg.MAX_CHARS])
    too_short = len([n for n in sizes if n < cfg.MIN_CHARS])
    mean_size = int(sum(sizes) / len(sizes))

    print(f"  ✅ {len(chunks)} chunk  |  media {mean_size} char  |  max {max(sizes)} char")
    if too_long:
        print(f"  ⚠️  {too_long} chunk superano MAX_CHARS={cfg.MAX_CHARS}")
    if too_short:
        print(f"  ℹ️  {too_short} chunk sotto MIN_CHARS={cfg.MIN_CHARS}")
    print(f"  → chunks/{stem}/chunks.json")
    return True


# ─── Entry point ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Command-line entry point: process one document (--stem) or every
    # MinerU document found under <project>/sources/, then exit with status 0
    # only if all of them succeeded.

    # This script lives in <project>/chunks/, so the project root is one level up.
    project_root = Path(__file__).parent.parent

    parser = argparse.ArgumentParser(
        description="Pipeline unificata MinerU → _clean.md → chunks.json"
    )
    parser.add_argument("--stem",  help="Nome documento (sottocartella di sources/)")
    parser.add_argument("--force", action="store_true",
                        help="Rigenera _clean.md e chunks.json anche se esistono")
    parser.add_argument("--skip-optimize", action="store_true",
                        help="Salta Stage 1 (usa _clean.md già presente)")
    args = parser.parse_args()

    if args.stem:
        stems = [args.stem]
    else:
        sources_dir = project_root / "sources"
        # iterdir() raises if sources/ does not exist; treat a missing
        # directory like an empty one so the user gets the error message
        # below instead of a traceback.
        candidates = sources_dir.iterdir() if sources_dir.is_dir() else ()
        # A document is a sub-directory that contains MinerU's content list.
        stems = sorted(
            p.name for p in candidates
            if p.is_dir()
            and (p / "auto" / f"{p.name}_content_list_v2.json").exists()
        )
        if not stems:
            print("Errore: nessun documento MinerU trovato in sources/")
            sys.exit(1)

    results = [
        process_stem(s, project_root, args.force, args.skip_optimize)
        for s in stems
    ]
    ok = sum(results)
    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{len(results)} documenti processati")
    sys.exit(0 if all(results) else 1)