feat(chunks): aggiungi pipeline chunking consolidata

Nuova cartella chunks/ con chunker.py (step 5), verify_chunks.py e fix_chunks.py (step 6). Tutto l'I/O va in chunks/<stem>/ invece di step-5/ e step-6/ separati. Input: conversione/<stem>/clean.md
2026-04-20 11:36:18 +02:00
parent 5215f53ad0
commit 4c0e0db2a5
3 changed files with 999 additions and 0 deletions
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+"""
+Chunking adattivo
+
+Divide il Markdown revisionato in chunk semantici pronti per la
+vettorizzazione. La strategia dipende dal profilo strutturale del documento.
+
+Input:  conversione/<stem>/clean.md + conversione/<stem>/structure_profile.json
+Output: chunks/<stem>/chunks.json
+
+Uso:
+    python chunks/chunker.py                    # tutti i documenti in conversione/
+    python chunks/chunker.py --stem documento   # un solo documento
+    python chunks/chunker.py --stem documento --force
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+# ─── Parametri ────────────────────────────────────────────────────────────────
+
+MIN_CHARS = 200   # below this threshold → merge into the next chunk
+MAX_CHARS = 800   # above this threshold → split on sentence boundaries
+OVERLAP_S = 2     # sentences of overlap between sub-chunks of the same boundary
+
+
+# ─── Utilità ──────────────────────────────────────────────────────────────────
+
+def split_sentences(text: str) -> list[str]:
+    """Split *text* into sentences.
+
+    A boundary is whitespace that follows one of ``. ! ? »`` and precedes an
+    upper-case letter (plain or Italian accented) or a double quote. When that
+    rule yields at most one piece, the split is retried on any whitespace that
+    follows the same punctuation. Pieces are stripped and empty ones dropped.
+    """
+    stripped = text.strip()
+    pieces = re.split(r'(?<=[.!?»])\s+(?=[A-ZÀÈÉÌÒÙA-Z\"])', stripped)
+    if len(pieces) < 2:
+        # No capital-led boundary found: fall back to punctuation + whitespace.
+        pieces = re.split(r'(?<=[.!?»])\s+', stripped)
+
+    sentences = []
+    for piece in pieces:
+        cleaned = piece.strip()
+        if cleaned:
+            sentences.append(cleaned)
+    return sentences
+
+
+def slugify(s: str, max_len: int = 60) -> str:
+    """Turn *s* into a lower-case, underscore-separated identifier.
+
+    Characters that are neither word characters, whitespace nor hyphens are
+    removed; runs of whitespace, underscores and hyphens collapse into a single
+    underscore; leading and trailing underscores are trimmed. The result is
+    cut to *max_len* characters, or is ``"section"`` when nothing is left.
+    """
+    lowered = s.lower()
+    without_punct = re.sub(r'[^\w\s-]', '', lowered)
+    slug = re.sub(r'[\s_-]+', '_', without_punct).strip('_')
+    if not slug:
+        return "section"
+    return slug[:max_len]
+
+
+def make_sub_chunks(
+    body: str,
+    prefix: str,
+    sezione: str,
+    titolo: str,
+    max_chars: int,
+    overlap_s: int,
+) -> list[dict]:
+    """Pack the sentences of *body* into sub-chunks of roughly *max_chars*.
+
+    Sentences (from ``split_sentences``) are accumulated greedily; a chunk is
+    closed when adding the next sentence plus a joining space would exceed
+    *max_chars*. A single sentence longer than *max_chars* is kept whole.
+    The limit applies to the joined sentences only: *prefix* is prepended
+    afterwards and is counted in ``n_chars``.
+
+    Args:
+        body: Text to split.
+        prefix: Context header prepended verbatim to every chunk text.
+        sezione: Section name, stored in each chunk and used in its id.
+        titolo: Title, stored in each chunk and used in its id.
+        max_chars: Soft upper bound for the joined sentences of one chunk.
+        overlap_s: Number of trailing sentences repeated at the start of the
+            next chunk; values <= 0 disable the overlap.
+
+    Returns:
+        A list of chunk dicts with keys ``chunk_id``, ``text``, ``sezione``,
+        ``titolo``, ``sub_index`` and ``n_chars``; empty when *body* contains
+        no sentences.
+    """
+    sentences = split_sentences(body)
+    if not sentences:
+        return []
+
+    id_base = f"{slugify(sezione)}__{slugify(titolo)}"
+    chunks: list[dict] = []
+
+    def joined_len(parts: list[str]) -> int:
+        # Length of " ".join(parts) without building the string.
+        return sum(len(p) for p in parts) + max(len(parts) - 1, 0)
+
+    def emit(parts: list[str]) -> None:
+        sub_index = len(chunks)
+        chunk_text = prefix + " ".join(parts)
+        chunks.append({
+            "chunk_id": f"{id_base}__s{sub_index}",
+            "text": chunk_text,
+            "sezione": sezione,
+            "titolo": titolo,
+            "sub_index": sub_index,
+            "n_chars": len(chunk_text),
+        })
+
+    current: list[str] = []
+    current_len = 0
+    for sent in sentences:
+        if current and current_len + 1 + len(sent) > max_chars:
+            emit(current)
+            carry = current[-overlap_s:] if 0 < overlap_s < len(current) else []
+            # Shrink the overlap until the next sentence fits after it;
+            # otherwise the overlap alone would be emitted as a chunk that
+            # only repeats text already present in the previous one.
+            while carry and joined_len(carry) + 1 + len(sent) > max_chars:
+                carry = carry[1:]
+            current = carry
+            current_len = joined_len(current)
+        current_len += len(sent) + (1 if current else 0)
+        current.append(sent)
+
+    # `current` always ends with at least one sentence not yet emitted.
+    emit(current)
+    return chunks
+
+
+# ─── Parser Markdown ──────────────────────────────────────────────────────────
+
+def parse_h3_sections(text: str) -> list[dict]:
+    """Split Markdown *text* into sections delimited by ``#``/``##``/``###``.
+
+    A ``# `` or ``## `` heading sets the current section name and clears the
+    current title; a ``### `` heading sets the current title. Every other line
+    is body text. A section is recorded only when its stripped body is
+    non-empty.
+
+    Returns:
+        Dicts with keys ``sezione``, ``titolo`` and ``body``, in document order.
+    """
+    sections: list[dict] = []
+    state = {"h2": "", "h3": ""}
+    buffer: list[str] = []
+
+    def close_section() -> None:
+        # Record the accumulated body (if any) and start a fresh buffer.
+        body = "\n".join(buffer).strip()
+        if body:
+            sections.append({
+                "sezione": state["h2"],
+                "titolo": state["h3"],
+                "body": body,
+            })
+        buffer.clear()
+
+    for line in text.splitlines():
+        if line.startswith(("# ", "## ")):
+            close_section()
+            marker_len = 2 if line.startswith("# ") else 3
+            state["h2"] = line[marker_len:].strip()
+            state["h3"] = ""
+        elif line.startswith("### "):
+            close_section()
+            state["h3"] = line[4:].strip()
+        else:
+            buffer.append(line)
+
+    close_section()
+    return sections
+
+
+def parse_h2_sections(text: str) -> list[dict]:
+    """Split Markdown *text* into sections delimited by ``#`` and ``##``.
+
+    Both heading levels start a new section named after the heading; deeper
+    headings are left in the body. A section is recorded only when its
+    stripped body is non-empty.
+
+    Returns:
+        Dicts with keys ``sezione`` and ``body``, in document order.
+    """
+    sections: list[dict] = []
+    heading = ""
+    buffer: list[str] = []
+
+    def record(name: str, lines: list[str]) -> None:
+        body = "\n".join(lines).strip()
+        if body:
+            sections.append({"sezione": name, "body": body})
+
+    for line in text.splitlines():
+        if line.startswith("## "):
+            marker_len = 3
+        elif line.startswith("# "):
+            marker_len = 2
+        else:
+            buffer.append(line)
+            continue
+        # A heading closes the section collected so far and opens a new one.
+        record(heading, buffer)
+        heading = line[marker_len:].strip()
+        buffer = []
+
+    record(heading, buffer)
+    return sections
+
+
+# ─── Strategie di chunking ────────────────────────────────────────────────────
+
+def chunk_h3_aware(text: str, stem: str) -> list[dict]:
+    """Chunk *text* along its ``###`` sections.
+
+    Sections come from ``parse_h3_sections``. A section whose body is shorter
+    than ``MIN_CHARS`` absorbs the following section when both share the same
+    ``sezione``; their titles are joined with `` / ``. Each resulting section
+    is then split by ``make_sub_chunks`` with a ``[sezione > titolo]`` (or
+    ``[sezione]``) prefix. *stem* replaces an empty section name.
+    """
+    merged: list[dict] = []
+    pending: dict | None = None
+
+    for sec in parse_h3_sections(text):
+        absorbs_next = (
+            pending is not None
+            and pending["sezione"] == sec["sezione"]
+            and len(pending["body"]) < MIN_CHARS
+        )
+        if absorbs_next:
+            titles = [t for t in (pending["titolo"], sec["titolo"]) if t]
+            joined_title = " / ".join(titles)
+            pending = {
+                "sezione": pending["sezione"],
+                "titolo": joined_title or pending["titolo"],
+                "body": pending["body"] + "\n\n" + sec["body"],
+            }
+            continue
+        if pending is not None:
+            merged.append(pending)
+        pending = dict(sec)
+
+    if pending:
+        merged.append(pending)
+
+    chunks: list[dict] = []
+    for sec in merged:
+        sezione = sec["sezione"] or stem
+        titolo = sec["titolo"] or ""
+        if titolo:
+            prefix = f"[{sezione} > {titolo}]\n"
+        else:
+            prefix = f"[{sezione}]\n"
+        chunks.extend(
+            make_sub_chunks(sec["body"], prefix, sezione, titolo, MAX_CHARS, OVERLAP_S)
+        )
+
+    return chunks
+
+
+def chunk_h2_paragraph_split(text: str, stem: str) -> list[dict]:
+    """Chunk *text* by ``##`` section, then by paragraph.
+
+    Each section from ``parse_h2_sections`` is split on blank lines. Heading
+    lines at the start of a paragraph are removed, but any text that follows
+    them in the same paragraph is kept. A paragraph shorter than ``MIN_CHARS``
+    absorbs the next one; each merged paragraph is then split by
+    ``make_sub_chunks`` with a ``[sezione]`` prefix. *stem* replaces an empty
+    section name.
+
+    Returns:
+        Chunk dicts whose ``chunk_id`` is ``<sezione slug>__p<idx>__s<sub>``.
+    """
+    heading_line = re.compile(r"^#+(?:\s|$)")
+    chunks: list[dict] = []
+
+    for sec in parse_h2_sections(text):
+        sezione = sec["sezione"] or stem
+        prefix = f"[{sezione}]\n"
+
+        paragraphs: list[str] = []
+        for raw in re.split(r"\n{2,}", sec["body"]):
+            lines = raw.strip().splitlines()
+            # Drop only the leading heading line(s): discarding the whole
+            # paragraph would also lose text written right under a heading
+            # without a separating blank line.
+            while lines and heading_line.match(lines[0]):
+                lines.pop(0)
+            par = "\n".join(lines).strip()
+            if par:
+                paragraphs.append(par)
+
+        merged_pars: list[str] = []
+        pending = ""
+        for par in paragraphs:
+            if pending and len(pending) < MIN_CHARS:
+                pending = pending + "\n\n" + par
+            else:
+                if pending:
+                    merged_pars.append(pending)
+                pending = par
+        if pending:
+            merged_pars.append(pending)
+
+        for idx, par in enumerate(merged_pars):
+            sub = make_sub_chunks(par, prefix, sezione, f"par{idx}", MAX_CHARS, OVERLAP_S)
+            for c in sub:
+                # Replace the generic id with one keyed on the paragraph index.
+                c["chunk_id"] = f"{slugify(sezione)}__p{idx}__s{c['sub_index']}"
+            chunks.extend(sub)
+
+    return chunks
+
+
+def chunk_paragraph(text: str, stem: str) -> list[dict]:
+    """Chunk *text* by paragraph, ignoring the heading structure.
+
+    The text is split on blank lines. Heading lines at the start of a
+    paragraph are removed, but any text that follows them in the same
+    paragraph is kept. A paragraph shorter than ``MIN_CHARS`` absorbs the next
+    one; each merged paragraph is then split by ``make_sub_chunks`` with a
+    ``[Documento: <stem>]`` prefix.
+
+    Returns:
+        Chunk dicts whose ``chunk_id`` is ``para__<idx>__s<sub>``.
+    """
+    heading_line = re.compile(r"^#+(?:\s|$)")
+    prefix = f"[Documento: {stem}]\n"
+
+    paragraphs: list[str] = []
+    for raw in re.split(r"\n{2,}", text):
+        lines = raw.strip().splitlines()
+        # Drop only the leading heading line(s): discarding the whole
+        # paragraph would also lose text written right under a heading
+        # without a separating blank line.
+        while lines and heading_line.match(lines[0]):
+            lines.pop(0)
+        par = "\n".join(lines).strip()
+        if par:
+            paragraphs.append(par)
+
+    merged: list[str] = []
+    pending = ""
+    for par in paragraphs:
+        if pending and len(pending) < MIN_CHARS:
+            pending = pending + "\n\n" + par
+        else:
+            if pending:
+                merged.append(pending)
+            pending = par
+    if pending:
+        merged.append(pending)
+
+    chunks: list[dict] = []
+    for idx, par in enumerate(merged):
+        sub = make_sub_chunks(par, prefix, stem, f"par{idx}", MAX_CHARS, OVERLAP_S)
+        for c in sub:
+            # Replace the generic id with one keyed on the paragraph index.
+            c["chunk_id"] = f"para__{idx}__s{c['sub_index']}"
+        chunks.extend(sub)
+
+    return chunks
+
+
+def chunk_sliding_window(text: str, stem: str) -> list[dict]:
+    """Chunk *text* with a sentence-level sliding window.
+
+    Starting from a given sentence, following sentences are added while the
+    joined length stays within ``MAX_CHARS`` (a single longer sentence is kept
+    whole). The next window starts ``OVERLAP_S`` sentences before the end of
+    the current one, always advancing by at least one sentence. Iteration
+    stops as soon as a window includes the last sentence, so no trailing
+    windows made only of already-covered sentences are produced.
+
+    Returns:
+        Chunk dicts with ``chunk_id`` ``win__<n>`` and a
+        ``[Documento: <stem>]`` prefix; empty when *text* has no sentences.
+    """
+    sentences = split_sentences(text)
+    prefix = f"[Documento: {stem}]\n"
+    total = len(sentences)
+
+    chunks: list[dict] = []
+    start = 0
+    while start < total:
+        # The first sentence is always taken, even if it exceeds MAX_CHARS.
+        window = [sentences[start]]
+        cur_len = len(sentences[start])
+        end = start + 1
+        while end < total and cur_len + 1 + len(sentences[end]) <= MAX_CHARS:
+            cur_len += 1 + len(sentences[end])
+            window.append(sentences[end])
+            end += 1
+
+        win_idx = len(chunks)
+        chunk_text = prefix + " ".join(window)
+        chunks.append({
+            "chunk_id": f"win__{win_idx}",
+            "text": chunk_text,
+            "sezione": stem,
+            "titolo": f"finestra {win_idx}",
+            "sub_index": win_idx,
+            "n_chars": len(chunk_text),
+        })
+
+        if end >= total:
+            # Every sentence is covered; sliding further would only repeat
+            # the tail of this window.
+            break
+        start += max(1, len(window) - OVERLAP_S)
+
+    return chunks
+
+
+# ─── Dispatcher ───────────────────────────────────────────────────────────────
+
+# Maps the "strategia_chunking" value of a structure profile to the function
+# implementing that strategy; chunk_document calls the selected entry as
+# fn(text, stem).
+# NOTE(review): `callable` in the annotation is the builtin function, not a
+# typing construct; `Callable[[str, str], list[dict]]` would be the precise type.
+_STRATEGIES: dict[str, callable] = {
+    "h3_aware": chunk_h3_aware,
+    "h2_paragraph_split": chunk_h2_paragraph_split,
+    "paragraph": chunk_paragraph,
+    "sliding_window": chunk_sliding_window,
+}
+
+
+def chunk_document(clean_md: Path, profile: dict, stem: str) -> list[dict]:
+    """Read *clean_md* and chunk it with the strategy named in *profile*.
+
+    The strategy is looked up in ``_STRATEGIES`` under the profile key
+    ``strategia_chunking``; a missing or unknown name falls back to
+    ``chunk_paragraph``.
+    """
+    text = clean_md.read_text(encoding="utf-8")
+    requested = profile.get("strategia_chunking", "paragraph")
+    if requested in _STRATEGIES:
+        strategy = _STRATEGIES[requested]
+    else:
+        strategy = chunk_paragraph
+    return strategy(text, stem)
+
+
+# ─── Per-document processing ──────────────────────────────────────────────────
+
+def process_stem(stem: str, project_root: Path, force: bool) -> bool:
+    """Chunk one document and write ``chunks/<stem>/chunks.json``.
+
+    Reads ``conversione/<stem>/clean.md`` and
+    ``conversione/<stem>/structure_profile.json`` under *project_root*, runs
+    ``chunk_document`` and prints length statistics for the result.
+
+    Args:
+        stem: Document name (sub-folder of ``conversione/``).
+        project_root: Directory containing ``conversione/`` and ``chunks/``.
+        force: Regenerate even when ``chunks.json`` already exists.
+
+    Returns:
+        True when the chunks were written or were already present and *force*
+        is false; False when an input is missing, the profile cannot be read
+        as a JSON object, or no chunk was produced.
+    """
+    conv_dir  = project_root / "conversione" / stem
+    out_dir   = project_root / "chunks" / stem
+    clean_md  = conv_dir / "clean.md"
+    profile_path = conv_dir / "structure_profile.json"
+    out_file  = out_dir / "chunks.json"
+
+    print(f"\nDocumento: {stem}")
+
+    if not clean_md.exists():
+        print(f"  ✗ clean.md non trovato in conversione/{stem}/ — skip")
+        return False
+    if not profile_path.exists():
+        print(f"  ✗ structure_profile.json non trovato in conversione/{stem}/ — skip")
+        return False
+
+    if out_file.exists() and not force:
+        print(f"  ⚠️  chunks.json già presente — skip")
+        print(f"       (usa --force per rieseguire)")
+        return True
+
+    # A broken profile must fail this document only, not abort a batch run.
+    # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
+    try:
+        profile = json.loads(profile_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        print(f"  ✗ structure_profile.json non leggibile ({exc}) — skip")
+        return False
+    if not isinstance(profile, dict):
+        print("  ✗ structure_profile.json non contiene un oggetto JSON — skip")
+        return False
+
+    strategia = profile.get("strategia_chunking", "paragraph")
+    print(f"  Strategia: {strategia}")
+
+    chunks = chunk_document(clean_md, profile, stem)
+
+    if not chunks:
+        print(f"  ✗ Nessun chunk generato — controlla clean.md")
+        return False
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_file.write_text(
+        json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    lengths = [c["n_chars"] for c in chunks]
+    min_c = min(lengths)
+    max_c = max(lengths)
+    avg_c = int(sum(lengths) / len(lengths))
+    short = sum(1 for l in lengths if l < MIN_CHARS)
+    # Lengths include the context prefix, hence the 1.5× tolerance on the max.
+    long_ = sum(1 for l in lengths if l > MAX_CHARS * 1.5)
+
+    print(f"  Chunk totali: {len(chunks)}")
+    print(f"  Min: {min_c} char  Max: {max_c} char  Media: {avg_c} char")
+    if short:
+        print(f"  ⚠️  {short} chunk sotto MIN_CHARS ({MIN_CHARS})")
+    if long_:
+        print(f"  ⚠️  {long_} chunk sopra MAX_CHARS×1.5 ({int(MAX_CHARS * 1.5)})")
+    print(f"  ✅ chunks.json salvato in chunks/{stem}/")
+    return True
+
+
+# ─── Entry point ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # The project root is the parent of the directory that contains this script.
+    project_root = Path(__file__).parent.parent
+
+    parser = argparse.ArgumentParser(description="Chunking adattivo")
+    parser.add_argument("--stem", help="Nome del documento (sottocartella di conversione/)")
+    parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
+    args = parser.parse_args()
+
+    if args.stem:
+        stems = [args.stem]
+    else:
+        # Without --stem, process every sub-folder of conversione/ that
+        # contains a clean.md, in alphabetical order.
+        conv_dir = project_root / "conversione"
+        if not conv_dir.exists():
+            print(f"Errore: cartella conversione/ non trovata in {project_root}")
+            sys.exit(1)
+        stems = sorted(
+            p.name for p in conv_dir.iterdir()
+            if p.is_dir() and (p / "clean.md").exists()
+        )
+        if not stems:
+            print(f"Errore: nessun documento trovato in conversione/")
+            sys.exit(1)
+
+    results = [process_stem(s, project_root, args.force) for s in stems]
+
+    # Exit status is 0 only when every document was processed successfully.
+    ok    = sum(results)
+    total = len(results)
+    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti processati")
+    sys.exit(0 if all(results) else 1)
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+Fix chunk
+
+Applica correzioni dirette su chunks/<stem>/chunks.json basandosi sul
+report.json prodotto da verify_chunks.py. Non tocca clean.md.
+
+Fixes applicati:
+  empty      → rimuove il chunk
+  incomplete → fonde con il chunk successivo (la frase continua)
+  no_prefix  → aggiunge prefisso [sezione > titolo] se mancante
+  too_short  → fonde con il chunk adiacente nella stessa sezione
+  too_long   → spezza all'ultimo confine di paragrafo/frase entro MAX_CHARS
+
+Input:  chunks/<stem>/chunks.json  +  chunks/<stem>/report.json
+Output: chunks/<stem>/chunks.json  (sovrascrive)
+
+Uso:
+    python chunks/fix_chunks.py --stem documento
+    python chunks/fix_chunks.py --stem documento --dry-run
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+# Default upper bound, in characters, used when splitting over-long chunks
+# (overridable from the command line with --max).
+MAX_CHARS = 800
+# Matches a closing punctuation, bracket, quote or dash character at the very
+# end of a string.
+# NOTE(review): not referenced anywhere else in the visible part of this file.
+PUNCT_END = re.compile(r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013-]$")
+
+
+# ─── Helpers ──────────────────────────────────────────────────────────────────
+
+def _prefix(chunk: dict) -> str:
+    """Return the bracketed context header for *chunk*.
+
+    The header is ``[sezione > titolo]`` when the chunk has a non-empty
+    ``titolo`` and ``[sezione]`` otherwise; missing keys count as empty.
+    """
+    sezione = chunk.get("sezione", "")
+    titolo  = chunk.get("titolo", "")
+    label = f"{sezione} > {titolo}" if titolo else f"{sezione}"
+    return f"[{label}]"
+
+
+def _strip_prefix(text: str) -> str:
+    """Return *text* without its leading ``[...]`` context header.
+
+    A header is a bracketed span that starts the text and whose closing ``]``
+    ends the first line. Anchoring on the end of the line keeps brackets that
+    belong to the body (for example a leading ``[link](url)``) and copes with
+    section titles that contain ``]`` themselves. Leading whitespace is always
+    removed; text without a header is otherwise returned unchanged.
+    """
+    text = text.lstrip()
+    # Greedy up to the last "]" on the first line, then the newline(s) that
+    # separate the header from the body (or the end of the text).
+    header = re.match(r"\[[^\n]*\](?:\n+|$)", text)
+    if header:
+        return text[header.end():]
+    return text
+
+
+def _rebuild_text(chunk: dict, body: str) -> str:
+    """Return *body* preceded by the context header of *chunk* on its own line."""
+    header = _prefix(chunk)
+    return f"{header}\n{body}"
+
+
+def _split_at_boundary(text: str, max_chars: int) -> list[str]:
+    """Split *text* into parts of at most *max_chars* at natural boundaries.
+
+    Within each window of *max_chars* characters the split point is, in order
+    of preference: the last blank line, the end of the last sentence
+    (``. ! ? »`` followed by whitespace), the last space or newline. Only when
+    the window contains no whitespace at all does the part run past
+    *max_chars*, up to the next space or the end of the text.
+
+    Args:
+        text: Text to split (without the context header).
+        max_chars: Maximum length of each part; values <= 0 disable splitting.
+
+    Returns:
+        The non-blank parts in order; ``[text]`` when no split is needed.
+    """
+    if max_chars <= 0 or len(text) <= max_chars:
+        return [text]
+
+    sentence_end = re.compile(r"[.!?»]\s+")
+    parts: list[str] = []
+    remaining = text
+
+    while len(remaining) > max_chars:
+        candidate = remaining[:max_chars]
+
+        # A position of 0 would produce an empty part, so it counts as "not
+        # found" at every step.
+        split_pos = candidate.rfind("\n\n")
+
+        if split_pos <= 0:
+            ends = [m.end() for m in sentence_end.finditer(candidate)]
+            split_pos = ends[-1] if ends else -1
+
+        if split_pos <= 0:
+            # Prefer breaking between words inside the window over
+            # overshooting the limit.
+            split_pos = max(candidate.rfind(" "), candidate.rfind("\n"))
+
+        if split_pos <= 0:
+            forward = remaining.find(" ", max_chars)
+            split_pos = forward if forward > 0 else len(remaining)
+
+        parts.append(remaining[:split_pos].rstrip())
+        remaining = remaining[split_pos:].lstrip()
+
+    if remaining:
+        parts.append(remaining)
+
+    return [p for p in parts if p.strip()]
+
+
+# ─── Operazioni sui chunk ─────────────────────────────────────────────────────
+
+def fix_empty(chunks: list[dict], empty_ids: set[str]) -> tuple[list[dict], int]:
+    """Drop the chunks whose ``chunk_id`` is in *empty_ids*.
+
+    Returns:
+        The surviving chunks (a new list) and the number of chunks removed.
+    """
+    kept: list[dict] = []
+    removed = 0
+    for chunk in chunks:
+        if chunk["chunk_id"] in empty_ids:
+            removed += 1
+        else:
+            kept.append(chunk)
+    return kept, removed
+
+
+def fix_no_prefix(chunks: list[dict], no_prefix_ids: set[str]) -> tuple[list[dict], int]:
+    """Rebuild the context header of the chunks listed in *no_prefix_ids*.
+
+    The matching chunks are modified in place: ``text`` becomes the header
+    from ``_rebuild_text`` followed by the existing body, and ``n_chars`` is
+    refreshed.
+
+    Returns:
+        The same list and the number of chunks updated.
+    """
+    fixed = 0
+    for chunk in chunks:
+        if chunk["chunk_id"] not in no_prefix_ids:
+            continue
+        new_text = _rebuild_text(chunk, _strip_prefix(chunk["text"]))
+        chunk["text"] = new_text
+        chunk["n_chars"] = len(new_text)
+        fixed += 1
+    return chunks, fixed
+
+
+def fix_incomplete_and_short(chunks: list[dict],
+                              problem_ids: set[str]) -> tuple[list[dict], int]:
+    """Merge each chunk listed in *problem_ids* into a neighbour of its section.
+
+    A problem chunk is prepended to the following chunk when both have the
+    same ``sezione``. Otherwise (last chunk of the list or of its section) it
+    is appended to the preceding chunk, again only within the same
+    ``sezione``. A problem chunk with no neighbour in its section is left
+    untouched, so text never ends up under another section's header. The
+    receiving chunk keeps its own id and header and is modified in place.
+
+    Returns:
+        The remaining chunks and the number of merges performed.
+    """
+    def join_bodies(first: dict, second: dict) -> str:
+        # Concatenate the two bodies, without headers, on separate lines.
+        head = _strip_prefix(first["text"]).rstrip()
+        tail = _strip_prefix(second["text"]).lstrip()
+        return head + "\n" + tail
+
+    def absorb(target: dict, body: str) -> None:
+        target["text"]    = _rebuild_text(target, body)
+        target["n_chars"] = len(target["text"])
+
+    merged = 0
+    result: list[dict] = []
+
+    for i, c in enumerate(chunks):
+        if c["chunk_id"] in problem_ids:
+            section = c.get("sezione", "")
+
+            nxt = chunks[i + 1] if i + 1 < len(chunks) else None
+            if nxt is not None and nxt.get("sezione", "") == section:
+                absorb(nxt, join_bodies(c, nxt))
+                merged += 1
+                continue
+
+            # No usable successor: fall back to the chunk emitted just before.
+            prev = result[-1] if result else None
+            if prev is not None and prev.get("sezione", "") == section:
+                absorb(prev, join_bodies(prev, c))
+                merged += 1
+                continue
+
+        result.append(c)
+
+    return result, merged
+
+
+def fix_too_long(chunks: list[dict],
+                 too_long_ids: set[str],
+                 max_chars: int) -> tuple[list[dict], int]:
+    """Split the chunks listed in *too_long_ids* into smaller chunks.
+
+    The body (text without header) is cut by ``_split_at_boundary``. Each
+    piece becomes a copy of the original chunk with its own ``text``,
+    ``n_chars``, ``sub_index`` and ``chunk_id`` (numbered upwards from the
+    original ``sub_index``). A chunk that yields exactly one piece is kept
+    unchanged.
+
+    Returns:
+        The new chunk list and the number of chunks that were split.
+    """
+    output: list[dict] = []
+    n_split = 0
+
+    for chunk in chunks:
+        pieces = None
+        if chunk["chunk_id"] in too_long_ids:
+            pieces = _split_at_boundary(_strip_prefix(chunk["text"]), max_chars)
+
+        if pieces is None or len(pieces) == 1:
+            output.append(chunk)
+            continue
+
+        id_root   = re.sub(r"__s\d+$", "", chunk["chunk_id"])
+        first_sub = chunk.get("sub_index", 0)
+
+        for offset, piece in enumerate(pieces):
+            sub = first_sub + offset
+            clone = dict(chunk)
+            clone["sub_index"] = sub
+            clone["chunk_id"]  = f"{id_root}__s{sub}"
+            clone["text"]      = _rebuild_text(clone, piece)
+            clone["n_chars"]   = len(clone["text"])
+            output.append(clone)
+
+        n_split += 1
+
+    return output, n_split
+
+
+def renumber_ids(chunks: list[dict]) -> list[dict]:
+    """Renumber ``chunk_id`` suffixes and ``sub_index`` sequentially, in place.
+
+    Chunks are grouped by their id with any trailing ``__s<digits>`` removed;
+    within each group they are numbered from 0 in list order.
+
+    Returns:
+        The same list that was passed in.
+    """
+    next_index: dict[str, int] = {}
+    for chunk in chunks:
+        root = re.sub(r"__s\d+$", "", chunk["chunk_id"])
+        position = next_index.setdefault(root, 0)
+        chunk["chunk_id"]  = f"{root}__s{position}"
+        chunk["sub_index"] = position
+        next_index[root] = position + 1
+    return chunks
+
+
+# ─── Core ─────────────────────────────────────────────────────────────────────
+
+def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bool:
+    """Apply the fixes listed in ``report.json`` to ``chunks.json`` of *stem*.
+
+    Both files are read from ``chunks/<stem>/`` under *project_root*. The
+    fixes run in a fixed order: remove empty chunks, add missing headers,
+    merge incomplete/short chunks, split over-long chunks, then renumber the
+    ids. The result overwrites ``chunks.json``.
+
+    Args:
+        stem: Document name (sub-folder of ``chunks/``).
+        project_root: Directory containing ``chunks/``.
+        max_chars: Maximum body length used when splitting over-long chunks.
+        dry_run: Only print the planned operations, without writing anything.
+
+    Returns:
+        False when ``chunks.json`` or ``report.json`` is missing, True in
+        every other case (including "nothing to do" and dry runs).
+    """
+    stem_dir    = project_root / "chunks" / stem
+    chunks_path = stem_dir / "chunks.json"
+    report_path = stem_dir / "report.json"
+
+    if not chunks_path.exists():
+        print(f"✗ chunks/{stem}/chunks.json non trovato.")
+        print(f"  Esegui prima: python chunks/chunker.py --stem {stem}")
+        return False
+
+    if not report_path.exists():
+        print(f"✗ chunks/{stem}/report.json non trovato.")
+        print(f"  Esegui prima: python chunks/verify_chunks.py --stem {stem}")
+        return False
+
+    chunks: list[dict] = json.loads(chunks_path.read_text(encoding="utf-8"))
+    report: dict       = json.loads(report_path.read_text(encoding="utf-8"))
+
+    verdict = report.get("verdict", "ok")
+    print(f"\nDocumento: {stem}  (verdict: {verdict})")
+
+    if verdict == "ok":
+        print("  ✅ Nessun problema — nulla da correggere.")
+        return True
+
+    # Collect the ids flagged by the report, one set per problem category.
+    empty_ids      = {e["chunk_id"] for e in report.get("blockers", {}).get("empty", [])}
+    no_prefix_ids  = {e["chunk_id"] for e in report.get("blockers", {}).get("no_prefix", [])}
+    incomplete_ids = {e["chunk_id"] for e in report.get("blockers", {}).get("incomplete", [])}
+    too_short_ids  = {e["chunk_id"] for e in report.get("warnings", {}).get("too_short", [])}
+    too_long_ids   = {e["chunk_id"] for e in report.get("warnings", {}).get("too_long", [])}
+
+    # Human-readable plan, printed before anything is changed.
+    ops: list[str] = []
+    if empty_ids:
+        ops.append(f"  🗑  rimuovi {len(empty_ids)} chunk vuoti")
+    if no_prefix_ids:
+        ops.append(f"  🔧 aggiungi prefisso a {len(no_prefix_ids)} chunk")
+    if incomplete_ids:
+        ops.append(f"  🔗 fondi {len(incomplete_ids)} chunk incompleti col successivo")
+    if too_short_ids:
+        ops.append(f"  🔗 fondi {len(too_short_ids)} chunk troppo corti col successivo")
+    if too_long_ids:
+        ops.append(f"  ✂️  spezza {len(too_long_ids)} chunk troppo lunghi")
+
+    if not ops:
+        print("  ✅ Nessuna correzione necessaria.")
+        return True
+
+    print("\n  Operazioni pianificate:")
+    for op in ops:
+        print(op)
+
+    if dry_run:
+        print("\n  [dry-run] Nessuna modifica applicata.")
+        return True
+
+    n_before = len(chunks)
+
+    if empty_ids:
+        chunks, n = fix_empty(chunks, empty_ids)
+        print(f"\n  🗑  Rimossi {n} chunk vuoti.")
+
+    if no_prefix_ids:
+        chunks, n = fix_no_prefix(chunks, no_prefix_ids)
+        print(f"  🔧 Aggiunto prefisso a {n} chunk.")
+
+    # Incomplete and too-short chunks are handled by the same merge pass.
+    merge_ids = incomplete_ids | too_short_ids
+    if merge_ids:
+        chunks, n = fix_incomplete_and_short(chunks, merge_ids)
+        print(f"  🔗 Fusi {n} chunk (incompleti + corti).")
+
+    if too_long_ids:
+        chunks, n = fix_too_long(chunks, too_long_ids, max_chars)
+        print(f"  ✂️  Spezzati {n} chunk lunghi.")
+
+    # Ids are rewritten here, so the report.json on disk no longer matches the
+    # saved chunks until verify_chunks.py is run again.
+    chunks = renumber_ids(chunks)
+
+    n_after = len(chunks)
+    print(f"\n  Totale chunk: {n_before} → {n_after}")
+
+    chunks_path.write_text(
+        json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    print(f"  ✅ Salvato: chunks/{stem}/chunks.json")
+    print(f"\n  Riesegui la verifica:")
+    print(f"     python chunks/verify_chunks.py --stem {stem}")
+
+    return True
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # The project root is the parent of the directory that contains this script.
+    project_root = Path(__file__).parent.parent
+
+    parser = argparse.ArgumentParser(description="Fix chunk")
+    parser.add_argument("--stem", required=True, help="Nome del documento (sottocartella di chunks/)")
+    parser.add_argument(
+        "--max", type=int, default=MAX_CHARS,
+        help=f"Soglia massima caratteri per lo split (default: {MAX_CHARS})"
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="Mostra le operazioni pianificate senza applicarle"
+    )
+    args = parser.parse_args()
+
+    # Exit status mirrors fix_stem: 0 on success, 1 when an input file is missing.
+    ok = fix_stem(args.stem, project_root, args.max, args.dry_run)
+    sys.exit(0 if ok else 1)
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""
+Verifica chunk
+
+Analizza chunks/<stem>/chunks.json e segnala ogni anomalia che potrebbe
+degradare la qualità del retrieval. Non modifica nulla.
+
+Input:  chunks/<stem>/chunks.json
+Output: report a schermo + chunks/<stem>/report.json + exit code (0 = OK, 1 = problemi)
+
+Uso:
+    python chunks/verify_chunks.py --stem documento
+    python chunks/verify_chunks.py                    # tutti i documenti in chunks/
+    python chunks/verify_chunks.py --min 200 --max 800
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+# ─── Soglie ───────────────────────────────────────────────────────────────────
+
+# Default length thresholds, in characters (overridable with --min / --max).
+MIN_CHARS = 200
+MAX_CHARS = 800
+# Matches a closing punctuation, bracket, quote, dash or ellipsis character at
+# the very end of a string; used to decide whether a chunk ends mid-sentence.
+PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026-]$")
+
+
+# ─── Checks ───────────────────────────────────────────────────────────────────
+
+def has_prefix(chunk: dict) -> bool:
+    """Return True when the chunk text, ignoring leading whitespace, starts with ``[``."""
+    text = chunk.get("text", "")
+    return text.lstrip()[:1] == "["
+
+
+def is_empty(chunk: dict) -> bool:
+    """Return True when the chunk has no text or only whitespace."""
+    stripped = chunk.get("text", "").strip()
+    return stripped == ""
+
+
+def is_too_short(chunk: dict, min_chars: int) -> bool:
+    """Return True when the stored ``n_chars`` (0 if absent) is below *min_chars*."""
+    size = chunk.get("n_chars", 0)
+    return min_chars > size
+
+
+def is_too_long(chunk: dict, max_chars: int) -> bool:
+    """Return True when the stored ``n_chars`` (0 if absent) exceeds 1.5 × *max_chars*."""
+    limit = max_chars * 1.5
+    size = chunk.get("n_chars", 0)
+    return limit < size
+
+
+def ends_incomplete(chunk: dict) -> bool:
+    """Return True when the chunk text does not end with closing punctuation.
+
+    Trailing whitespace and trailing Markdown emphasis markers (``_``/``*``)
+    are ignored before testing against ``PUNCT_END``. A chunk with nothing
+    left after that trimming is not reported as incomplete.
+    """
+    trimmed = chunk.get("text", "").rstrip()
+    core = re.sub(r"[_*]+$", "", trimmed).rstrip() if trimmed else ""
+    return bool(core) and PUNCT_END.search(core) is None
+
+
+# ─── Report ───────────────────────────────────────────────────────────────────
+
+def _fmt_chunk(c: dict) -> str:
+    """Format one chunk as an indented report line: id, length and a text preview.
+
+    The preview is the first 60 characters of the text with newlines replaced
+    by spaces.
+    """
+    chunk_id = c.get("chunk_id", "?")
+    length = c.get("n_chars", 0)
+    head = c.get("text", "")[:60]
+    preview = " ".join(head.split("\n"))
+    return f"  [{chunk_id}] ({length} char) «{preview}»"
+
+
+def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -> bool:
+    """Check ``chunks/<stem>/chunks.json`` and write ``report.json`` beside it.
+
+    Every chunk is tested for: empty text, missing context header, length
+    below *min_chars*, length above 1.5 × *max_chars*, and text that does not
+    end with closing punctuation. Empty, header-less and incomplete chunks are
+    blockers; length problems are warnings. The findings are printed together
+    with length statistics and suggested next steps.
+
+    Args:
+        stem: Document name (sub-folder of ``chunks/``).
+        project_root: Directory containing ``chunks/``.
+        min_chars: Minimum acceptable chunk length.
+        max_chars: Nominal maximum chunk length.
+
+    Returns:
+        True when there are no blockers (warnings are allowed); False when
+        blockers exist, ``chunks.json`` is missing, or it holds no chunks.
+    """
+    chunks_path = project_root / "chunks" / stem / "chunks.json"
+
+    print(f"\nDocumento: {stem}")
+
+    if not chunks_path.exists():
+        print(f"  ✗ chunks/{stem}/chunks.json non trovato")
+        print(f"    Esegui prima: python chunks/chunker.py --stem {stem}")
+        return False
+
+    chunks: list[dict] = json.loads(chunks_path.read_text(encoding="utf-8"))
+
+    if not chunks:
+        print(f"  ✗ chunks.json è vuoto")
+        return False
+
+    # ── Collect problems ──────────────────────────────────────────────────────
+
+    empty_chunks = [c for c in chunks if is_empty(c)]
+    no_prefix    = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
+    too_short    = [c for c in chunks if is_too_short(c, min_chars)]
+    too_long     = [c for c in chunks if is_too_long(c, max_chars)]
+    incomplete   = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
+
+    # ── Statistics ────────────────────────────────────────────────────────────
+
+    lengths = [c.get("n_chars", 0) for c in chunks]
+    n_total = len(chunks)
+    # Count flagged chunks by object identity rather than by chunk_id: ids may
+    # be missing or repeated, which would raise KeyError or skew the count.
+    flagged = {
+        id(c)
+        for lst in [empty_chunks, no_prefix, too_short, too_long, incomplete]
+        for c in lst
+    }
+    n_ok    = n_total - len(flagged)
+    min_l = min(lengths)
+    max_l = max(lengths)
+    avg_l = int(sum(lengths) / n_total)
+
+    n_under  = sum(1 for l in lengths if l < min_chars)
+    n_normal = sum(1 for l in lengths if min_chars <= l <= max_chars)
+    n_over   = sum(1 for l in lengths if l > max_chars)
+
+    # ── Output ────────────────────────────────────────────────────────────────
+
+    print(f"  Totale chunk:  {n_total}")
+    print(f"  ✅ OK:         {n_ok}")
+    print()
+    print(f"  Distribuzione lunghezze:")
+    print(f"    Min:   {min_l} char")
+    print(f"    Max:   {max_l} char")
+    print(f"    Media: {avg_l} char")
+    print(f"    < {min_chars} char (sotto MIN): {n_under}")
+    print(f"    {min_chars}–{max_chars} char (ideale):  {n_normal}")
+    print(f"    > {max_chars} char (sopra MAX): {n_over}")
+
+    # Each category lists at most five examples.
+    if empty_chunks:
+        print(f"\n  🔴 {len(empty_chunks)} chunk VUOTI:")
+        for c in empty_chunks[:5]:
+            print(f"  [{c.get('chunk_id', '?')}]")
+        if len(empty_chunks) > 5:
+            print(f"  ... e altri {len(empty_chunks) - 5}")
+
+    if no_prefix:
+        print(f"\n  🔴 {len(no_prefix)} chunk SENZA PREFISSO DI CONTESTO:")
+        for c in no_prefix[:5]:
+            print(_fmt_chunk(c))
+        if len(no_prefix) > 5:
+            print(f"  ... e altri {len(no_prefix) - 5}")
+        print(f"  → Causa probabile: header ### mancanti o malformati nel MD")
+
+    if too_short:
+        print(f"\n  🟡 {len(too_short)} chunk SOTTO MIN_CHARS ({min_chars}):")
+        for c in too_short[:5]:
+            print(_fmt_chunk(c))
+        if len(too_short) > 5:
+            print(f"  ... e altri {len(too_short) - 5}")
+        print(f"  → Soluzione: abbassa MIN_CHARS o revisiona il MD")
+
+    if too_long:
+        print(f"\n  🟡 {len(too_long)} chunk SOPRA MAX_CHARS×1.5 ({int(max_chars * 1.5)}):")
+        for c in too_long[:5]:
+            print(_fmt_chunk(c))
+        if len(too_long) > 5:
+            print(f"  ... e altri {len(too_long) - 5}")
+        print(f"  → Soluzione: alza MAX_CHARS o verifica il testo nel MD")
+
+    if incomplete:
+        print(f"\n  🔴 {len(incomplete)} chunk CHE FINISCONO SENZA PUNTEGGIATURA (frase spezzata):")
+        for c in incomplete[:5]:
+            last_line = c.get("text", "").rstrip().split("\n")[-1][-80:]
+            print(f"  [{c.get('chunk_id', '?')}] ...{last_line!r}")
+        if len(incomplete) > 5:
+            print(f"  ... e altri {len(incomplete) - 5}")
+        print(f"  → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")
+
+    # ── Build and save report.json ────────────────────────────────────────────
+
+    blockers = empty_chunks + no_prefix + incomplete
+    warnings = too_short + too_long
+
+    def _chunk_entry(c: dict) -> dict:
+        # Compact record of a flagged chunk; last_text is the tail (up to 120
+        # characters) of its final line.
+        return {
+            "chunk_id":  c.get("chunk_id", ""),
+            "sezione":   c.get("sezione", ""),
+            "titolo":    c.get("titolo", ""),
+            "n_chars":   c.get("n_chars", 0),
+            "last_text": c.get("text", "").rstrip().split("\n")[-1][-120:],
+        }
+
+    verdict = "ok" if not blockers else "blocked"
+    if not blockers and warnings:
+        verdict = "warnings_only"
+
+    report = {
+        "stem":    stem,
+        "verdict": verdict,
+        "stats": {
+            "total":     n_total,
+            "ok":        n_ok,
+            "min_chars": min_l,
+            "max_chars": max_l,
+            "avg_chars": avg_l,
+        },
+        "thresholds": {"min_chars": min_chars, "max_chars": max_chars},
+        "blockers": {
+            "empty":      [_chunk_entry(c) for c in empty_chunks],
+            "no_prefix":  [_chunk_entry(c) for c in no_prefix],
+            "incomplete": [_chunk_entry(c) for c in incomplete],
+        },
+        "warnings": {
+            "too_short": [_chunk_entry(c) for c in too_short],
+            "too_long":  [_chunk_entry(c) for c in too_long],
+        },
+    }
+
+    out_dir = project_root / "chunks" / stem
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "report.json").write_text(
+        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    print(f"\n  report.json salvato in chunks/{stem}/")
+
+    # ── Next steps ────────────────────────────────────────────────────────────
+
+    print(f"\n  {'─' * 50}")
+    print(f"  PROSSIMI PASSI")
+    print(f"  {'─' * 50}")
+
+    if not blockers and not warnings:
+        print(f"  ✅ Tutto OK — procedi alla vettorizzazione:")
+        print(f"       python step-8/ingest.py --stem {stem}")
+
+    elif not blockers:
+        print(f"  🟡 Solo avvisi minori — puoi procedere alla vettorizzazione:")
+        print(f"       python step-8/ingest.py --stem {stem}")
+        print()
+        print(f"  Oppure, per ottimizzare prima:")
+        if too_short:
+            pct = int(len(too_short) / n_total * 100)
+            print(f"    • {len(too_short)} chunk corti ({pct}% del totale)")
+        if too_long:
+            pct = int(len(too_long) / n_total * 100)
+            print(f"    • {len(too_long)} chunk lunghi ({pct}% del totale)")
+        if too_short or too_long:
+            print(f"      → Esegui: python chunks/fix_chunks.py --stem {stem} --dry-run")
+            print(f"        poi:     python chunks/fix_chunks.py --stem {stem}")
+            print(f"        poi:     python chunks/verify_chunks.py --stem {stem}")
+
+    else:
+        print(f"  🔴 Problemi bloccanti — correggi prima di procedere:")
+        print()
+        if empty_chunks:
+            print(f"    • {len(empty_chunks)} chunk vuoti")
+            print(f"      → Controlla conversione/{stem}/clean.md per sezioni prive di testo")
+        if no_prefix:
+            print(f"    • {len(no_prefix)} chunk senza prefisso di contesto")
+            print(f"      → Controlla che gli header ### siano corretti in conversione/{stem}/clean.md")
+        if incomplete:
+            print(f"    • {len(incomplete)} chunk con frase spezzata")
+            print(f"      → Esegui: python chunks/fix_chunks.py --stem {stem}")
+        # NOTE(review): the sequence printed below re-runs chunker.py --force,
+        # which regenerates chunks.json and would discard edits made by
+        # fix_chunks.py — confirm the intended workflow.
+        print()
+        print(f"  Dopo le correzioni, riesegui nell'ordine:")
+        print(f"       python chunks/chunker.py --stem {stem} --force")
+        print(f"       python chunks/verify_chunks.py --stem {stem}")
+        print()
+        if warnings:
+            print(f"  🟡 Hai anche {len(warnings)} avvisi minori — affrontali dopo aver risolto i 🔴.")
+
+    return not blockers
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # The project root is the parent of the directory that contains this script.
+    project_root = Path(__file__).parent.parent
+
+    parser = argparse.ArgumentParser(description="Verifica chunk")
+    parser.add_argument("--stem", help="Nome del documento (sottocartella di chunks/)")
+    parser.add_argument(
+        "--min", type=int, default=MIN_CHARS,
+        help=f"Soglia minima caratteri (default: {MIN_CHARS})"
+    )
+    parser.add_argument(
+        "--max", type=int, default=MAX_CHARS,
+        help=f"Soglia massima caratteri (default: {MAX_CHARS})"
+    )
+    args = parser.parse_args()
+
+    if args.stem:
+        stems = [args.stem]
+    else:
+        # Without --stem, verify every sub-folder of chunks/ that contains a
+        # chunks.json, in alphabetical order.
+        chunks_dir = project_root / "chunks"
+        if not chunks_dir.exists():
+            print(f"Errore: cartella chunks/ non trovata in {project_root}")
+            sys.exit(1)
+        stems = sorted(
+            p.name for p in chunks_dir.iterdir()
+            if p.is_dir() and (p / "chunks.json").exists()
+        )
+        if not stems:
+            print("Errore: nessun chunks.json trovato in chunks/")
+            sys.exit(1)
+
+    results = [verify_stem(s, project_root, args.min, args.max) for s in stems]
+
+    # Exit status is 0 only when no document has blocking problems.
+    ok    = sum(results)
+    total = len(results)
+    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti senza problemi")
+    sys.exit(0 if all(results) else 1)