feat(chunks): aggiungi pipeline chunking consolidata

Nuova cartella chunks/ con chunker.py (step 5), verify_chunks.py e fix_chunks.py (step 6). Tutto l'I/O va in chunks/<stem>/ invece di step-5/ e step-6/ separati. Input: conversione/<stem>/clean.md
2026-04-20 11:36:18 +02:00
parent 5215f53ad0
commit 4c0e0db2a5
3 changed files with 999 additions and 0 deletions
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+Fix chunk
+
+Applica correzioni dirette su chunks/<stem>/chunks.json basandosi sul
+report.json prodotto da verify_chunks.py. Non tocca clean.md.
+
+Fixes applicati:
+  empty      → rimuove il chunk
+  incomplete → fonde con il chunk successivo (la frase continua)
+  no_prefix  → aggiunge prefisso [sezione > titolo] se mancante
+  too_short  → fonde con il chunk successivo (anche se appartiene a un'altra sezione)
+  too_long   → spezza all'ultimo confine di paragrafo/frase entro MAX_CHARS
+
+Input:  chunks/<stem>/chunks.json  +  chunks/<stem>/report.json
+Output: chunks/<stem>/chunks.json  (sovrascrive)
+
+Uso:
+    python chunks/fix_chunks.py --stem documento
+    python chunks/fix_chunks.py --stem documento --dry-run
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+# Default upper bound, in characters, used when splitting oversized chunks;
+# it is the default of the --max command-line option.
+MAX_CHARS = 800
+# Matches text whose final character is sentence-ending punctuation, a closing
+# quote/bracket, or a dash.
+# NOTE(review): not referenced by the code shown here — presumably kept in sync
+# with the verifier's notion of a "complete" chunk; confirm before removing.
+PUNCT_END = re.compile(r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013-]$")
+
+
+# ─── Helpers ──────────────────────────────────────────────────────────────────
+
+def _prefix(chunk: dict) -> str:
+    """Return the bracketed context label for *chunk*.
+
+    The label is ``[sezione > titolo]`` when the chunk has a non-empty
+    ``titolo`` and ``[sezione]`` otherwise. Missing keys count as empty.
+    """
+    section = chunk.get("sezione", "")
+    title = chunk.get("titolo", "")
+    label = f"{section} > {title}" if title else f"{section}"
+    return f"[{label}]"
+
+
+def _strip_prefix(text: str) -> str:
+    """Return *text* without its leading ``[...]`` label, if it has one.
+
+    Leading whitespace is always removed. When the remaining text starts with
+    ``[`` and contains a ``]``, everything up to and including the first ``]``
+    is dropped, together with the newlines that immediately follow it.
+
+    NOTE(review): any leading bracketed span is treated as a prefix, so a body
+    that legitimately starts with ``[`` would lose that span — confirm that
+    chunk bodies never start with a bracket.
+    """
+    trimmed = text.lstrip()
+    if not trimmed.startswith("["):
+        return trimmed
+
+    _label, closing, rest = trimmed.partition("]")
+    if not closing:
+        # Opening bracket without a closing one: nothing to strip.
+        return trimmed
+    return rest.lstrip("\n")
+
+
+def _rebuild_text(chunk: dict, body: str) -> str:
+    """Return the chunk text: the chunk's label, a newline, then *body*."""
+    header = _prefix(chunk)
+    return f"{header}\n{body}"
+
+
+def _split_at_boundary(text: str, max_chars: int) -> list[str]:
+    """Split *text* into pieces of at most *max_chars* characters where possible.
+
+    Each cut is placed at the best boundary found inside the first
+    *max_chars* characters of the remaining text, tried in this order:
+
+    1. the last blank line (``"\\n\\n"``);
+    2. the end of the last sentence (``.``, ``!``, ``?`` or ``»`` followed by
+       whitespace);
+    3. the last space or newline, so a piece never overshoots the limit
+       while a word boundary is available inside it;
+    4. the first space *after* the limit — only a single unbreakable token
+       longer than *max_chars* can produce an oversized piece.
+
+    Args:
+        text: The text to split.
+        max_chars: Target maximum length of each piece.
+
+    Returns:
+        The pieces in order, stripped at the cut points; whitespace-only
+        pieces are dropped. Text that already fits is returned unchanged as a
+        single-element list.
+    """
+    if len(text) <= max_chars:
+        return [text]
+
+    parts: list[str] = []
+    remaining = text
+
+    while len(remaining) > max_chars:
+        window = remaining[:max_chars]
+
+        # 1. Paragraph boundary. A hit at offset 0 would yield an empty piece,
+        #    so it is treated like "not found".
+        cut = window.rfind("\n\n")
+
+        # 2. Sentence boundary: end of the last match inside the window.
+        if cut <= 0:
+            sentence_ends = [m.end() for m in re.finditer(r"[.!?»]\s+", window)]
+            cut = sentence_ends[-1] if sentence_ends else 0
+
+        # 3. Word boundary inside the window, so the piece stays within the limit.
+        if cut <= 0:
+            cut = max(window.rfind(" "), window.rfind("\n"))
+
+        # 4. No boundary inside the window: keep the unbreakable token whole
+        #    and cut at the next space, or take everything that is left.
+        if cut <= 0:
+            next_space = remaining.find(" ", max_chars)
+            cut = next_space if next_space != -1 else len(remaining)
+
+        parts.append(remaining[:cut].rstrip())
+        remaining = remaining[cut:].lstrip()
+
+    if remaining:
+        parts.append(remaining)
+
+    return [p for p in parts if p.strip()]
+
+
+# ─── Operazioni sui chunk ─────────────────────────────────────────────────────
+
+def fix_empty(chunks: list[dict], empty_ids: set[str]) -> tuple[list[dict], int]:
+    """Drop every chunk whose ``chunk_id`` is in *empty_ids*.
+
+    Returns a new list with the surviving chunks (the input list is left
+    untouched) and the number of chunks removed.
+    """
+    kept: list[dict] = []
+    for chunk in chunks:
+        if chunk["chunk_id"] in empty_ids:
+            continue
+        kept.append(chunk)
+
+    removed = len(chunks) - len(kept)
+    return kept, removed
+
+
+def fix_no_prefix(chunks: list[dict], no_prefix_ids: set[str]) -> tuple[list[dict], int]:
+    """Rewrite the text of the listed chunks so it starts with their label.
+
+    For every chunk whose ``chunk_id`` is in *no_prefix_ids*, any existing
+    leading ``[...]`` span is stripped, the label built from the chunk's
+    ``sezione``/``titolo`` is prepended, and ``n_chars`` is refreshed.
+    Chunks are modified in place.
+
+    Returns the same list object and the number of chunks rewritten.
+    """
+    fixed = 0
+    for chunk in chunks:
+        if chunk["chunk_id"] not in no_prefix_ids:
+            continue
+        bare_body = _strip_prefix(chunk["text"])
+        chunk["text"] = _rebuild_text(chunk, bare_body)
+        chunk["n_chars"] = len(chunk["text"])
+        fixed += 1
+    return chunks, fixed
+
+
+def fix_incomplete_and_short(chunks: list[dict],
+                              problem_ids: set[str]) -> tuple[list[dict], int]:
+    """Fold each flagged chunk into the chunk that follows it.
+
+    A chunk whose ``chunk_id`` is in *problem_ids* and that has a successor is
+    removed from the output; its body (label stripped) is placed in front of
+    the successor's body, separated by a newline. The successor keeps its own
+    metadata and label, and its ``text``/``n_chars`` are updated in place.
+    No check is made that the two chunks belong to the same section. A flagged
+    chunk in last position is kept unchanged.
+
+    Returns the list of surviving chunks and the number of merges performed.
+    """
+    kept: list[dict] = []
+    merge_count = 0
+    last_position = len(chunks) - 1
+
+    for position, current in enumerate(chunks):
+        if current["chunk_id"] in problem_ids and position < last_position:
+            successor = chunks[position + 1]
+            head = _strip_prefix(current["text"]).rstrip()
+            tail = _strip_prefix(successor["text"]).lstrip()
+            # The successor absorbs the flagged chunk; if it is flagged too it
+            # will in turn be folded into its own successor on the next pass.
+            successor["text"] = _rebuild_text(successor, head + "\n" + tail)
+            successor["n_chars"] = len(successor["text"])
+            merge_count += 1
+            continue
+        kept.append(current)
+
+    return kept, merge_count
+
+
+def fix_too_long(chunks: list[dict],
+                 too_long_ids: set[str],
+                 max_chars: int) -> tuple[list[dict], int]:
+    """Split every chunk listed in *too_long_ids* into smaller chunks.
+
+    The body of a flagged chunk (label stripped) is cut at paragraph/sentence
+    boundaries, and each piece becomes a shallow copy of the original chunk
+    with its own ``sub_index``, ``chunk_id``, ``text`` and ``n_chars``.
+
+    The stored text is ``"<label>\\n<body>"``, so the room taken by the label
+    line is subtracted from *max_chars* before splitting. Without that, every
+    piece could exceed the limit by the label length, and a chunk whose body
+    alone fits would never be split.
+
+    Args:
+        chunks: All chunks, in document order.
+        too_long_ids: ``chunk_id`` values of the chunks to split.
+        max_chars: Maximum length of a chunk's full text, label included.
+
+    Returns:
+        A new list of chunks and the number of chunks that were actually split.
+        The ids assigned here (``<base>__s<sub_index + j>``) may clash with
+        neighbouring chunks; callers are expected to renumber afterwards.
+    """
+    result: list[dict] = []
+    split_count = 0
+
+    for c in chunks:
+        if c["chunk_id"] not in too_long_ids:
+            result.append(c)
+            continue
+
+        # Budget left for the body once the label line ("<label>\n") is added.
+        body_budget = max_chars - (len(_prefix(c)) + 1)
+        if body_budget < 1:
+            # Label alone is as long as the limit: fall back to the raw limit.
+            body_budget = max_chars
+
+        body  = _strip_prefix(c["text"])
+        parts = _split_at_boundary(body, body_budget)
+
+        # Nothing to split (or nothing left after splitting): keep the chunk
+        # as it is instead of silently dropping it.
+        if len(parts) <= 1:
+            result.append(c)
+            continue
+
+        base_id  = re.sub(r"__s\d+$", "", c["chunk_id"])
+        base_sub = c.get("sub_index", 0)
+
+        for j, part in enumerate(parts):
+            new_chunk = dict(c)
+            new_chunk["sub_index"] = base_sub + j
+            new_chunk["chunk_id"]  = f"{base_id}__s{base_sub + j}"
+            new_chunk["text"]      = _rebuild_text(new_chunk, part)
+            new_chunk["n_chars"]   = len(new_chunk["text"])
+            result.append(new_chunk)
+
+        split_count += 1
+
+    return result, split_count
+
+
+def renumber_ids(chunks: list[dict]) -> list[dict]:
+    """Give every chunk a sequential ``__s<n>`` suffix within its base id.
+
+    The base id is the ``chunk_id`` with any trailing ``__s<digits>`` removed.
+    Chunks sharing a base id are numbered 0, 1, 2, ... in list order; both
+    ``chunk_id`` and ``sub_index`` are overwritten in place.
+
+    Returns the same list object.
+    """
+    next_index: dict[str, int] = {}
+    for chunk in chunks:
+        base_id = re.sub(r"__s\d+$", "", chunk["chunk_id"])
+        position = next_index.get(base_id, 0)
+        next_index[base_id] = position + 1
+        chunk["chunk_id"] = f"{base_id}__s{position}"
+        chunk["sub_index"] = position
+    return chunks
+
+
+# ─── Core ─────────────────────────────────────────────────────────────────────
+
+def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bool:
+    """Apply the fixes listed in a document's report to its chunks file.
+
+    Reads ``chunks/<stem>/chunks.json`` and ``chunks/<stem>/report.json``
+    under *project_root*, prints the planned operations and, unless *dry_run*
+    is set, applies them in this order: remove empty chunks, restore labels,
+    merge incomplete/short chunks into their successor, split long chunks,
+    renumber ids. The chunks file is then overwritten.
+
+    Returns ``False`` when one of the two input files is missing, ``True``
+    in every other case (including "nothing to do" and dry runs).
+    """
+    stem_dir    = project_root / "chunks" / stem
+    chunks_path = stem_dir / "chunks.json"
+    report_path = stem_dir / "report.json"
+
+    # Both inputs are mandatory; tell the user which step produces the missing one.
+    if not chunks_path.exists():
+        print(f"✗ chunks/{stem}/chunks.json non trovato.")
+        print(f"  Esegui prima: python chunks/chunker.py --stem {stem}")
+        return False
+
+    if not report_path.exists():
+        print(f"✗ chunks/{stem}/report.json non trovato.")
+        print(f"  Esegui prima: python chunks/verify_chunks.py --stem {stem}")
+        return False
+
+    chunks: list[dict] = json.loads(chunks_path.read_text(encoding="utf-8"))
+    report: dict       = json.loads(report_path.read_text(encoding="utf-8"))
+
+    verdict = report.get("verdict", "ok")
+    print(f"\nDocumento: {stem}  (verdict: {verdict})")
+
+    if verdict == "ok":
+        print("  ✅ Nessun problema — nulla da correggere.")
+        return True
+
+    def ids_of(group: str, issue: str) -> set[str]:
+        # Chunk ids reported under report[group][issue]; missing keys mean none.
+        return {entry["chunk_id"] for entry in report.get(group, {}).get(issue, [])}
+
+    empty_ids      = ids_of("blockers", "empty")
+    no_prefix_ids  = ids_of("blockers", "no_prefix")
+    incomplete_ids = ids_of("blockers", "incomplete")
+    too_short_ids  = ids_of("warnings", "too_short")
+    too_long_ids   = ids_of("warnings", "too_long")
+
+    # One line per issue kind that actually has entries, in execution order.
+    planned = [
+        (empty_ids,      f"  🗑  rimuovi {len(empty_ids)} chunk vuoti"),
+        (no_prefix_ids,  f"  🔧 aggiungi prefisso a {len(no_prefix_ids)} chunk"),
+        (incomplete_ids, f"  🔗 fondi {len(incomplete_ids)} chunk incompleti col successivo"),
+        (too_short_ids,  f"  🔗 fondi {len(too_short_ids)} chunk troppo corti col successivo"),
+        (too_long_ids,   f"  ✂️  spezza {len(too_long_ids)} chunk troppo lunghi"),
+    ]
+    ops: list[str] = [message for ids, message in planned if ids]
+
+    if not ops:
+        print("  ✅ Nessuna correzione necessaria.")
+        return True
+
+    print("\n  Operazioni pianificate:")
+    for line in ops:
+        print(line)
+
+    if dry_run:
+        print("\n  [dry-run] Nessuna modifica applicata.")
+        return True
+
+    n_before = len(chunks)
+
+    if empty_ids:
+        chunks, removed = fix_empty(chunks, empty_ids)
+        print(f"\n  🗑  Rimossi {removed} chunk vuoti.")
+
+    if no_prefix_ids:
+        chunks, relabelled = fix_no_prefix(chunks, no_prefix_ids)
+        print(f"  🔧 Aggiunto prefisso a {relabelled} chunk.")
+
+    # Incomplete and too-short chunks get the same treatment: merge forward.
+    merge_ids = incomplete_ids | too_short_ids
+    if merge_ids:
+        chunks, merged = fix_incomplete_and_short(chunks, merge_ids)
+        print(f"  🔗 Fusi {merged} chunk (incompleti + corti).")
+
+    if too_long_ids:
+        chunks, split = fix_too_long(chunks, too_long_ids, max_chars)
+        print(f"  ✂️  Spezzati {split} chunk lunghi.")
+
+    # Merges and splits leave gaps/duplicates in the ids; make them sequential.
+    chunks = renumber_ids(chunks)
+
+    n_after = len(chunks)
+    print(f"\n  Totale chunk: {n_before} → {n_after}")
+
+    payload = json.dumps(chunks, ensure_ascii=False, indent=2)
+    chunks_path.write_text(payload, encoding="utf-8")
+    print(f"  ✅ Salvato: chunks/{stem}/chunks.json")
+    print(f"\n  Riesegui la verifica:")
+    print(f"     python chunks/verify_chunks.py --stem {stem}")
+
+    return True
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, run the fixer for one
+    # document and exit with 0 on success, 1 on failure.
+    cli = argparse.ArgumentParser(description="Fix chunk")
+    cli.add_argument("--stem", required=True, help="Nome del documento (sottocartella di chunks/)")
+    cli.add_argument(
+        "--max", type=int, default=MAX_CHARS,
+        help=f"Soglia massima caratteri per lo split (default: {MAX_CHARS})"
+    )
+    cli.add_argument(
+        "--dry-run", action="store_true",
+        help="Mostra le operazioni pianificate senza applicarle"
+    )
+    opts = cli.parse_args()
+
+    # The project root is the directory that contains this script's folder.
+    project_root = Path(__file__).parent.parent
+
+    succeeded = fix_stem(opts.stem, project_root, opts.max, opts.dry_run)
+    exit_code = 0 if succeeded else 1
+    sys.exit(exit_code)