feat(chunks): ottimizzazione chunking e post-processing

- chunker.py: scrive meta.json con strategia e soglie effettive (target, min_chars, max_chars) per ogni documento chunked
- verify_chunks.py:
  * _load_thresholds(): legge min/max da meta.json invece del TARGET_CHARS globale, eliminando il mismatch tra soglie chunker e verify (h3_aware target=600 -> range 450-750, non piu' validato a 225-375)
  * _ROMAN_END: esclude numeri romani finali (XV, XIV...) dagli incompleti perche' sono artefatti indice PDF, non frasi spezzate
  * PUNCT_END: aggiunge ; come fine valida (clausole legali italiane)
- fix_chunks.py:
  * _load_thresholds(): usa max_chars da meta.json per split coerente
  * _SECONDARY_END: split secondario su ; per testo legale multi-clausola
  * Fase 1 (convergenza): risolve solo blockers (incomplete, empty, no_prefix) senza toccare warnings -- elimina il ciclo merge->too_long->split->incomplete->merge
  * Fase 2 (finale): una sola passata di merge too_short + split too_long dopo che i blockers sono azzerati

Risultato su dirittopenale: da blocked (265 incomplete) a warnings_only in 2 iterazioni, senza cicli infiniti.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-12 11:09:28 +02:00
parent 587238f9f5
commit 5b63c423cc
3 changed files with 129 additions and 57 deletions
@@ -31,16 +31,28 @@ import config as cfg
 MIN_CHARS = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE))
 MAX_CHARS = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE))
 PUNCT_END = re.compile(
-    r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
+    r"[.!?\xbb)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
    r"|/$"    # URL che finisce con /
    r"|\|$"   # riga di tabella Markdown
+    r"|;$"    # fine clausola legale (testo giuridico)
    r"|:$"    # introduzione a lista o formula
 )
 _HEX_END     = re.compile(r"[0-9a-fA-F]{8,}$")
 _URL_TAIL    = re.compile(r"https?://\S+(\s+\S+){0,3}$")  # URL con fino a 3 token extra
 _MATH_SYMS   = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")
+_ROMAN_END   = re.compile(r"\b(I{1,3}|IV|VI{0,3}|IX|XI{0,2}|XIV|XV|XVI{0,2}|XIX|XX{0,2})$")


+
+def _load_thresholds(stem_dir: "Path") -> "tuple[int, int]":
+    """Legge min/max da meta.json (scritto dal chunker) o usa i default da config."""
+    meta = stem_dir / "meta.json"
+    if meta.exists():
+        import json as _json
+        m = _json.loads(meta.read_text(encoding="utf-8"))
+        return m["min_chars"], m["max_chars"]
+    return MIN_CHARS, MAX_CHARS
+
 # ─── Checks ───────────────────────────────────────────────────────────────────

 def has_prefix(chunk: dict) -> bool:
@@ -70,6 +82,8 @@ def ends_incomplete(chunk: dict) -> bool:
        return False
    if _HEX_END.search(text_check):   # hash SHA / codice hex
        return False
+    if _ROMAN_END.search(text_check):  # numero romano finale (indice/riferimento PDF)
+        return False
    if _URL_TAIL.search(text_check[-200:]):  # URL (con eventuale path dopo spazio)
        return False
    return True
@@ -90,7 +104,9 @@ def _fmt_chunk(c: dict) -> str:


 def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -> bool:
-    chunks_path = project_root / "chunks" / stem / "chunks.json"
+    stem_dir    = project_root / "chunks" / stem
+    chunks_path = stem_dir / "chunks.json"
+    min_chars, max_chars = _load_thresholds(stem_dir)

    print(f"\nDocumento: {stem}")