feat(chunks): target-based chunking con config centralizzata

Introduce chunks/config.py come unica fonte di verità per tutti i parametri della pipeline di chunking. TARGET_CHARS + CHUNK_TOLERANCE sostituiscono MIN_CHARS/MAX_CHARS: il chunker mira a una dimensione target e si avvicina il più possibile rispettando il vincolo assoluto di terminare ogni chunk su un confine di frase (punto/punteggiatura).

- config.py: TARGET_CHARS, CHUNK_TOLERANCE, SPLIT_THRESHOLD_FACTOR, PROTECT_TABLES, FIX_MAX_ITERATIONS, STRATEGY_OVERRIDES per strategia
- chunker.py: algoritmo target-based (emit quando frase successiva sfora upper_body = upper - prefix_len), table protection atomica, override MIN/MAX/overlap per ciascuna delle 4 strategie
- verify_chunks.py: soglie derivate da target*(1±tolerance)
- fix_chunks.py: _split_at_boundary sempre su punteggiatura finale, loop ricorsivo fix→verify fino a FIX_MAX_ITERATIONS, split solo per chunk > upper × SPLIT_THRESHOLD_FACTOR

Risultato su bitcoin: 694 chunk, 0 incompleti, 83% in range [450,750], tutti terminanti su punteggiatura indipendentemente dalla dimensione.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 15:45:24 +02:00
parent 508587c5bf
commit 02c785678d
4 changed files with 342 additions and 130 deletions
@@ -20,11 +20,16 @@ import re
 import sys
 from pathlib import Path

+_HERE = Path(__file__).resolve().parent
+if str(_HERE) not in sys.path:
+    sys.path.insert(0, str(_HERE))
+import config as cfg

-# ─── Soglie ───────────────────────────────────────────────────────────────────

-MIN_CHARS = 200
-MAX_CHARS = 800
+# ─── Soglie (derivate dal target, sovrascrivibili da CLI) ────────────────────
+
+MIN_CHARS = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE))
+MAX_CHARS = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE))
 PUNCT_END = re.compile(
    r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
    r"|/$"    # URL che finisce con /
@@ -51,7 +56,7 @@ def is_too_short(chunk: dict, min_chars: int) -> bool:


 def is_too_long(chunk: dict, max_chars: int) -> bool:
-    return chunk.get("n_chars", 0) > max_chars * 1.5
+    return chunk.get("n_chars", 0) > max_chars


 def ends_incomplete(chunk: dict) -> bool:
@@ -72,7 +77,7 @@ def ends_incomplete(chunk: dict) -> bool:

 def is_math_incomplete(chunk: dict) -> bool:
    """Incompleto ma in contesto matematico — degrada a warning invece di blocker."""
-    return ends_incomplete(chunk) and len(_MATH_SYMS.findall(chunk.get("text", ""))) >= 3
+    return ends_incomplete(chunk) and len(_MATH_SYMS.findall(chunk.get("text", ""))) >= cfg.MATH_SYMS_MIN


 # ─── Report ───────────────────────────────────────────────────────────────────
@@ -170,12 +175,12 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -

    if too_long:
        has_errors = True
-        print(f"\n  🟡 {len(too_long)} chunk SOPRA MAX_CHARS×1.5 ({int(max_chars * 1.5)}):")
+        print(f"\n  🟡 {len(too_long)} chunk SOPRA MAX ({max_chars}):")
        for c in too_long[:5]:
            print(_fmt_chunk(c))
        if len(too_long) > 5:
            print(f"  ... e altri {len(too_long) - 5}")
-        print(f"  → Soluzione: alza MAX_CHARS o verifica il testo nel MD")
+        print(f"  → Causa probabile: frasi singole lunghe (liste/paragrafi non suddivisibili)")

    if incomplete:
        has_errors = True
@@ -225,7 +230,12 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
            "max_chars": max_l,
            "avg_chars": avg_l,
        },
-        "thresholds": {"min_chars": min_chars, "max_chars": max_chars},
+        "thresholds": {
+            "min_chars": min_chars,
+            "max_chars": max_chars,
+            "target_chars": cfg.TARGET_CHARS,
+            "chunk_tolerance": cfg.CHUNK_TOLERANCE,
+        },
        "blockers": {
            "empty":      [_chunk_entry(c) for c in empty_chunks],
            "no_prefix":  [_chunk_entry(c) for c in no_prefix],
@@ -301,13 +311,15 @@ if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Verifica chunk")
    parser.add_argument("--stem", help="Nome del documento (sottocartella di chunks/)")
+    _min_def = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE))
+    _max_def = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE))
    parser.add_argument(
-        "--min", type=int, default=MIN_CHARS,
-        help=f"Soglia minima caratteri (default: {MIN_CHARS})"
+        "--min", type=int, default=_min_def,
+        help=f"Soglia minima caratteri (default: TARGET×(1-TOL) = {_min_def})"
    )
    parser.add_argument(
-        "--max", type=int, default=MAX_CHARS,
-        help=f"Soglia massima caratteri (default: {MAX_CHARS})"
+        "--max", type=int, default=_max_def,
+        help=f"Soglia massima caratteri (default: TARGET×(1+TOL) = {_max_def})"
    )
    args = parser.parse_args()