diff --git a/chunks/chunker.py b/chunks/chunker.py index 2a07f57..9fa7b84 100644 --- a/chunks/chunker.py +++ b/chunks/chunker.py @@ -439,6 +439,12 @@ def process_stem(stem: str, project_root: Path, force: bool) -> bool: lower = int(target * (1 - tolerance)) upper = int(target * (1 + tolerance)) + meta = {"strategy": strategia, "target_chars": target, + "min_chars": lower, "max_chars": upper} + (out_dir / "meta.json").write_text( + json.dumps(meta, ensure_ascii=False), encoding="utf-8" + ) + lengths = [c["n_chars"] for c in chunks] min_c = min(lengths) max_c = max(lengths) diff --git a/chunks/fix_chunks.py b/chunks/fix_chunks.py index 794dc2b..45b6862 100644 --- a/chunks/fix_chunks.py +++ b/chunks/fix_chunks.py @@ -21,6 +21,8 @@ Uso: """ import argparse +import contextlib +import io import json import re import sys @@ -33,6 +35,15 @@ import config as cfg from verify_chunks import verify_stem as _verify_stem MAX_CHARS = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE)) + + +def _load_thresholds(stem_dir: Path) -> int: + """Legge max_chars da meta.json (scritto dal chunker) o usa il default da config.""" + meta = stem_dir / "meta.json" + if meta.exists(): + import json as _json + return _json.loads(meta.read_text(encoding="utf-8"))["max_chars"] + return MAX_CHARS PUNCT_END = re.compile(r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013-]$") @@ -59,11 +70,20 @@ def _rebuild_text(chunk: dict, body: str) -> str: return f"{_prefix(chunk)}\n{body}" -_SENT_END = re.compile(r'[.!?»)\]\'"’”…]') +# Fine frase forte: . ! ? seguiti da spazio + maiuscola o virgolette. +# Non usare punteggiatura debole (,;:)>>]) per non creare chunk incompleti. +_STRONG_END = re.compile( + r'[.!?\xbb]\s+(?=[A-Z\xc0-\xd6\xd8-\xde\xc0-\xff\xab\x22\x27(])' +) +_SECONDARY_END = re.compile(r';\s+') def _split_at_boundary(text: str, max_chars: int) -> list[str]: - """Spezza text in parti ≤ max_chars, ciascuna terminante su punteggiatura.""" + """Spezza text in parti ≤ max_chars su confini di frase forti (.!?). + + Se non trova un confine forte entro max_chars, NON spezza: meglio un + chunk too_long (warning) che un chunk incompleto (blocker). + """ if len(text) <= max_chars: return [text] @@ -73,27 +93,28 @@ def _split_at_boundary(text: str, max_chars: int) -> list[str]: while len(remaining) > max_chars: candidate = remaining[:max_chars] - # Cerca l'ultima punteggiatura finale entro max_chars. - last_punct = -1 - for m in _SENT_END.finditer(candidate): - last_punct = m.end() # posizione dopo il carattere di punteggiatura + last_pos = -1 + for m in _STRONG_END.finditer(candidate): + last_pos = m.start() + 1 # posizione dopo il carattere terminatore - if last_punct > 0: - # Taglia dopo la punteggiatura; il resto inizia alla parola successiva. - first = remaining[:last_punct].rstrip() - remaining = remaining[last_punct:].lstrip() + if last_pos > 0: + first = remaining[:last_pos].rstrip() + remaining = remaining[last_pos:].lstrip() + if first: + parts.append(first) else: - # Nessuna punteggiatura: taglia all'ultimo spazio disponibile. - sp = candidate.rfind(" ") - if sp > 0: - first = remaining[:sp].rstrip() - remaining = remaining[sp:].lstrip() + # Prova confine secondario: ; + spazio (clausole legali) + sec_pos = -1 + for m in _SECONDARY_END.finditer(candidate): + sec_pos = m.start() + 1 + if sec_pos > 0: + first = remaining[:sec_pos].rstrip() + remaining = remaining[sec_pos:].lstrip() + if first: + parts.append(first) else: - first = remaining[:max_chars] - remaining = remaining[max_chars:] - - if first: - parts.append(first) + # Nessun confine: lascia il chunk intero (too_long > incomplete) + break if remaining: parts.append(remaining) @@ -191,10 +212,12 @@ def renumber_ids(chunks: list[dict]) -> list[dict]: # ─── Core ───────────────────────────────────────────────────────────────────── -def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bool: +def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool, + max_iter: int = 10) -> bool: stem_dir = project_root / "chunks" / stem chunks_path = stem_dir / "chunks.json" report_path = stem_dir / "report.json" + max_chars = _load_thresholds(stem_dir) if not chunks_path.exists(): print(f"✗ chunks/{stem}/chunks.json non trovato.") @@ -213,7 +236,7 @@ def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bo print(f"\nDocumento: {stem} (verdict: {verdict})") if verdict == "ok": - print(" ✅ Nessun problema — nulla da correggere.") + print(" ✅ Nessun problema - nulla da correggere.") return True empty_ids = {e["chunk_id"] for e in report.get("blockers", {}).get("empty", [])} @@ -256,54 +279,77 @@ def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bo n_before = len(chunks) - def _apply_fixes(chunks: list[dict], report: dict) -> list[dict]: - empty_ids_ = {e["chunk_id"] for e in report.get("blockers", {}).get("empty", [])} - no_prefix_ids_ = {e["chunk_id"] for e in report.get("blockers", {}).get("no_prefix", [])} - incomplete_ids_= {e["chunk_id"] for e in report.get("blockers", {}).get("incomplete", [])} - too_short_ids_ = {e["chunk_id"] for e in report.get("warnings", {}).get("too_short", [])} - too_long_ids_ = { - e["chunk_id"] - for e in report.get("warnings", {}).get("too_long", []) - if e.get("n_chars", 0) > max_chars * cfg.SPLIT_THRESHOLD_FACTOR - } - + def _fix_blockers(chunks: list[dict], report: dict) -> list[dict]: + """Risolve solo i blockers (incomplete, empty, no_prefix) senza toccare warnings.""" + empty_ids_ = {e["chunk_id"] for e in report.get("blockers", {}).get("empty", [])} + no_prefix_ids_ = {e["chunk_id"] for e in report.get("blockers", {}).get("no_prefix", [])} + incomplete_ids_ = {e["chunk_id"] for e in report.get("blockers", {}).get("incomplete", [])} if empty_ids_: chunks, n = fix_empty(chunks, empty_ids_) print(f" 🗑 Rimossi {n} chunk vuoti.") if no_prefix_ids_: chunks, n = fix_no_prefix(chunks, no_prefix_ids_) print(f" 🔧 Aggiunto prefisso a {n} chunk.") - merge_ids_ = incomplete_ids_ | too_short_ids_ - if merge_ids_: - chunks, n = fix_incomplete_and_short(chunks, merge_ids_) - print(f" 🔗 Fusi {n} chunk (incompleti + corti).") + if incomplete_ids_: + chunks, n = fix_incomplete_and_short(chunks, incomplete_ids_) + print(f" 🔗 Fusi {n} chunk incompleti.") + return renumber_ids(chunks) + + def _fix_warnings(chunks: list[dict], report: dict) -> list[dict]: + """Applica fix opzionali: merge too_short e split too_long.""" + too_short_ids_ = {e["chunk_id"] for e in report.get("warnings", {}).get("too_short", [])} + too_long_ids_ = { + e["chunk_id"] + for e in report.get("warnings", {}).get("too_long", []) + if e.get("n_chars", 0) > max_chars * cfg.SPLIT_THRESHOLD_FACTOR + } + if too_short_ids_: + chunks, n = fix_incomplete_and_short(chunks, too_short_ids_) + print(f" 🔗 Fusi {n} chunk troppo corti.") if too_long_ids_: chunks, n = fix_too_long(chunks, too_long_ids_, max_chars) print(f" ✂️ Spezzati {n} chunk lunghi.") return renumber_ids(chunks) - chunks = _apply_fixes(chunks, report) + # Fase 1: risolvi blockers a convergenza (solo merge incomplete) + chunks = _fix_blockers(chunks, report) - for iteration in range(1, cfg.FIX_MAX_ITERATIONS): + _min = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE)) + _max = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE)) + prev_blockers = sum(len(v) for v in report.get("blockers", {}).values()) + + for iteration in range(1, max_iter + 1): chunks_path.write_text( json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8" ) - project_root = chunks_path.parent.parent.parent - _min = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE)) - _max = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE)) - _verify_stem(stem, project_root, _min, _max) + with contextlib.redirect_stdout(io.StringIO()): + _verify_stem(stem, project_root, _min, _max) report = json.loads(report_path.read_text(encoding="utf-8")) new_verdict = report.get("verdict", "ok") - if new_verdict in ("ok", "warnings_only"): + curr_blockers = sum(len(v) for v in report.get("blockers", {}).values()) + + if new_verdict in ("ok", "warnings_only") or curr_blockers == 0: break - remaining_blockers = sum( - len(v) for v in report.get("blockers", {}).values() - ) - if remaining_blockers == 0: + if curr_blockers >= prev_blockers: + print(f"\n ⚠️ Nessun miglioramento ({curr_blockers} blockers) - i restanti richiedono correzione manuale del clean.md.") break - print(f"\n Iterazione {iteration + 1}/{cfg.FIX_MAX_ITERATIONS} " - f"— {remaining_blockers} bloccer residui:") - chunks = _apply_fixes(chunks, report) + + print(f"\n Iterazione {iteration + 1} - {curr_blockers} blockers residui:") + prev_blockers = curr_blockers + chunks = _fix_blockers(chunks, report) + + # Fase 2: fix warnings (too_short merge + too_long split) - una sola passata finale + with contextlib.redirect_stdout(io.StringIO()): + _verify_stem(stem, project_root, _min, _max) + report = json.loads(report_path.read_text(encoding="utf-8")) + n_short = len(report.get("warnings", {}).get("too_short", [])) + n_long = sum( + 1 for e in report.get("warnings", {}).get("too_long", []) + if e.get("n_chars", 0) > max_chars * cfg.SPLIT_THRESHOLD_FACTOR + ) + if n_short or n_long: + print(f"\n Fix warnings: {n_short} corti, {n_long} lunghi da spezzare") + chunks = _fix_warnings(chunks, report) n_after = len(chunks) print(f"\n Totale chunk: {n_before} → {n_after}") @@ -315,11 +361,11 @@ def fix_stem(stem: str, project_root: Path, max_chars: int, dry_run: bool) -> bo final_verdict = report.get("verdict", "?") if final_verdict == "ok": - print(f" ✅ Verdict finale: ok — procedi alla vettorizzazione.") + print(f" ✅ Verdict finale: ok - procedi alla vettorizzazione.") elif final_verdict == "warnings_only": - print(f" 🟡 Verdict finale: warnings_only — puoi procedere.") + print(f" 🟡 Verdict finale: warnings_only - puoi procedere.") else: - print(f" 🔴 Verdict finale: {final_verdict} — rilancia la verifica manualmente:") + print(f" 🔴 Verdict finale: {final_verdict} - rilancia la verifica manualmente:") print(f" python chunks/verify_chunks.py --stem {stem}") return True @@ -341,7 +387,11 @@ if __name__ == "__main__": "--dry-run", action="store_true", help="Mostra le operazioni pianificate senza applicarle" ) + parser.add_argument( + "--max-iter", type=int, default=10, metavar="N", + help="Numero massimo di iterazioni automatiche (default: 10)" + ) args = parser.parse_args() - ok = fix_stem(args.stem, project_root, args.max, args.dry_run) + ok = fix_stem(args.stem, project_root, args.max, args.dry_run, args.max_iter) sys.exit(0 if ok else 1) diff --git a/chunks/verify_chunks.py b/chunks/verify_chunks.py index 561638a..95e6aad 100644 --- a/chunks/verify_chunks.py +++ b/chunks/verify_chunks.py @@ -31,16 +31,28 @@ import config as cfg MIN_CHARS = int(cfg.TARGET_CHARS * (1 - cfg.CHUNK_TOLERANCE)) MAX_CHARS = int(cfg.TARGET_CHARS * (1 + cfg.CHUNK_TOLERANCE)) PUNCT_END = re.compile( - r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$" + r"[.!?\xbb)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$" r"|/$" # URL che finisce con / r"|\|$" # riga di tabella Markdown + r"|;$" # fine clausola legale (testo giuridico) r"|:$" # introduzione a lista o formula ) _HEX_END = re.compile(r"[0-9a-fA-F]{8,}$") _URL_TAIL = re.compile(r"https?://\S+(\s+\S+){0,3}$") # URL con fino a 3 token extra _MATH_SYMS = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]") +_ROMAN_END = re.compile(r"\b(I{1,3}|IV|VI{0,3}|IX|XI{0,2}|XIV|XV|XVI{0,2}|XIX|XX{0,2})$") + +def _load_thresholds(stem_dir: "Path") -> "tuple[int, int]": + """Legge min/max da meta.json (scritto dal chunker) o usa i default da config.""" + meta = stem_dir / "meta.json" + if meta.exists(): + import json as _json + m = _json.loads(meta.read_text(encoding="utf-8")) + return m["min_chars"], m["max_chars"] + return MIN_CHARS, MAX_CHARS + # ─── Checks ─────────────────────────────────────────────────────────────────── def has_prefix(chunk: dict) -> bool: @@ -70,6 +82,8 @@ def ends_incomplete(chunk: dict) -> bool: return False if _HEX_END.search(text_check): # hash SHA / codice hex return False + if _ROMAN_END.search(text_check): # numero romano finale (indice/riferimento PDF) + return False if _URL_TAIL.search(text_check[-200:]): # URL (con eventuale path dopo spazio) return False return True @@ -90,7 +104,9 @@ def _fmt_chunk(c: dict) -> str: def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -> bool: - chunks_path = project_root / "chunks" / stem / "chunks.json" + stem_dir = project_root / "chunks" / stem + chunks_path = stem_dir / "chunks.json" + min_chars, max_chars = _load_thresholds(stem_dir) print(f"\nDocumento: {stem}")