# Characters accepted as the last character of a finished sentence.
_SENT_BOUNDARY = re.compile(r"[.!?»)\]'\u2019\"\u201c\u201d/:|\u2026]$")
# Trailing Markdown emphasis markers, ignored when looking for the boundary.
_TRAILING_EMPHASIS = re.compile(r"[_*]+$")


def _ends_at_boundary(current: list[str]) -> bool:
    """Return True when the last non-blank sentence ends on a boundary character."""
    for sentence in reversed(current):
        # Drop trailing whitespace and emphasis markers so that "fine.**" or
        # "fine. " still count as ending with ".".
        tail = _TRAILING_EMPHASIS.sub("", sentence.rstrip()).rstrip()
        if tail:
            return bool(_SENT_BOUNDARY.search(tail))
    # No sentence with visible content: nothing closes the chunk yet.
    return False


def _flush_chunk(
    current: list[str],
    sentences: list[str],
    i: int,
    prefix: str,
    sezione: str,
    titolo: str,
    sub_index: int,
    max_chars: int,
) -> tuple[dict, list[str], int, int]:
    """Emit one chunk, extending it up to a sentence boundary (at most +20%).

    While the accumulated sentences do not end on a boundary character,
    further sentences are taken from ``sentences`` starting at index ``i``,
    as long as the accumulated length stays within ``int(max_chars * 1.2)``.
    The length is measured as each sentence's length plus one separator;
    ``prefix`` is not counted against the limit.

    Args:
        current: Sentences already collected for this chunk. The list is
            extended in place and also returned.
        sentences: All sentences of the body being chunked.
        i: Index in ``sentences`` of the next sentence not yet consumed.
        prefix: Text prepended to the joined sentences in the chunk text.
        sezione: Section name, stored in the chunk and used in its id.
        titolo: Title, stored in the chunk and used in its id.
        sub_index: Ordinal of this chunk, stored in the chunk and its id.
        max_chars: Nominal chunk size from which the hard limit is derived.

    Returns:
        A tuple ``(chunk, current, i, next_sub_index)``: the chunk dict, the
        (possibly extended) sentence list, the updated sentence index, and
        ``sub_index + 1``.
    """
    hard_limit = int(max_chars * 1.2)
    current_len = sum(len(s) + 1 for s in current)
    # Only the tail of the last sentence decides whether the chunk is closed,
    # so the whole text is not re-joined on every iteration.
    while i < len(sentences) and not _ends_at_boundary(current):
        nxt = sentences[i]
        if current_len + len(nxt) + 1 > hard_limit:
            # Taking this sentence would exceed the +20% allowance.
            break
        current.append(nxt)
        current_len += len(nxt) + 1
        i += 1
    chunk_text = prefix + " ".join(current)
    chunk = {
        "chunk_id": f"{slugify(sezione)}__{slugify(titolo)}__s{sub_index}",
        "text": chunk_text,
        "sezione": sezione,
        "titolo": titolo,
        "sub_index": sub_index,
        "n_chars": len(chunk_text),
    }
    return chunk, current, i, sub_index + 1
# Mathematical symbols whose presence marks a chunk as math context.
_MATH_SYMS = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")


def is_math_incomplete(chunk: dict) -> bool:
    """Tell whether an incomplete-looking chunk sits in a mathematical context.

    Returns True only when ``ends_incomplete(chunk)`` is true and the chunk's
    ``"text"`` contains at least three characters matched by ``_MATH_SYMS``.
    Such chunks are meant to be downgraded from blocker to warning.
    """
    if not ends_incomplete(chunk):
        return False
    body = chunk.get("text", "")
    symbol_hits = _MATH_SYMS.findall(body)
    return len(symbol_hits) >= 3
chunks if not is_empty(c) and not has_prefix(c)] + too_short = [c for c in chunks if is_too_short(c, min_chars)] + too_long = [c for c in chunks if is_too_long(c, max_chars)] + _incomplete_all = [c for c in chunks if not is_empty(c) and ends_incomplete(c)] + incomplete_math = [c for c in _incomplete_all if is_math_incomplete(c)] + incomplete = [c for c in _incomplete_all if not is_math_incomplete(c)] # ── Statistiche ─────────────────────────────────────────────────────────── @@ -166,10 +187,20 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) - print(f" ... e altri {len(incomplete) - 5}") print(f" → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md") + if incomplete_math: + has_errors = True + print(f"\n 🟡 {len(incomplete_math)} chunk MATEMATICI SENZA PUNTEGGIATURA (formula/espressione):") + for c in incomplete_math[:3]: + last_line = c.get("text", "").rstrip().split("\n")[-1][-80:] + print(f" [{c.get('chunk_id', '?')}] ...{last_line!r}") + if len(incomplete_math) > 3: + print(f" ... 
e altri {len(incomplete_math) - 3}") + print(f" → Le formule non finiscono con punteggiatura — avviso non bloccante") + # ── Costruisci e salva report.json ──────────────────────────────────────── blockers = empty_chunks + no_prefix + incomplete - warnings = too_short + too_long + warnings = too_short + too_long + incomplete_math def _chunk_entry(c: dict) -> dict: return { @@ -201,8 +232,9 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) - "incomplete": [_chunk_entry(c) for c in incomplete], }, "warnings": { - "too_short": [_chunk_entry(c) for c in too_short], - "too_long": [_chunk_entry(c) for c in too_long], + "too_short": [_chunk_entry(c) for c in too_short], + "too_long": [_chunk_entry(c) for c in too_long], + "incomplete_math": [_chunk_entry(c) for c in incomplete_math], }, } diff --git a/conversione/pipeline.py b/conversione/pipeline.py index eedf436..e657da0 100644 --- a/conversione/pipeline.py +++ b/conversione/pipeline.py @@ -1538,6 +1538,9 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" ✗ Permesso negato durante la scrittura: {e}") return False profile = analyze(clean_out) + (out_dir / "structure_profile.json").write_text( + json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8" + ) _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"} print(f" ✅ Struttura: livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")