feat(chunks): sentence-boundary flush, math incomplete detection, structure profile export

- chunker: estrai _flush_chunk() con estensione al confine di frase (max 120%)
- verify: rileva chunk matematici incompleti come warning, gestisci hash hex e URL
- conversione: esporta structure_profile.json nell'output dir
2026-04-20 12:27:58 +02:00
parent 995a8be735
commit fe0ecc24ad
3 changed files with 84 additions and 20 deletions
@@ -25,7 +25,15 @@ from pathlib import Path

 MIN_CHARS = 200
 MAX_CHARS = 800
-PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026-]$")
+PUNCT_END = re.compile(
+    r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
+    r"|/$"    # URL che finisce con /
+    r"|\|$"   # riga di tabella Markdown
+    r"|:$"    # introduzione a lista o formula
+)
+_HEX_END     = re.compile(r"[0-9a-fA-F]{8,}$")
+_URL_TAIL    = re.compile(r"https?://\S+(\s+\S+){0,3}$")  # URL con fino a 3 token extra
+_MATH_SYMS   = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")


 # ─── Checks ───────────────────────────────────────────────────────────────────
@@ -53,7 +61,18 @@ def ends_incomplete(chunk: dict) -> bool:
    text_check = re.sub(r"[_*]+$", "", text).rstrip()
    if not text_check:
        return False
-    return not PUNCT_END.search(text_check)
+    if PUNCT_END.search(text_check):
+        return False
+    if _HEX_END.search(text_check):   # hash SHA / codice hex
+        return False
+    if _URL_TAIL.search(text_check[-200:]):  # URL (con eventuale path dopo spazio)
+        return False
+    return True
+
+
+def is_math_incomplete(chunk: dict) -> bool:
+    """Incompleto ma in contesto matematico — degrada a warning invece di blocker."""
+    return ends_incomplete(chunk) and len(_MATH_SYMS.findall(chunk.get("text", ""))) >= 3


 # ─── Report ───────────────────────────────────────────────────────────────────
@@ -83,11 +102,13 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -

    # ── Raccogli problemi ──────────────────────────────────────────────────────

-    empty_chunks = [c for c in chunks if is_empty(c)]
-    no_prefix    = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
-    too_short    = [c for c in chunks if is_too_short(c, min_chars)]
-    too_long     = [c for c in chunks if is_too_long(c, max_chars)]
-    incomplete   = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
+    empty_chunks      = [c for c in chunks if is_empty(c)]
+    no_prefix         = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
+    too_short         = [c for c in chunks if is_too_short(c, min_chars)]
+    too_long          = [c for c in chunks if is_too_long(c, max_chars)]
+    _incomplete_all   = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
+    incomplete_math   = [c for c in _incomplete_all if is_math_incomplete(c)]
+    incomplete        = [c for c in _incomplete_all if not is_math_incomplete(c)]

    # ── Statistiche ───────────────────────────────────────────────────────────

@@ -166,10 +187,20 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
            print(f"  ... e altri {len(incomplete) - 5}")
        print(f"  → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")

+    if incomplete_math:
+        has_errors = True
+        print(f"\n  🟡 {len(incomplete_math)} chunk MATEMATICI SENZA PUNTEGGIATURA (formula/espressione):")
+        for c in incomplete_math[:3]:
+            last_line = c.get("text", "").rstrip().split("\n")[-1][-80:]
+            print(f"  [{c.get('chunk_id', '?')}] ...{last_line!r}")
+        if len(incomplete_math) > 3:
+            print(f"  ... e altri {len(incomplete_math) - 3}")
+        print(f"  → Le formule non finiscono con punteggiatura — avviso non bloccante")
+
    # ── Costruisci e salva report.json ────────────────────────────────────────

    blockers = empty_chunks + no_prefix + incomplete
-    warnings = too_short + too_long
+    warnings = too_short + too_long + incomplete_math

    def _chunk_entry(c: dict) -> dict:
        return {
@@ -201,8 +232,9 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
            "incomplete": [_chunk_entry(c) for c in incomplete],
        },
        "warnings": {
-            "too_short": [_chunk_entry(c) for c in too_short],
-            "too_long":  [_chunk_entry(c) for c in too_long],
+            "too_short":       [_chunk_entry(c) for c in too_short],
+            "too_long":        [_chunk_entry(c) for c in too_long],
+            "incomplete_math": [_chunk_entry(c) for c in incomplete_math],
        },
    }