feat(chunks): sentence-boundary flush, math incomplete detection, structure profile export

- chunker: estrai _flush_chunk() con estensione al confine di frase (max 120%)
- verify: rileva chunk matematici incompleti come warning, gestisci hash hex e URL
- conversione: esporta structure_profile.json nell'output dir
This commit is contained in:
2026-04-20 12:27:58 +02:00
parent 995a8be735
commit fe0ecc24ad
3 changed files with 84 additions and 20 deletions
+42 -10
View File
@@ -25,7 +25,15 @@ from pathlib import Path
MIN_CHARS = 200
MAX_CHARS = 800
PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026-]$")
# Endings that ends_incomplete() accepts as a properly closed chunk:
# sentence punctuation, quotes, closing brackets, em/en dashes and ellipsis,
# plus the structural endings listed in the alternatives below.
PUNCT_END = re.compile(
r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
r"|/$" # URL ending with /
r"|\|$" # Markdown table row
r"|:$" # lead-in to a list or formula
)
# Text ending in a run of at least 8 hexadecimal digits (e.g. a hash).
_HEX_END = re.compile(r"[0-9a-fA-F]{8,}$")
_URL_TAIL = re.compile(r"https?://\S+(\s+\S+){0,3}$") # URL followed by up to 3 extra tokens
# Single mathematical symbols; is_math_incomplete() counts their occurrences.
_MATH_SYMS = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")
# ─── Checks ───────────────────────────────────────────────────────────────────
@@ -53,7 +61,18 @@ def ends_incomplete(chunk: dict) -> bool:
text_check = re.sub(r"[_*]+$", "", text).rstrip()
if not text_check:
return False
return not PUNCT_END.search(text_check)
if PUNCT_END.search(text_check):
return False
if _HEX_END.search(text_check): # hash SHA / codice hex
return False
if _URL_TAIL.search(text_check[-200:]): # URL (con eventuale path dopo spazio)
return False
return True
def is_math_incomplete(chunk: dict) -> bool:
    """Tell whether an unterminated chunk sits in a mathematical context.

    A chunk qualifies when ``ends_incomplete`` flags it and its ``"text"``
    value contains at least three characters matched by ``_MATH_SYMS``.
    Such chunks are meant to be downgraded to a warning instead of a blocker.
    """
    # Guard first: only chunks already flagged as incomplete are candidates.
    if not ends_incomplete(chunk):
        return False
    math_symbol_count = len(_MATH_SYMS.findall(chunk.get("text", "")))
    return math_symbol_count >= 3
# ─── Report ───────────────────────────────────────────────────────────────────
@@ -83,11 +102,13 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
# ── Raccogli problemi ──────────────────────────────────────────────────────
empty_chunks = [c for c in chunks if is_empty(c)]
no_prefix = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
too_short = [c for c in chunks if is_too_short(c, min_chars)]
too_long = [c for c in chunks if is_too_long(c, max_chars)]
incomplete = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
empty_chunks = [c for c in chunks if is_empty(c)]
no_prefix = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
too_short = [c for c in chunks if is_too_short(c, min_chars)]
too_long = [c for c in chunks if is_too_long(c, max_chars)]
_incomplete_all = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
incomplete_math = [c for c in _incomplete_all if is_math_incomplete(c)]
incomplete = [c for c in _incomplete_all if not is_math_incomplete(c)]
# ── Statistiche ───────────────────────────────────────────────────────────
@@ -166,10 +187,20 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
print(f" ... e altri {len(incomplete) - 5}")
print(f" → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")
if incomplete_math:
has_errors = True
print(f"\n 🟡 {len(incomplete_math)} chunk MATEMATICI SENZA PUNTEGGIATURA (formula/espressione):")
for c in incomplete_math[:3]:
last_line = c.get("text", "").rstrip().split("\n")[-1][-80:]
print(f" [{c.get('chunk_id', '?')}] ...{last_line!r}")
if len(incomplete_math) > 3:
print(f" ... e altri {len(incomplete_math) - 3}")
print(f" → Le formule non finiscono con punteggiatura — avviso non bloccante")
# ── Costruisci e salva report.json ────────────────────────────────────────
blockers = empty_chunks + no_prefix + incomplete
warnings = too_short + too_long
warnings = too_short + too_long + incomplete_math
def _chunk_entry(c: dict) -> dict:
return {
@@ -201,8 +232,9 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
"incomplete": [_chunk_entry(c) for c in incomplete],
},
"warnings": {
"too_short": [_chunk_entry(c) for c in too_short],
"too_long": [_chunk_entry(c) for c in too_long],
"too_short": [_chunk_entry(c) for c in too_short],
"too_long": [_chunk_entry(c) for c in too_long],
"incomplete_math": [_chunk_entry(c) for c in incomplete_math],
},
}