feat(chunks): sentence-boundary flush, math incomplete detection, structure profile export
- chunker: estrai _flush_chunk() con estensione al confine di frase (max 120%)
- verify: rileva chunk matematici incompleti come warning, gestisci hash hex e URL
- conversione: esporta structure_profile.json nell'output dir
This commit is contained in:
+42
-10
@@ -25,7 +25,15 @@ from pathlib import Path
|
||||
|
||||
MIN_CHARS = 200
|
||||
MAX_CHARS = 800
|
||||
PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026-]$")
|
||||
PUNCT_END = re.compile(
|
||||
r"[.!?»)\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026]$"
|
||||
r"|/$" # URL che finisce con /
|
||||
r"|\|$" # riga di tabella Markdown
|
||||
r"|:$" # introduzione a lista o formula
|
||||
)
|
||||
_HEX_END = re.compile(r"[0-9a-fA-F]{8,}$")
|
||||
_URL_TAIL = re.compile(r"https?://\S+(\s+\S+){0,3}$") # URL con fino a 3 token extra
|
||||
_MATH_SYMS = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")
|
||||
|
||||
|
||||
# ─── Checks ───────────────────────────────────────────────────────────────────
|
||||
@@ -53,7 +61,18 @@ def ends_incomplete(chunk: dict) -> bool:
|
||||
text_check = re.sub(r"[_*]+$", "", text).rstrip()
|
||||
if not text_check:
|
||||
return False
|
||||
return not PUNCT_END.search(text_check)
|
||||
if PUNCT_END.search(text_check):
|
||||
return False
|
||||
if _HEX_END.search(text_check): # hash SHA / codice hex
|
||||
return False
|
||||
if _URL_TAIL.search(text_check[-200:]): # URL (con eventuale path dopo spazio)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def is_math_incomplete(chunk: dict) -> bool:
    """Return True for a chunk that ends incomplete but sits in a math context.

    A chunk qualifies when ``ends_incomplete`` flags it and its ``"text"``
    field contains at least three characters matched by ``_MATH_SYMS``.
    Such chunks are meant to be downgraded to a warning instead of a blocker.
    """
    # Guard clause: a chunk that ends properly is never "math incomplete".
    if not ends_incomplete(chunk):
        return False
    # Count every math-symbol occurrence across the whole chunk text.
    symbol_hits = _MATH_SYMS.findall(chunk.get("text", ""))
    return len(symbol_hits) >= 3
|
||||
|
||||
|
||||
# ─── Report ───────────────────────────────────────────────────────────────────
|
||||
@@ -83,11 +102,13 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
|
||||
|
||||
# ── Raccogli problemi ──────────────────────────────────────────────────────
|
||||
|
||||
empty_chunks = [c for c in chunks if is_empty(c)]
|
||||
no_prefix = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
|
||||
too_short = [c for c in chunks if is_too_short(c, min_chars)]
|
||||
too_long = [c for c in chunks if is_too_long(c, max_chars)]
|
||||
incomplete = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
|
||||
empty_chunks = [c for c in chunks if is_empty(c)]
|
||||
no_prefix = [c for c in chunks if not is_empty(c) and not has_prefix(c)]
|
||||
too_short = [c for c in chunks if is_too_short(c, min_chars)]
|
||||
too_long = [c for c in chunks if is_too_long(c, max_chars)]
|
||||
_incomplete_all = [c for c in chunks if not is_empty(c) and ends_incomplete(c)]
|
||||
incomplete_math = [c for c in _incomplete_all if is_math_incomplete(c)]
|
||||
incomplete = [c for c in _incomplete_all if not is_math_incomplete(c)]
|
||||
|
||||
# ── Statistiche ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -166,10 +187,20 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
|
||||
print(f" ... e altri {len(incomplete) - 5}")
|
||||
print(f" → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")
|
||||
|
||||
if incomplete_math:
|
||||
has_errors = True
|
||||
print(f"\n 🟡 {len(incomplete_math)} chunk MATEMATICI SENZA PUNTEGGIATURA (formula/espressione):")
|
||||
for c in incomplete_math[:3]:
|
||||
last_line = c.get("text", "").rstrip().split("\n")[-1][-80:]
|
||||
print(f" [{c.get('chunk_id', '?')}] ...{last_line!r}")
|
||||
if len(incomplete_math) > 3:
|
||||
print(f" ... e altri {len(incomplete_math) - 3}")
|
||||
print(f" → Le formule non finiscono con punteggiatura — avviso non bloccante")
|
||||
|
||||
# ── Costruisci e salva report.json ────────────────────────────────────────
|
||||
|
||||
blockers = empty_chunks + no_prefix + incomplete
|
||||
warnings = too_short + too_long
|
||||
warnings = too_short + too_long + incomplete_math
|
||||
|
||||
def _chunk_entry(c: dict) -> dict:
|
||||
return {
|
||||
@@ -201,8 +232,9 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
|
||||
"incomplete": [_chunk_entry(c) for c in incomplete],
|
||||
},
|
||||
"warnings": {
|
||||
"too_short": [_chunk_entry(c) for c in too_short],
|
||||
"too_long": [_chunk_entry(c) for c in too_long],
|
||||
"too_short": [_chunk_entry(c) for c in too_short],
|
||||
"too_long": [_chunk_entry(c) for c in too_long],
|
||||
"incomplete_math": [_chunk_entry(c) for c in incomplete_math],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user