diff --git a/step-6/verify_chunks.py b/step-6/verify_chunks.py
index 53f676f..d8eb125 100644
--- a/step-6/verify_chunks.py
+++ b/step-6/verify_chunks.py
@@ -26,7 +26,7 @@ from pathlib import Path
 
 MIN_CHARS = 200
 MAX_CHARS = 800
-PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013-]$")
+PUNCT_END = re.compile("[.!?»)\\]'\u2019\"\u201c\u201d\u2018\u2014\u2013\u2026-]$")
 
 
 # ─── Checks ───────────────────────────────────────────────────────────────────
@@ -53,8 +53,12 @@ def ends_incomplete(chunk: dict) -> bool:
     text = chunk.get("text", "").rstrip()
     if not text:
         return False
-    # Controlla l'ultimo carattere non-whitespace
-    return not PUNCT_END.search(text)
+    # Strip trailing markdown markers (_ and *) before checking:
+    # patterns such as _word._ or _word!_ are complete sentences.
+    text_check = re.sub(r"[_*]+$", "", text).rstrip()
+    if not text_check:
+        return False
+    return not PUNCT_END.search(text_check)
 
 
 # ─── Report ───────────────────────────────────────────────────────────────────