fix(verify): riconosce URL www. come terminatori validi + doc multi-documento
- _URL_TAIL ora matcha anche www.* (non solo https://) — evita falsi blockers su watermark tipo www.docsity.com - README: documenta --collection / --stems per ingestion, retrieve e rag Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -38,7 +38,7 @@ PUNCT_END = re.compile(
|
||||
r"|:$" # introduzione a lista o formula
|
||||
)
|
||||
_HEX_END = re.compile(r"[0-9a-fA-F]{8,}$")
|
||||
_URL_TAIL = re.compile(r"https?://\S+(\s+\S+){0,3}$") # URL con fino a 3 token extra
|
||||
_URL_TAIL = re.compile(r"(https?://|www\.)\S+(\s+\S+){0,3}$") # URL con fino a 3 token extra
|
||||
_MATH_SYMS = re.compile(r"[∈∑≤≥≠∀∃∫√∞∂±×÷→←↔⊂⊃⊆⊇∩∪·°]")
|
||||
_ROMAN_END = re.compile(r"\b(I{1,3}|IV|VI{0,3}|IX|XI{0,2}|XIV|XV|XVI{0,2}|XIX|XX{0,2})$")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user