64dc403e80
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF) - Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle) - Aggiunge spinner animato (thread) durante conversione opendataloader-pdf - Aggiunge progresso step-by-step [i/37] per apply_transforms via callback - Mostra punteggio qualità (score/100 grade) a fine elaborazione - Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline) - Fix: report.py importa regex da _constants invece di ridefinirle - Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
110 lines
3.2 KiB
Python
110 lines
3.2 KiB
Python
"""Trasformazioni sul testo: merge paragrafi, whitespace, poesia, versi."""
|
|
import re
|
|
|
|
from ._constants import _VERSE_NUM_RE
|
|
|
|
|
|
def _t_merge_paragraphs(text: str) -> tuple[str, int]:
|
|
_SENTENCE_END = set(".?!\xbb)\"'")
|
|
blocks = text.split("\n\n")
|
|
merged = []
|
|
count = 0
|
|
i = 0
|
|
while i < len(blocks):
|
|
b = blocks[i]
|
|
stripped = b.strip()
|
|
while (
|
|
i + 1 < len(blocks)
|
|
and stripped
|
|
and not stripped.startswith("#")
|
|
and not stripped.startswith("|")
|
|
and stripped[-1] not in _SENTENCE_END
|
|
):
|
|
nxt = blocks[i + 1].strip()
|
|
if (
|
|
not nxt
|
|
or nxt.startswith("#")
|
|
or nxt.startswith("|")
|
|
or re.match(r"^\d+\.", nxt)
|
|
or re.match(r"^[-*+]\s", nxt)
|
|
):
|
|
break
|
|
b = stripped + " " + nxt
|
|
stripped = b.strip()
|
|
count += 1
|
|
i += 1
|
|
merged.append(b)
|
|
i += 1
|
|
text = "\n\n".join(merged)
|
|
text = re.sub(r"(?m)^\|---\|\s*", "", text)
|
|
return text, count
|
|
|
|
|
|
def _t_normalize_whitespace(text: str) -> tuple[str, int]:
|
|
lines = text.split("\n")
|
|
text = "\n".join(
|
|
re.sub(r" +", " ", line) if line.strip() else line
|
|
for line in lines
|
|
)
|
|
return text, 0
|
|
|
|
|
|
def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
|
|
return re.sub(r"\n{3,}", "\n\n", text), 0
|
|
|
|
|
|
def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
    """Split flattened verse back into lines using embedded verse numbers.

    The text is processed one blank-line-separated block at a time. Empty
    blocks and blocks starting with ``#`` are passed through untouched. For
    the rest, ``_VERSE_NUM_RE`` is searched in the stripped block; group 2 of
    each match is parsed as an integer verse number and group 1 is the text
    that is kept. A block is rewritten only when:

    * it has at least two matches,
    * the differences between consecutive numbers take at most two distinct
      values, and
    * the first difference lies between 1 and 5 inclusive.

    In a rewritten block every match is replaced by its right-stripped
    group 1 followed by a newline -- or by a blank line when the verse number
    is a multiple of three times the first difference.

    Returns:
        ``(new_text, replaced)`` where ``replaced`` sums the number of
        matches in each block whose text actually changed.
    """
    replaced = 0
    rebuilt: list[str] = []

    for chunk in text.split("\n\n"):
        body = chunk.strip()
        if body == "" or body.startswith("#"):
            rebuilt.append(chunk)
            continue

        hits = list(_VERSE_NUM_RE.finditer(body))
        if len(hits) < 2:
            rebuilt.append(chunk)
            continue

        numbers = [int(hit.group(2)) for hit in hits]
        gaps = [later - earlier for earlier, later in zip(numbers, numbers[1:])]
        stride = gaps[0]
        if len(set(gaps)) > 2 or stride < 1 or stride > 5:
            rebuilt.append(chunk)
            continue

        stanza_every = stride * 3

        def _break_after(hit: re.Match, stanza_every: int = stanza_every) -> str:
            # Blank line on stanza boundaries, plain line break otherwise.
            number = int(hit.group(2))
            joiner = "\n" if number % stanza_every else "\n\n"
            return hit.group(1).rstrip() + joiner

        rewritten = _VERSE_NUM_RE.sub(_break_after, body)
        if rewritten != body:
            replaced += len(hits)
        # Blocks that reach this point are emitted in their stripped form.
        rebuilt.append(rewritten)

    return "\n\n".join(rebuilt), replaced
|
|
|
|
|
|
def _t_demote_verse_headers(text: str) -> tuple[str, int]:
|
|
count = 0
|
|
|
|
def _demote(m: re.Match) -> str:
|
|
nonlocal count
|
|
hashes, content = m.group(1), m.group(2).strip()
|
|
if not re.search(r"\s\d{1,4}\s*$", content):
|
|
return m.group(0)
|
|
inner = re.sub(r"\s\d{1,4}\s*$", "", content)
|
|
if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab""]', inner):
|
|
return m.group(0)
|
|
count += 1
|
|
clean = re.sub(r"\s\d{1,4}\s*$", "", content)
|
|
return clean
|
|
|
|
text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE)
|
|
return text, count
|