Files
rag-from-scratch/conversione/_pipeline/_text.py
T
davide 64dc403e80 refactor: ottimizza pipeline PDF→Markdown — struttura piatta e verbosità
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF)
- Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle)
- Aggiunge spinner animato (thread) durante conversione opendataloader-pdf
- Aggiunge progresso step-by-step [i/37] per apply_transforms via callback
- Mostra punteggio qualità (score/100 grade) a fine elaborazione
- Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline)
- Fix: report.py importa regex da _constants invece di ridefinirle
- Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 14:30:41 +02:00

110 lines
3.2 KiB
Python

"""Trasformazioni sul testo: merge paragrafi, whitespace, poesia, versi."""
import re
from ._constants import _VERSE_NUM_RE
def _t_merge_paragraphs(text: str) -> tuple[str, int]:
_SENTENCE_END = set(".?!\xbb)\"'")
blocks = text.split("\n\n")
merged = []
count = 0
i = 0
while i < len(blocks):
b = blocks[i]
stripped = b.strip()
while (
i + 1 < len(blocks)
and stripped
and not stripped.startswith("#")
and not stripped.startswith("|")
and stripped[-1] not in _SENTENCE_END
):
nxt = blocks[i + 1].strip()
if (
not nxt
or nxt.startswith("#")
or nxt.startswith("|")
or re.match(r"^\d+\.", nxt)
or re.match(r"^[-*+]\s", nxt)
):
break
b = stripped + " " + nxt
stripped = b.strip()
count += 1
i += 1
merged.append(b)
i += 1
text = "\n\n".join(merged)
text = re.sub(r"(?m)^\|---\|\s*", "", text)
return text, count
def _t_normalize_whitespace(text: str) -> tuple[str, int]:
lines = text.split("\n")
text = "\n".join(
re.sub(r" +", " ", line) if line.strip() else line
for line in lines
)
return text, 0
def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
return re.sub(r"\n{3,}", "\n\n", text), 0
def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
count = 0
blocks = text.split("\n\n")
result = []
for block in blocks:
stripped = block.strip()
if not stripped or stripped.startswith("#"):
result.append(block)
continue
matches = list(_VERSE_NUM_RE.finditer(stripped))
if len(matches) < 2:
result.append(block)
continue
nums = [int(m.group(2)) for m in matches]
diffs = [nums[i + 1] - nums[i] for i in range(len(nums) - 1)]
if not diffs or len(set(diffs)) > 2 or not (1 <= diffs[0] <= 5):
result.append(block)
continue
step = diffs[0]
def _replace_verse_num(m: re.Match) -> str:
n = int(m.group(2))
sep = "\n\n" if n % (step * 3) == 0 else "\n"
return m.group(1).rstrip() + sep
new_block = _VERSE_NUM_RE.sub(_replace_verse_num, stripped)
if new_block != stripped:
count += len(matches)
result.append(new_block)
return "\n\n".join(result), count
def _t_demote_verse_headers(text: str) -> tuple[str, int]:
count = 0
def _demote(m: re.Match) -> str:
nonlocal count
hashes, content = m.group(1), m.group(2).strip()
if not re.search(r"\s\d{1,4}\s*$", content):
return m.group(0)
inner = re.sub(r"\s\d{1,4}\s*$", "", content)
if not re.search(r'[,;:.!?\xbb"\'][\ ]+[A-Za-z\xc0-\xff\xab""]', inner):
return m.group(0)
count += 1
clean = re.sub(r"\s\d{1,4}\s*$", "", content)
return clean
text = re.sub(r"^(#{1,6})\s+(.{20,})$", _demote, text, flags=re.MULTILINE)
return text, count