64dc403e80
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF) - Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle) - Aggiunge spinner animato (thread) durante conversione opendataloader-pdf - Aggiunge progresso step-by-step [i/37] per apply_transforms via callback - Mostra punteggio qualità (score/100 grade) a fine elaborazione - Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline) - Fix: report.py importa regex da _constants invece di ridefinirle - Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
117 lines
3.9 KiB
Python
117 lines
3.9 KiB
Python
"""Trasformazioni di rifinitura: header vuoti, garbage, demozione formula-header, frontmatter."""
|
||
import re
|
||
|
||
from ._constants import (
|
||
_FM_RE, _MATH_HDR_RE, _MATH_SYMBOLS_RE,
|
||
_EXERCISE_TRIGGER_RE, _NUMBERED_PREFIX_RE,
|
||
)
|
||
from ._helpers import _merge_title_headers
|
||
|
||
|
||
def _t_remove_empty_headers(text: str) -> tuple[str, int]:
|
||
blocks = re.split(r"\n{2,}", text)
|
||
cleaned = []
|
||
for i, block in enumerate(blocks):
|
||
stripped = block.strip()
|
||
if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped:
|
||
next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
|
||
next_is_long_hdr = (
|
||
re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80
|
||
)
|
||
if not next_stripped or (
|
||
re.match(r"^#{1,6} ", next_stripped) and not next_is_long_hdr
|
||
):
|
||
continue
|
||
cleaned.append(block)
|
||
return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0
|
||
|
||
|
||
def _t_merge_title_headers(text: str) -> tuple[str, int]:
    """Delegate to ``_merge_title_headers`` and return its result unchanged.

    NOTE(review): the helper lives in ``._helpers`` and is not visible here;
    the ``(text, count)`` result shape is taken from this function's
    annotation -- confirm against the helper.
    """
    merged = _merge_title_headers(text)
    return merged
|
||
|
||
|
||
def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
|
||
def _is_garbage(content: str) -> bool:
|
||
if content.lstrip().startswith("..."):
|
||
return True
|
||
if not re.search(r"[A-Za-z\xc0-\xffΑ-ω]{2,}", content):
|
||
return True
|
||
if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
|
||
return True
|
||
if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
|
||
return True
|
||
first_alpha = next((c for c in content if c.isalpha()), None)
|
||
if first_alpha and first_alpha.islower() and len(content) > 40:
|
||
return True
|
||
if re.match(r"^[A-Za-zΑ-ω_]{1,3}\s*[=<>≤≥]", content.strip()):
|
||
return True
|
||
if re.match(
|
||
r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d",
|
||
content.strip(), re.IGNORECASE,
|
||
):
|
||
return True
|
||
return False
|
||
|
||
count = 0
|
||
lines = text.split("\n")
|
||
new_lines = []
|
||
for line in lines:
|
||
m = re.match(r"^#{1,6} (.+)$", line)
|
||
if m and _is_garbage(m.group(1)):
|
||
count += 1
|
||
continue
|
||
new_lines.append(line)
|
||
text = "\n".join(new_lines)
|
||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||
return text, count
|
||
|
||
|
||
def _t_math_header_demotion(text: str) -> tuple[str, int]:
    """Turn over-long header lines that look like formulas/exercises into text.

    A line is rewritten only when all of the following hold:
    it matches ``_MATH_HDR_RE``; group 2 of that match is longer than
    100 characters; and that group either yields at least three
    ``_MATH_SYMBOLS_RE`` matches or contains an ``_EXERCISE_TRIGGER_RE`` match.
    The rewritten line is group 2 alone, or ``**<g1>** <g2>`` built from the
    groups of ``_NUMBERED_PREFIX_RE`` when that pattern matches group 2.

    NOTE(review): the four patterns come from ``._constants`` and are not
    visible here; group 2 of ``_MATH_HDR_RE`` is presumably the header text
    without the ``#`` marker -- confirm.

    Args:
        text: Markdown text to process.

    Returns:
        ``(new_text, demoted)``: the rewritten text and the number of lines
        that were changed.
    """

    def _demoted_form(line):
        """Return the replacement for ``line``, or None to keep it as is."""
        hdr = _MATH_HDR_RE.match(line)
        if hdr is None:
            return None
        body = hdr.group(2)
        if len(body) <= 100:
            return None
        symbol_hits = _MATH_SYMBOLS_RE.findall(body)
        looks_formula = len(symbol_hits) >= 3
        looks_exercise = _EXERCISE_TRIGGER_RE.search(body) is not None
        if not looks_formula and not looks_exercise:
            return None
        numbered = _NUMBERED_PREFIX_RE.match(body)
        if numbered is None:
            return body
        return f"**{numbered.group(1)}** {numbered.group(2)}"

    out = []
    demoted = 0
    for line in text.split("\n"):
        replacement = _demoted_form(line)
        if replacement is None:
            out.append(line)
        else:
            out.append(replacement)
            demoted += 1
    return "\n".join(out), demoted
|
||
|
||
|
||
def _t_remove_frontmatter(text: str) -> tuple[str, int]:
    """Remove level-3 header blocks flagged by ``_FM_RE`` near the start.

    The text is split into blocks on runs of two or more newlines. Only the
    first ``max(5, min(15, int(0.20 * n_blocks)))`` blocks are inspected.
    Within that window, a block starting with ``### `` (but not ``### ``
    followed by a digit) is dropped when ``_FM_RE`` matches the block itself,
    or matches the following block and that block is shorter than
    250 characters. Only the header block is dropped; the following block
    is kept.

    NOTE(review): ``_FM_RE`` is defined in ``._constants`` and not visible
    here; it presumably matches front-matter wording -- confirm.

    Args:
        text: Markdown text to clean.

    Returns:
        ``(cleaned_text, removed)``: remaining blocks re-joined with one blank
        line (runs of 3+ newlines collapsed to 2) and the number of blocks
        dropped.
    """
    blocks = re.split(r"\n{2,}", text)
    n_blocks = len(blocks)
    window = max(5, min(15, int(n_blocks * 0.20)))

    kept = []
    removed = 0
    for idx, block in enumerate(blocks):
        head = block.strip()
        is_candidate = (
            idx < window
            and re.match(r"^### ", head) is not None
            and re.match(r"^### \d", head) is None
        )
        if is_candidate:
            following = blocks[idx + 1].strip() if idx + 1 < n_blocks else ""
            short_fm_body = len(following) < 250 and _FM_RE.search(following)
            if short_fm_body or _FM_RE.search(head):
                removed += 1
                continue
        kept.append(block)

    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), removed
|