64dc403e80
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF) - Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle) - Aggiunge spinner animato (thread) durante conversione opendataloader-pdf - Aggiunge progresso step-by-step [i/37] per apply_transforms via callback - Mostra punteggio qualità (score/100 grade) a fine elaborazione - Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline) - Fix: report.py importa regex da _constants invece di ridefinirle - Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
111 lines
3.4 KiB
Python
111 lines
3.4 KiB
Python
"""Trasformazioni sulla struttura degli header: normalizzazione livelli, concat, bold."""
|
||
import re
|
||
|
||
from ._constants import _NUMBERED_HDR_RE
|
||
from ._helpers import _sentence_case
|
||
|
||
|
||
def _t_fix_header_concat(text: str) -> tuple[str, int]:
    """Split headers that have body text glued onto the end of the title.

    Only level 2-6 Markdown headers whose content is at least 40 characters
    long are inspected. When the stripped content is 60+ characters long and
    contains a lowercase letter immediately followed by an uppercase letter
    (ASCII or one of the accented letters listed in the pattern), the header
    is cut at the first such boundary found after a short leading offset:
    the left part stays as the header, the right part becomes a separate
    paragraph.

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the number of headers that were split.
    """
    # Zero-width position between a lowercase and an uppercase letter.
    boundary_re = re.compile(
        r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa\xe4])"
        r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])"
    )
    splits = 0

    def _split_header(match: re.Match) -> str:
        nonlocal splits
        untouched = match.group(0)
        content = match.group(2).strip()
        if len(content) < 60:
            return untouched

        # Search a slice (not the full string) so the lookbehind cannot see
        # the characters before the offset; presumably the offset keeps a
        # boundary inside the opening word(s) from being used as cut point.
        offset = min(10, len(content) // 3)
        boundary = boundary_re.search(content[offset:])
        if boundary is None:
            return untouched

        cut = offset + boundary.start()
        heading = content[:cut].strip()
        paragraph = content[cut:].strip()
        # Reject cuts that leave a degenerate title or body.
        if len(heading) < 5 or len(paragraph) < 15:
            return untouched

        splits += 1
        return f"{match.group(1)} {heading}\n\n{paragraph}"

    result = re.sub(
        r"^(#{2,6})\s+(.{40,})$", _split_header, text, flags=re.MULTILINE
    )
    return result, splits
|
||
|
||
|
||
def _t_extract_capitolo(text: str) -> tuple[str, int]:
    """Promote inline ``Capitolo N TITLE`` mentions to level-2 headers.

    A match is the word ``Capitolo``, a number, an optional ``:`` or
    whitespace separator, and an upper-case title of 6 to 81 characters that
    is followed by a dash-plus-digit, a newline, or the end of the text.
    Each match is replaced by ``## Capitolo N: <title>`` surrounded by blank
    lines, with the title passed through ``_sentence_case`` (project helper;
    presumably converts the all-caps title to sentence case).

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the number of chapter headers produced. The
        count was previously hard-coded to 0; it is now reported like the
        other counting transforms in this module.
    """

    def _as_header(m: re.Match) -> str:
        num = m.group(1)
        # Drop surrounding whitespace and trailing dash/space residue
        # before re-casing the title.
        titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
        return f"\n\n## Capitolo {num}: {titolo}\n\n"

    # subn reports how many replacements were made; every replacement starts
    # with newlines and "## ", so each one is a real modification.
    text, count = re.subn(
        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L]"
        r"[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\s\'\.,\(\)]{5,80}?)"
        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
        _as_header,
        text,
    )
    return text, count
|
||
|
||
|
||
def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
    """Make header levels of numbered headings follow their numbering depth.

    Every match of ``_NUMBERED_HDR_RE`` is read as (group 1: hash prefix,
    group 2: dotted number, group 3: title). The numbering depth is the
    number of dots in group 2 plus one. The shallowest depth found is
    anchored to the smallest header level used at that depth, and each extra
    depth step adds one level, capped at 6.

    Nothing is changed when there are no numbered headings or when all of
    them share the same depth.

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the number of headings whose level changed.
    """
    # One (numbering depth, current header level) pair per numbered heading.
    observed = [
        (hit.group(2).count(".") + 1, len(hit.group(1)))
        for hit in _NUMBERED_HDR_RE.finditer(text)
    ]
    if not observed:
        return text, 0

    shallowest = min(depth for depth, _ in observed)
    deepest = max(depth for depth, _ in observed)
    if shallowest == deepest:
        # A single depth gives no hierarchy to derive levels from.
        return text, 0

    anchor_level = min(level for depth, level in observed if depth == shallowest)
    changed = 0

    def _relevel(hit: re.Match) -> str:
        nonlocal changed
        marks, number, heading = hit.group(1, 2, 3)
        steps_down = number.count(".") + 1 - shallowest
        target = min(anchor_level + steps_down, 6)
        if target == len(marks):
            # Already at the right level: keep the line exactly as written.
            return hit.group(0)
        changed += 1
        return f"{'#' * target} {number}. {heading}"

    updated = _NUMBERED_HDR_RE.sub(_relevel, text)
    return updated, changed
|
||
|
||
|
||
def _t_normalize_header_levels(text: str) -> tuple[str, int]:
    """Flatten deep headers to level 3 and tidy simple numbered ones.

    Three passes run in order:

    1. header lines made only of 3-6 ``#`` with no title are deleted;
    2. level 3-6 headers of the form ``<hashes> N title`` (N is 1-3 digits)
       become ``### N. title``;
    3. every remaining level 4-6 header is rewritten as level 3.

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the total number of substitutions made across
        the three passes. The count was previously hard-coded to 0; each
        pass always alters the line it matches, so the sum is the number of
        real modifications.
    """
    text, emptied = re.subn(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
    text, numbered = re.subn(
        r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
        lambda m: f"### {m.group(2)}. {m.group(3)}",
        text,
        flags=re.MULTILINE,
    )
    # Headers rewritten by the previous pass are already level 3 and are
    # therefore not matched (or counted) again here.
    text, flattened = re.subn(
        r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE
    )
    return text, emptied + numbered + flattened
|
||
|
||
|
||
def _t_remove_header_bold(text: str) -> tuple[str, int]:
    """Strip ``**`` bold markers that wrap the title of a Markdown header.

    A header line is rewritten only when its content starts with ``**`` and
    the line ends with ``**`` (optionally followed by whitespace); the
    header keeps its level and is re-emitted as ``<hashes> <title>``.

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the number of headers that were un-bolded.
        The count was previously hard-coded to 0.
    """
    text, count = re.subn(
        r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
        r"\1 \2",
        text,
        flags=re.MULTILINE,
    )
    return text, count
|
||
|
||
|
||
def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
    """Re-case Markdown headers whose letters are all upper-case.

    For every ``<hashes> <content>`` header line, the alphabetic characters
    of the content are checked; if there is at least one and all of them are
    upper-case, the content is replaced by ``_sentence_case(content)``
    (project helper; presumably converts to sentence case). Headers with no
    letters or with mixed case are left untouched.

    Args:
        text: Markdown document to process.

    Returns:
        The rewritten text and the number of headers whose text actually
        changed. The count was previously hard-coded to 0.
    """
    count = 0

    def _norm(m: re.Match) -> str:
        nonlocal count
        hashes, content = m.group(1), m.group(2).strip()
        letters = [c for c in content if c.isalpha()]
        if not letters or not all(c.isupper() for c in letters):
            return m.group(0)
        replacement = f"{hashes} {_sentence_case(content)}"
        # Count only real modifications: re-casing may be a no-op for some
        # headers (e.g. a single capital letter).
        if replacement != m.group(0):
            count += 1
        return replacement

    text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE)
    return text, count
|