Files
rag-from-scratch/conversione/_pipeline/_encoding.py
T
davide 64dc403e80 refactor: ottimizza pipeline PDF→Markdown — struttura piatta e verbosità
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF)
- Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle)
- Aggiunge spinner animato (thread) durante conversione opendataloader-pdf
- Aggiunge progresso step-by-step [i/37] per apply_transforms via callback
- Mostra punteggio qualità (score/100 grade) a fine elaborazione
- Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline)
- Fix: report.py importa regex da _constants invece di ridefinirle
- Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 14:30:41 +02:00

46 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Trasformazioni di encoding: PUA font Symbol, accenti LaTeX, simboli SI."""
import re
from ._constants import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE
def _t_fix_symbol_font(text: str) -> tuple[str, int]:
    """Substitute every ``_SYMBOL_PUA_RE`` match using ``_SYMBOL_PUA_MAP``.

    Each matched string is looked up in ``_SYMBOL_PUA_MAP`` and replaced by
    the mapped value. (Per the module docstring these are Symbol-font
    private-use characters; the actual table lives in ``_constants``.)

    Args:
        text: The text to clean.

    Returns:
        A ``(new_text, replacements)`` pair, where ``replacements`` is the
        number of matches that were substituted.

    Raises:
        KeyError: If the pattern matches a string that has no entry in
            ``_SYMBOL_PUA_MAP``.
    """
    # subn performs the substitution and reports how many matches it replaced,
    # so no hand-maintained counter is needed.
    return _SYMBOL_PUA_RE.subn(
        lambda hit: _SYMBOL_PUA_MAP[hit.group(0)],
        text,
    )
def _t_fix_accents(text: str) -> tuple[str, int]:
_ACCENT_MAP = {
"e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0",
"u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc",
"o": "\xf2", "O": "\xd2",
}
n_bt_before = text.count("`")
text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text)
text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
n_accenti = n_bt_before - text.count("`")
n_bt_orfani = text.count("`")
if n_bt_orfani:
text = re.sub(r"`", "", text)
n_accenti += n_bt_orfani
return text, n_accenti
def _t_fix_multiplication(text: str) -> tuple[str, int]:
n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text)
return text, n
def _t_fix_micro(text: str) -> tuple[str, int]:
_SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]'
n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text)
return text, n