rag-from-scratch/conversione/_pipeline/_encoding.py

"""Trasformazioni di encoding: PUA font Symbol, accenti LaTeX, simboli SI."""
import re

from ._constants import _SYMBOL_PUA_MAP, _SYMBOL_PUA_RE


def _t_fix_symbol_font(text: str) -> tuple[str, int]:
    """Replace every ``_SYMBOL_PUA_RE`` match with its mapped replacement.

    Each matched string is looked up verbatim in ``_SYMBOL_PUA_MAP`` (both
    objects come from ``._constants``) and swapped for the mapped value.

    Args:
        text: Input text to scan.

    Returns:
        A ``(new_text, n_replacements)`` tuple.

    Raises:
        KeyError: If the regex matches a string that is not a key of the map.
    """
    # ``subn`` reports the number of substitutions itself, so no mutable
    # counter has to be shared with the replacement callback.
    return _SYMBOL_PUA_RE.subn(lambda m: _SYMBOL_PUA_MAP[m.group(0)], text)


def _t_fix_accents(text: str) -> tuple[str, int]:
    _ACCENT_MAP = {
        "e": "\xe8", "E": "\xc8", "a": "\xe0", "A": "\xc0",
        "u": "\xf9", "U": "\xd9", "i": "\xec", "I": "\xcc",
        "o": "\xf2", "O": "\xd2",
    }
    n_bt_before = text.count("`")
    text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text)
    text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
    n_accenti   = n_bt_before - text.count("`")
    n_bt_orfani = text.count("`")
    if n_bt_orfani:
        text = re.sub(r"`", "", text)
        n_accenti += n_bt_orfani
    return text, n_accenti


def _t_fix_multiplication(text: str) -> tuple[str, int]:
    n    = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
    text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text)
    return text, n


def _t_fix_micro(text: str) -> tuple[str, int]:
    _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]'
    n    = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
    text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text)
    return text, n