conversione/_pipeline/_helpers.py

"""Funzioni helper pure condivise tra i moduli di trasformazione."""
import re

from ._constants import _ORDINALS_IT, _ORDINALS_EN


def _sentence_case(s: str) -> str:
    if not s:
        return s
    lower = s.lower()
    return lower[0].upper() + lower[1:]


def _is_allcaps_line(line: str) -> bool:
    stripped = line.strip()
    letters  = [c for c in stripped if c.isalpha()]
    return (
        len(letters) >= 3
        and all(c.isupper() for c in letters)
        and not stripped.startswith("#")
        and not stripped.startswith("|")
    )


def _allcaps_to_header(raw_line: str) -> str:
    """Convert an all-caps line into a level-2 Markdown header.

    A leading list bullet (``-``, ``*`` or ``+``) is dropped, then trailing
    dots followed by trailing question marks are removed. The cleaned text is
    tried against three shapes, in order:

    1. ``CAPITOLO <ordinal>. <title>`` -> ``## Capitolo <value> — <Title>``,
       where ``<value>`` is looked up in ``_ORDINALS_IT``.
    2. ``CHAPTER <ordinal or digits>[.] <title>`` -> ``## Chapter <n> — <Title>``,
       where ``<n>`` comes from ``_ORDINALS_EN`` or is the matched text itself.
    3. ``<roman or arabic number>. <title>`` -> ``## <number>. <Title>``.

    Anything else becomes ``## <Sentence-cased text>``.
    """

    def _trim_tail(fragment: str) -> str:
        # Order matters: dots first, then question marks, then whitespace.
        return fragment.rstrip(".").rstrip("?").strip()

    unbulleted = re.sub(r"^[-*+]\s+", "", raw_line.strip())
    heading = _trim_tail(unbulleted)

    # Italian chapter heading.
    it_alternatives = "|".join(_ORDINALS_IT)
    it_hit = re.match(rf"^CAPITOLO ({it_alternatives})\. (.+)", heading)
    if it_hit:
        ordinal, title = it_hit.groups()
        return f"## Capitolo {_ORDINALS_IT[ordinal]} — {_sentence_case(_trim_tail(title))}"

    # English chapter heading; a plain digit sequence is kept as-is.
    en_alternatives = "|".join(_ORDINALS_EN)
    en_hit = re.match(rf"^CHAPTER ({en_alternatives}|\d+)\.? (.+)", heading)
    if en_hit:
        ordinal, title = en_hit.groups()
        number = _ORDINALS_EN.get(ordinal, ordinal)
        return f"## Chapter {number} — {_sentence_case(_trim_tail(title))}"

    # Generic numbered heading; only trailing dots are trimmed from the title.
    numbered_hit = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", heading)
    if numbered_hit:
        label, title = numbered_hit.groups()
        return f"## {label}. {_sentence_case(title.rstrip('.').strip())}"

    return f"## {_sentence_case(heading)}"


def _extract_math_environments(text: str) -> tuple[str, int]:
    _ENVS = (
        r"Definizione|Definition|Teorema|Theorem|Lemma|"
        r"Proposizione|Proposition|Corollario|Corollary|"
        r"Osservazione|Remark|Nota|Note|Esempio|Example"
    )
    count  = 0
    blocks = text.split("\n\n")
    result = []

    for block in blocks:
        stripped = block.strip()
        if not stripped or stripped.startswith("#"):
            result.append(block)
            continue

        m = re.match(
            rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)",
            stripped,
            re.DOTALL,
        )
        if not m:
            result.append(block)
            continue

        env  = m.group(1)
        num  = m.group(2).rstrip(".")
        rest = m.group(3).strip()

        title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)
        if title_m:
            header = f"### {env} {num} {title_m.group(1)}"
            body   = title_m.group(2).strip()
        else:
            header = f"### {env} {num}."
            body   = rest

        result.append(f"{header}\n\n{body}" if body else header)
        count += 1

    return "\n\n".join(result), count


def _merge_title_headers(text: str) -> tuple[str, int]:
    count  = 0
    blocks = re.split(r"\n{2,}", text)
    result = []
    i = 0
    while i < len(blocks):
        block    = blocks[i]
        stripped = block.strip()
        if (
            re.match(r"^#{2,3} \d+\.\s*$", stripped)
            and i + 1 < len(blocks)
        ):
            nxt = blocks[i + 1].strip()
            if (
                nxt
                and "\n" not in nxt
                and len(nxt) <= 80
                and not nxt.startswith("#")
                and not re.match(r"^\d+[\.\)]\s", nxt)
            ):
                result.append(stripped.rstrip() + " " + nxt)
                count += 1
                i += 2
                continue
        result.append(block)
        i += 1
    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count


def _extract_article_headers(text: str) -> tuple[str, int]:
    count = 0

    def _repl(m: re.Match) -> str:
        nonlocal count
        num  = m.group(1)
        rest = m.group(2).strip()

        title_m = re.match(
            r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+"
            r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})",
            rest,
        )
        if title_m:
            count += 1
            return (
                f"### Art. {num}. {title_m.group(1)}.\n\n"
                f"{title_m.group(2).strip()}"
            )
        if rest:
            count += 1
            return f"### Art. {num}.\n\n{rest}"
        count += 1
        return f"### Art. {num}."

    text = re.sub(
        r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)",
        _repl,
        text,
        flags=re.MULTILINE,
    )
    return text, count
feat: integra pipeline PDF→Markdown a 9 stadi e test suite 2026-05-11 14:46:16 +02:00			`"""Funzioni helper pure condivise tra i moduli di trasformazione."""`
			`import re`

			`from ._constants import _ORDINALS_IT, _ORDINALS_EN`


			`def _sentence_case(s: str) -> str:`
			`if not s:`
			`return s`
			`lower = s.lower()`
			`return lower[0].upper() + lower[1:]`


			`def _is_allcaps_line(line: str) -> bool:`
			`stripped = line.strip()`
			`letters = [c for c in stripped if c.isalpha()]`
			`return (`
			`len(letters) >= 3`
			`and all(c.isupper() for c in letters)`
			`and not stripped.startswith("#")`
			`and not stripped.startswith("\|")`
			`)`


			`def _allcaps_to_header(raw_line: str) -> str:`
			`text = re.sub(r"^[-*+]\s+", "", raw_line.strip())`
			`text = text.rstrip(".").rstrip("?").strip()`

			`_ORD_IT_PAT = "\|".join(_ORDINALS_IT.keys())`
			`m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text)`
			`if m:`
			`roman = _ORDINALS_IT[m.group(1)]`
			`titolo = m.group(2).rstrip(".").rstrip("?").strip()`
			`return f"## Capitolo {roman} — {_sentence_case(titolo)}"`

			`_ORD_EN_PAT = "\|".join(_ORDINALS_EN.keys())`
			`m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}\|\d+)\.? (.+)", text)`
			`if m:`
			`n = _ORDINALS_EN.get(m.group(1), m.group(1))`
			`titolo = m.group(2).rstrip(".").rstrip("?").strip()`
			`return f"## Chapter {n} — {_sentence_case(titolo)}"`

			`m = re.match(r"^([IVXLCDM]+\|[0-9]+)\. (.+)", text)`
			`if m:`
			`return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}"`

			`return f"## {_sentence_case(text)}"`


			`def _extract_math_environments(text: str) -> tuple[str, int]:`
			`_ENVS = (`
			`r"Definizione\|Definition\|Teorema\|Theorem\|Lemma\|"`
			`r"Proposizione\|Proposition\|Corollario\|Corollary\|"`
			`r"Osservazione\|Remark\|Nota\|Note\|Esempio\|Example"`
			`)`
			`count = 0`
			`blocks = text.split("\n\n")`
			`result = []`

			`for block in blocks:`
			`stripped = block.strip()`
			`if not stripped or stripped.startswith("#"):`
			`result.append(block)`
			`continue`

			`m = re.match(`
			`rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s(.)",`
			`stripped,`
			`re.DOTALL,`
			`)`
			`if not m:`
			`result.append(block)`
			`continue`

			`env = m.group(1)`
			`num = m.group(2).rstrip(".")`
			`rest = m.group(3).strip()`

			`title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)`
			`if title_m:`
			`header = f"### {env} {num} {title_m.group(1)}"`
			`body = title_m.group(2).strip()`
			`else:`
			`header = f"### {env} {num}."`
			`body = rest`

			`result.append(f"{header}\n\n{body}" if body else header)`
			`count += 1`

			`return "\n\n".join(result), count`


			`def _merge_title_headers(text: str) -> tuple[str, int]:`
			`count = 0`
			`blocks = re.split(r"\n{2,}", text)`
			`result = []`
			`i = 0`
			`while i < len(blocks):`
			`block = blocks[i]`
			`stripped = block.strip()`
			`if (`
			`re.match(r"^#{2,3} \d+\.\s*$", stripped)`
			`and i + 1 < len(blocks)`
			`):`
			`nxt = blocks[i + 1].strip()`
			`if (`
			`nxt`
			`and "\n" not in nxt`
			`and len(nxt) <= 80`
			`and not nxt.startswith("#")`
			`and not re.match(r"^\d+[\.\)]\s", nxt)`
			`):`
			`result.append(stripped.rstrip() + " " + nxt)`
			`count += 1`
			`i += 2`
			`continue`
			`result.append(block)`
			`i += 1`
			`return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count`


			`def _extract_article_headers(text: str) -> tuple[str, int]:`
			`count = 0`

			`def _repl(m: re.Match) -> str:`
			`nonlocal count`
			`num = m.group(1)`
			`rest = m.group(2).strip()`

			`title_m = re.match(`
			`r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+"`
			`r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})",`
			`rest,`
			`)`
			`if title_m:`
			`count += 1`
			`return (`
			`f"### Art. {num}. {title_m.group(1)}.\n\n"`
			`f"{title_m.group(2).strip()}"`
			`)`
			`if rest:`
			`count += 1`
			`return f"### Art. {num}.\n\n{rest}"`
			`count += 1`
			`return f"### Art. {num}."`

			`text = re.sub(`
			`r"^-\s+Art\.\s+([\d]+[a-z\-])\.\s(.*)",`
			`_repl,`
			`text,`
			`flags=re.MULTILINE,`
			`)`
			`return text, count`