154 lines
4.3 KiB
Python
154 lines
4.3 KiB
Python
|
|
"""Funzioni helper pure condivise tra i moduli di trasformazione."""
|
||
|
|
import re
|
||
|
|
|
||
|
|
from ._constants import _ORDINALS_IT, _ORDINALS_EN
|
||
|
|
|
||
|
|
|
||
|
|
def _sentence_case(s: str) -> str:
|
||
|
|
if not s:
|
||
|
|
return s
|
||
|
|
lower = s.lower()
|
||
|
|
return lower[0].upper() + lower[1:]
|
||
|
|
|
||
|
|
|
||
|
|
def _is_allcaps_line(line: str) -> bool:
|
||
|
|
stripped = line.strip()
|
||
|
|
letters = [c for c in stripped if c.isalpha()]
|
||
|
|
return (
|
||
|
|
len(letters) >= 3
|
||
|
|
and all(c.isupper() for c in letters)
|
||
|
|
and not stripped.startswith("#")
|
||
|
|
and not stripped.startswith("|")
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def _allcaps_to_header(raw_line: str) -> str:
    """Turn an all-caps line into a level-2 Markdown heading.

    A leading list marker (``-``, ``*`` or ``+``) and trailing ``.`` / ``?``
    characters are removed first. The cleaned text is then tried, in order,
    against three shapes:

    1. ``CAPITOLO <ordinal>. <title>`` - the ordinal is mapped through
       ``_ORDINALS_IT``.
    2. ``CHAPTER <ordinal-or-digits>[.] <title>`` - the token is mapped
       through ``_ORDINALS_EN``; tokens missing from it (plain digits) are
       kept verbatim.
    3. ``<roman-letters-or-digits>. <title>`` - the number is kept as is.

    Anything else becomes ``## <sentence-cased text>``. Titles are always
    passed through ``_sentence_case``.
    """

    def _trim(fragment: str) -> str:
        # Trailing dots go first, then trailing question marks, then spaces.
        return fragment.rstrip(".").rstrip("?").strip()

    cleaned = _trim(re.sub(r"^[-*+]\s+", "", raw_line.strip()))

    # Italian chapter line.
    italian_ordinals = "|".join(_ORDINALS_IT.keys())
    hit = re.match(rf"^CAPITOLO ({italian_ordinals})\. (.+)", cleaned)
    if hit is not None:
        label = _ORDINALS_IT[hit.group(1)]
        heading = _sentence_case(_trim(hit.group(2)))
        return f"## Capitolo {label} — {heading}"

    # English chapter line; the ordinal may also be a plain number.
    english_ordinals = "|".join(_ORDINALS_EN.keys())
    hit = re.match(rf"^CHAPTER ({english_ordinals}|\d+)\.? (.+)", cleaned)
    if hit is not None:
        token = hit.group(1)
        label = _ORDINALS_EN.get(token, token)
        heading = _sentence_case(_trim(hit.group(2)))
        return f"## Chapter {label} — {heading}"

    # Bare numbered title; here only trailing dots are trimmed from the title.
    hit = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", cleaned)
    if hit is not None:
        heading = _sentence_case(hit.group(2).rstrip(".").strip())
        return f"## {hit.group(1)}. {heading}"

    return f"## {_sentence_case(cleaned)}"
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_math_environments(text: str) -> tuple[str, int]:
|
||
|
|
_ENVS = (
|
||
|
|
r"Definizione|Definition|Teorema|Theorem|Lemma|"
|
||
|
|
r"Proposizione|Proposition|Corollario|Corollary|"
|
||
|
|
r"Osservazione|Remark|Nota|Note|Esempio|Example"
|
||
|
|
)
|
||
|
|
count = 0
|
||
|
|
blocks = text.split("\n\n")
|
||
|
|
result = []
|
||
|
|
|
||
|
|
for block in blocks:
|
||
|
|
stripped = block.strip()
|
||
|
|
if not stripped or stripped.startswith("#"):
|
||
|
|
result.append(block)
|
||
|
|
continue
|
||
|
|
|
||
|
|
m = re.match(
|
||
|
|
rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)",
|
||
|
|
stripped,
|
||
|
|
re.DOTALL,
|
||
|
|
)
|
||
|
|
if not m:
|
||
|
|
result.append(block)
|
||
|
|
continue
|
||
|
|
|
||
|
|
env = m.group(1)
|
||
|
|
num = m.group(2).rstrip(".")
|
||
|
|
rest = m.group(3).strip()
|
||
|
|
|
||
|
|
title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)
|
||
|
|
if title_m:
|
||
|
|
header = f"### {env} {num} {title_m.group(1)}"
|
||
|
|
body = title_m.group(2).strip()
|
||
|
|
else:
|
||
|
|
header = f"### {env} {num}."
|
||
|
|
body = rest
|
||
|
|
|
||
|
|
result.append(f"{header}\n\n{body}" if body else header)
|
||
|
|
count += 1
|
||
|
|
|
||
|
|
return "\n\n".join(result), count
|
||
|
|
|
||
|
|
|
||
|
|
def _merge_title_headers(text: str) -> tuple[str, int]:
|
||
|
|
count = 0
|
||
|
|
blocks = re.split(r"\n{2,}", text)
|
||
|
|
result = []
|
||
|
|
i = 0
|
||
|
|
while i < len(blocks):
|
||
|
|
block = blocks[i]
|
||
|
|
stripped = block.strip()
|
||
|
|
if (
|
||
|
|
re.match(r"^#{2,3} \d+\.\s*$", stripped)
|
||
|
|
and i + 1 < len(blocks)
|
||
|
|
):
|
||
|
|
nxt = blocks[i + 1].strip()
|
||
|
|
if (
|
||
|
|
nxt
|
||
|
|
and "\n" not in nxt
|
||
|
|
and len(nxt) <= 80
|
||
|
|
and not nxt.startswith("#")
|
||
|
|
and not re.match(r"^\d+[\.\)]\s", nxt)
|
||
|
|
):
|
||
|
|
result.append(stripped.rstrip() + " " + nxt)
|
||
|
|
count += 1
|
||
|
|
i += 2
|
||
|
|
continue
|
||
|
|
result.append(block)
|
||
|
|
i += 1
|
||
|
|
return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_article_headers(text: str) -> tuple[str, int]:
|
||
|
|
count = 0
|
||
|
|
|
||
|
|
def _repl(m: re.Match) -> str:
|
||
|
|
nonlocal count
|
||
|
|
num = m.group(1)
|
||
|
|
rest = m.group(2).strip()
|
||
|
|
|
||
|
|
title_m = re.match(
|
||
|
|
r"^([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda].{1,74}?)\.\s+"
|
||
|
|
r"([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\(\d].{4,})",
|
||
|
|
rest,
|
||
|
|
)
|
||
|
|
if title_m:
|
||
|
|
count += 1
|
||
|
|
return (
|
||
|
|
f"### Art. {num}. {title_m.group(1)}.\n\n"
|
||
|
|
f"{title_m.group(2).strip()}"
|
||
|
|
)
|
||
|
|
if rest:
|
||
|
|
count += 1
|
||
|
|
return f"### Art. {num}.\n\n{rest}"
|
||
|
|
count += 1
|
||
|
|
return f"### Art. {num}."
|
||
|
|
|
||
|
|
text = re.sub(
|
||
|
|
r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)",
|
||
|
|
_repl,
|
||
|
|
text,
|
||
|
|
flags=re.MULTILINE,
|
||
|
|
)
|
||
|
|
return text, count
|