"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli.""" import re from ._constants import ( _TOC_KEYWORDS, _BIB_MARKERS_RE, _TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE, ) from ._helpers import ( _is_allcaps_line, _allcaps_to_header, _extract_math_environments, _extract_article_headers, ) def _t_remove_toc(text: str) -> tuple[str, int]: lines = text.split("\n") new_lines = [] _in_toc = False removed = False for line in lines: bare = re.sub(r"^#+\s*", "", line.strip()) first_word = bare.split(".")[0].strip().lower() if first_word in _TOC_KEYWORDS: removed = True _in_toc = True continue if _in_toc: if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line): continue if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line): continue if len(line.strip()) > 200: _in_toc = False new_lines.append(line) continue _in_toc = False new_lines.append(line) return "\n".join(new_lines), 1 if removed else 0 def _t_remove_orphan_toc(text: str) -> tuple[str, int]: """ Rimuove voci di sommario senza dot-leader che sfuggono a _t_remove_toc. Rileva: (a) blocchi di 3+ righe consecutive che matchano il pattern TOC nei primi 25% del documento; (b) header ### N. Titolo PAGINA il cui corpo è una lista di voci numerate. """ blocks = re.split(r"\n{2,}", text) total = len(blocks) cutoff = max(10, min(40, int(total * 0.25))) to_drop = set() i = 0 while i < cutoff and i < total: b = blocks[i].strip() # (a) Sequenza di 3+ blocchi TOC consecutivi if _TOC_ITEM_RE.match(b): j = i while j < min(cutoff, i + 60) and j < len(blocks) and _TOC_ITEM_RE.match(blocks[j].strip()): j += 1 if j - i >= 3: for k in range(i, j): to_drop.add(k) # Rimuovi anche l'header ### precedente se ha numero di pagina if i > 0 and _TOC_HDR_WITH_PAGE_RE.match(blocks[i - 1].strip()): to_drop.add(i - 1) i = j continue # (b) Header ### N. Titolo PAGINA con corpo che è lista di voci numerate if _TOC_HDR_WITH_PAGE_RE.match(b): body = blocks[i + 1].strip() if i + 1 < len(blocks) else "" # Il corpo contiene 2+ occorrenze di "N. Titolo" toc_hits = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", body) if len(toc_hits) >= 2 and len(body) < 300: to_drop.add(i) if i + 1 < total: to_drop.add(i + 1) i += 2 continue i += 1 if not to_drop: return text, 0 kept = [b for idx, b in enumerate(blocks) if idx not in to_drop] return re.sub(r"\n{3,}", "\n\n", "\n\n".join(kept)), len(to_drop) def _t_allcaps_to_headers(text: str) -> tuple[str, int]: count = 0 blocks = text.split("\n\n") new_blocks = [] for block in blocks: stripped = block.strip() if "\n" not in stripped and _is_allcaps_line(stripped): new_blocks.append(_allcaps_to_header(stripped)) count += 1 else: sub_lines = block.split("\n") converted = [] for ln in sub_lines: if _is_allcaps_line(ln) and len(ln.strip()) > 3: converted.append(_allcaps_to_header(ln)) count += 1 else: converted.append(ln) new_blocks.append("\n".join(converted)) return "\n\n".join(new_blocks), count def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]: count = 0 def _num_repl(m: re.Match) -> str: nonlocal count content = m.group(2).strip() if content.endswith(".") and len(content) > 40: return m.group(0) if _BIB_MARKERS_RE.search(content): return m.group(0) count += 1 return f"### {m.group(1)}.\n\n{content}" text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE) def _num_letter_repl(m: re.Match) -> str: nonlocal count count += 1 return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}" text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE) if not has_exercises: def _aphorism_repl(m: re.Match) -> str: nonlocal count content = m.group(2).strip() if _BIB_MARKERS_RE.search(content): return m.group(0) count += 1 return f"\n\n### {m.group(1)}.\n\n{content}" text = re.sub( r"^-\s+(\d{1,3})\.\s+(.{10,})$", _aphorism_repl, text, flags=re.MULTILINE, ) def _list_section_repl(m: re.Match) -> str: nonlocal count num = m.group(1) content = m.group(2).strip() if _BIB_MARKERS_RE.search(content): return m.group(0) count += 1 split = re.search( r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+" r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])", content, ) if split and split.start() >= 3: title = content[: split.start()].strip() body = content[split.end():].strip() if len(body) >= 20: return f"\n\n### {num}. {title}\n\n{body}" return f"\n\n### {num}. {content}" text = re.sub( r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$", _list_section_repl, text, flags=re.MULTILINE, ) return text, count def _t_extract_math(text: str) -> tuple[str, int]: return _extract_math_environments(text) def _t_extract_articles(text: str) -> tuple[str, int]: return _extract_article_headers(text)