Files
rag-from-scratch/conversione/_pipeline/_structure.py
T
davide 3f4689e8fd feat: rileva note bibliografiche e raccolte multi-articolo in pipeline
Risolve la conversione errata di note a piè di pagina accademiche in
header Markdown nei testi giuridici (es. dirittopubblico: da 424 h2
errati → 27 h2 semanticamente corretti).

- _BIB_MARKERS_RE: aggiunge ibid., cfr., op. cit., cit., ivi
- _FOOTNOTE_AUTHOR_RE: nuovo pattern per "A. COGNOME" (es. G. GUZZETTA)
- _num_repl / _aphorism_repl / _list_section_repl: usano entrambi i
  guard per non convertire note bibliografiche in sezioni
- _t_promote_chapter_headers: usa max-count ≥ 3 per distinguere
  raccolte multi-articolo (non promuovere) da libri con capitoli
  sequenziali (promuovere); preserva il comportamento corretto su anatomia
- _t_remove_page_markers / _t_remove_page_numbers / _t_remove_separators:
  nuove transform per page marker PDF, numeri isolati, separatori underscore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 16:12:50 +02:00

249 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli."""
import re
from ._constants import (
_TOC_KEYWORDS, _BIB_MARKERS_RE, _FOOTNOTE_AUTHOR_RE,
_TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
)
from ._helpers import (
_is_allcaps_line, _allcaps_to_header,
_extract_math_environments, _extract_article_headers,
)
def _t_remove_toc(text: str) -> tuple[str, int]:
    """
    Drop a table-of-contents heading and the entry lines that follow it.

    A line opens "TOC mode" when, after removing leading ``#`` markers, the
    text before its first ``.`` (trimmed and lowercased) is a member of
    ``_TOC_KEYWORDS``; that line itself is discarded. While in TOC mode,
    blank lines, bullet items starting with a digit, bullet items ending in
    a 1-3 digit number, and lines ending in ``pag. N`` are discarded too.
    The first line matching none of these ends TOC mode and is kept.

    Returns:
        The filtered text, and ``1`` if at least one TOC heading line was
        removed, otherwise ``0``.
    """
    lines = text.split("\n")
    new_lines: list[str] = []
    _in_toc = False
    removed = False
    for line in lines:
        # Strip Markdown heading markers so "## Indice" and "Indice" compare alike.
        bare = re.sub(r"^#+\s*", "", line.strip())
        # Only the text before the first "." is compared against the keywords.
        first_word = bare.split(".")[0].strip().lower()
        if first_word in _TOC_KEYWORDS:
            removed = True
            _in_toc = True
            continue
        if _in_toc:
            # Blank line, or a bullet whose text starts with a digit.
            if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                continue
            # Bullet whose text ends with a 1-3 digit number (a page reference).
            if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
                continue
            # Short lines with a page reference (e.g. "Prefazione pag. 4")
            if re.match(r"^.{3,80}\s+pag\.\s*\d{1,4}\s*$", line.strip()):
                continue
            # NOTE(review): with this nesting the branch below has the same
            # effect as the fall-through that follows it (leave TOC mode and
            # keep the line) — confirm whether short lines were meant to
            # keep TOC mode active instead.
            if len(line.strip()) > 200:
                _in_toc = False
                new_lines.append(line)
                continue
            _in_toc = False
        new_lines.append(line)
    return "\n".join(new_lines), 1 if removed else 0
def _t_remove_orphan_toc(text: str) -> tuple[str, int]:
    """
    Remove table-of-contents entries that have no dot leaders.

    The text is split into blocks on runs of two or more newlines, and only
    the leading blocks are scanned (a quarter of the block count, clamped
    to the range 10..40). Two shapes are dropped:

    (a) runs of 3+ consecutive blocks matching ``_TOC_ITEM_RE``, together
        with the block just before the run when it matches
        ``_TOC_HDR_WITH_PAGE_RE``;
    (b) a block matching ``_TOC_HDR_WITH_PAGE_RE`` whose following block is
        shorter than 300 characters and contains at least two
        "number + letter" occurrences; both blocks are dropped.

    Returns:
        The text rebuilt from the surviving blocks (joined by a blank line,
        with runs of 3+ newlines collapsed) and the number of blocks
        dropped. When nothing is dropped the input is returned untouched
        with a count of ``0``.
    """
    blocks = re.split(r"\n{2,}", text)
    n_blocks = len(blocks)
    scan_limit = max(10, min(40, int(n_blocks * 0.25)))
    doomed = set()

    pos = 0
    while pos < scan_limit and pos < n_blocks:
        current = blocks[pos].strip()

        # (a) A run of 3+ consecutive TOC-like blocks.
        if _TOC_ITEM_RE.match(current):
            run_cap = min(scan_limit, pos + 60)
            run_end = pos
            while (
                run_end < run_cap
                and run_end < len(blocks)
                and _TOC_ITEM_RE.match(blocks[run_end].strip())
            ):
                run_end += 1
            if run_end - pos >= 3:
                doomed.update(range(pos, run_end))
                # Also drop the preceding header when it carries a page number.
                if pos > 0 and _TOC_HDR_WITH_PAGE_RE.match(blocks[pos - 1].strip()):
                    doomed.add(pos - 1)
                pos = run_end
                continue

        # (b) A header with a page number whose body is a list of numbered entries.
        if _TOC_HDR_WITH_PAGE_RE.match(current):
            follower = pos + 1
            body = blocks[follower].strip() if follower < len(blocks) else ""
            # The body must hold 2+ occurrences shaped like "N. Title".
            entries = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", body)
            if len(entries) >= 2 and len(body) < 300:
                doomed.add(pos)
                if follower < n_blocks:
                    doomed.add(follower)
                pos += 2
                continue

        pos += 1

    if not doomed:
        return text, 0

    survivors = [blk for idx, blk in enumerate(blocks) if idx not in doomed]
    rebuilt = "\n\n".join(survivors)
    return re.sub(r"\n{3,}", "\n\n", rebuilt), len(doomed)
def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
    """
    Convert all-caps lines into headers via ``_allcaps_to_header``.

    The text is processed one blank-line-separated block at a time. A block
    that is a single all-caps line (after stripping) is replaced wholesale.
    Otherwise each line of the block is checked on its own and converted
    only when it is all-caps and longer than 3 characters once stripped.

    Returns:
        The rewritten text and the number of lines converted.
    """
    promoted = 0
    out_blocks = []
    for chunk in text.split("\n\n"):
        trimmed = chunk.strip()

        # Whole block is one all-caps line: replace it with the header form.
        if "\n" not in trimmed and _is_allcaps_line(trimmed):
            out_blocks.append(_allcaps_to_header(trimmed))
            promoted += 1
            continue

        # Mixed block: inspect every line individually.
        rebuilt = []
        for row in chunk.split("\n"):
            if _is_allcaps_line(row) and len(row.strip()) > 3:
                rebuilt.append(_allcaps_to_header(row))
                promoted += 1
            else:
                rebuilt.append(row)
        out_blocks.append("\n".join(rebuilt))

    return "\n\n".join(out_blocks), promoted
def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
    """
    Rewrite numbered lines as ``###`` section headers.

    Passes run in this order, each over the output of the previous one:

    1. ``N. text`` lines become ``### N. text``, unless the content ends
       with a period and exceeds 40 characters, exceeds 130 characters, or
       matches ``_BIB_MARKERS_RE`` / ``_FOOTNOTE_AUTHOR_RE``.
    2. ``N a. text`` lines (number, one lowercase letter, period) become
       ``### Na.`` followed by the text as a separate paragraph.
    3. Only when ``has_exercises`` is false: ``- N. text`` bullets (text of
       10+ characters) become headers, with the same bibliography guards.
    4. ``- N Text`` bullets (no period after the number) become headers,
       with the same bibliography guards.

    In passes 1, 3 and 4 the content is split into a title and a body
    paragraph at the first whitespace between a lowercase and an uppercase
    letter, provided the title and body are long enough.

    Returns:
        The rewritten text and the number of headers created.
    """
    count = 0

    def _num_repl(m: re.Match) -> str:
        nonlocal count
        content = m.group(2).strip()
        # A long line ending with a period reads as a sentence: leave it alone.
        if content.endswith(".") and len(content) > 40:
            return m.group(0)
        # Long paragraph: not a section title
        if len(content) > 130:
            return m.group(0)
        # Bibliographic notes must not be turned into sections.
        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
            return m.group(0)
        count += 1
        # Try to split the title from the body at the first lowercase→uppercase transition
        split = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            content,
        )
        # Split only when the title has 3+ characters and the body 40+.
        if split and 3 <= split.start() and len(content) - split.end() >= 40:
            title = content[: split.start()].strip()
            body = content[split.end():].strip()
            return f"### {m.group(1)}. {title}\n\n{body}"
        return f"### {m.group(1)}. {content}"

    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)

    def _num_letter_repl(m: re.Match) -> str:
        nonlocal count
        count += 1
        # The header carries only "Na."; the rest of the line becomes the body.
        return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"

    text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)

    if not has_exercises:
        def _aphorism_repl(m: re.Match) -> str:
            nonlocal count
            content = m.group(2).strip()
            # Bibliographic notes must not be turned into sections.
            if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
                return m.group(0)
            count += 1
            split = re.search(
                r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
                r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
                content,
            )
            # Split only when the title has 3+ characters and the body 40+.
            if split and 3 <= split.start() and len(content) - split.end() >= 40:
                title = content[: split.start()].strip()
                body = content[split.end():].strip()
                return f"\n\n### {m.group(1)}. {title}\n\n{body}"
            return f"\n\n### {m.group(1)}. {content}"

        text = re.sub(
            r"^-[ \t]+(\d{1,3})\.[ \t]+(.{10,})$",
            _aphorism_repl,
            text,
            flags=re.MULTILINE,
        )

    # NOTE(review): this pass is placed at function level, i.e. it also runs
    # when has_exercises is true — confirm that is the intended scope.
    def _list_section_repl(m: re.Match) -> str:
        nonlocal count
        num = m.group(1)
        content = m.group(2).strip()
        # Bibliographic notes must not be turned into sections.
        if _BIB_MARKERS_RE.search(content) or _FOOTNOTE_AUTHOR_RE.search(content):
            return m.group(0)
        count += 1
        split = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            content,
        )
        # Here the body threshold is 20 characters rather than 40.
        if split and split.start() >= 3:
            title = content[: split.start()].strip()
            body = content[split.end():].strip()
            if len(body) >= 20:
                return f"\n\n### {num}. {title}\n\n{body}"
        return f"\n\n### {num}. {content}"

    text = re.sub(
        r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",
        _list_section_repl,
        text,
        flags=re.MULTILINE,
    )
    return text, count
def _t_promote_chapter_headers(text: str) -> tuple[str, int]:
"""
Promuove ### N. Titolo → ## N. Titolo quando sembrano capitoli principali.
Condizioni: ≥3 headers ### con numero 150, nessun ## già presente,
numeri di capitolo sequenziali e NON duplicati.
Numeri duplicati indicano una raccolta multi-articolo: non promuovere.
"""
if re.search(r"^## \d", text, re.MULTILINE):
return text, 0
pattern = re.compile(r"^### (\d+)\. (.+)$", re.MULTILINE)
matches = list(pattern.finditer(text))
chapter_matches = [m for m in matches if int(m.group(1)) <= 50]
if len(chapter_matches) < 3:
return text, 0
chapter_nums_list = [int(m.group(1)) for m in chapter_matches]
# Se qualche numero appare ≥3 volte è una raccolta multi-articolo: non promuovere
num_counter: dict[int, int] = {}
for n in chapter_nums_list:
num_counter[n] = num_counter.get(n, 0) + 1
if max(num_counter.values()) >= 3:
return text, 0
chapter_nums = set(chapter_nums_list)
count = 0
def _repl(m: re.Match) -> str:
nonlocal count
if int(m.group(1)) in chapter_nums:
count += 1
return f"## {m.group(1)}. {m.group(2)}"
return m.group(0)
return pattern.sub(_repl, text), count
def _t_extract_math(text: str) -> tuple[str, int]:
    """Run ``_extract_math_environments`` on *text* and return its result unchanged."""
    outcome = _extract_math_environments(text)
    return outcome
def _t_extract_articles(text: str) -> tuple[str, int]:
    """Run ``_extract_article_headers`` on *text* and return its result unchanged."""
    outcome = _extract_article_headers(text)
    return outcome