Files
rag-from-scratch/conversione/_pipeline/_structure.py
T
davide 64dc403e80 refactor: ottimizza pipeline PDF→Markdown — struttura piatta e verbosità
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF)
- Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle)
- Aggiunge spinner animato (thread) durante conversione opendataloader-pdf
- Aggiunge progresso step-by-step [i/37] per apply_transforms via callback
- Mostra punteggio qualità (score/100 grade) a fine elaborazione
- Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline)
- Fix: report.py importa regex da _constants invece di ridefinirle
- Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 14:30:41 +02:00

185 lines
6.0 KiB
Python

"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli."""
import re
from ._constants import (
_TOC_KEYWORDS, _BIB_MARKERS_RE,
_TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
)
from ._helpers import (
_is_allcaps_line, _allcaps_to_header,
_extract_math_environments, _extract_article_headers,
)
def _t_remove_toc(text: str) -> tuple[str, int]:
    """Strip table-of-contents sections introduced by a TOC keyword line.

    A line opens a TOC when, after dropping any leading ``#`` markers, the
    text before its first ``.`` (trimmed and lower-cased) is a member of
    ``_TOC_KEYWORDS``. That line is removed, and so are the blank lines and
    bullet entries that follow it. The first line of any other kind closes
    the TOC and is kept.

    Args:
        text: Markdown text to clean.

    Returns:
        ``(cleaned_text, flag)`` where ``flag`` is 1 if at least one TOC
        keyword line was found and 0 otherwise.
    """
    blank_re = re.compile(r"^\s*$")
    numbered_bullet_re = re.compile(r"^\s*[-*+]\s+\d")
    paged_bullet_re = re.compile(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$")

    kept = []
    inside_toc = False
    found = False
    for raw in text.split("\n"):
        heading_text = re.sub(r"^#+\s*", "", raw.strip())
        lead = heading_text.split(".")[0].strip().lower()
        if lead in _TOC_KEYWORDS:
            found = True
            inside_toc = True
            continue
        if inside_toc:
            is_toc_entry = (
                blank_re.match(raw)
                or numbered_bullet_re.match(raw)
                or paged_bullet_re.match(raw)
            )
            if is_toc_entry:
                continue
            # Any other line, whatever its length, closes the TOC and is kept.
            inside_toc = False
        kept.append(raw)
    return "\n".join(kept), int(found)
def _t_remove_orphan_toc(text: str) -> tuple[str, int]:
    """Remove TOC entries without dot leaders that ``_t_remove_toc`` misses.

    The text is split into blocks on runs of two or more newlines and only
    the leading blocks are scanned: 25% of the block count, clamped to the
    range 10..40 (and never past the last block). Two shapes are dropped:

    (a) a run of 3 or more consecutive blocks matching ``_TOC_ITEM_RE``,
        plus the block right before the run when that one matches
        ``_TOC_HDR_WITH_PAGE_RE`` (a header carrying a page number);
    (b) a block matching ``_TOC_HDR_WITH_PAGE_RE`` whose following block is
        shorter than 300 characters and contains at least two
        "number, optional dot, whitespace, letter" occurrences; the
        following block is dropped as well.

    Args:
        text: Markdown text to clean.

    Returns:
        ``(cleaned_text, dropped)`` where ``dropped`` is the number of
        blocks removed. When nothing is dropped the input text is returned
        unchanged with a count of 0.
    """
    chunks = re.split(r"\n{2,}", text)
    n_chunks = len(chunks)
    scan_end = max(10, min(40, int(n_chunks * 0.25)))
    doomed = set()

    pos = 0
    while pos < scan_end and pos < n_chunks:
        current = chunks[pos].strip()

        # (a) Run of 3+ consecutive TOC-item blocks.
        if _TOC_ITEM_RE.match(current):
            run_limit = min(scan_end, pos + 60, n_chunks)
            run_end = pos
            while run_end < run_limit and _TOC_ITEM_RE.match(chunks[run_end].strip()):
                run_end += 1
            if run_end - pos >= 3:
                doomed.update(range(pos, run_end))
                # Also drop the preceding header if it carries a page number.
                if pos > 0 and _TOC_HDR_WITH_PAGE_RE.match(chunks[pos - 1].strip()):
                    doomed.add(pos - 1)
                pos = run_end
                continue

        # (b) Header with page number whose body is a list of numbered entries.
        if _TOC_HDR_WITH_PAGE_RE.match(current):
            has_next = pos + 1 < n_chunks
            following = chunks[pos + 1].strip() if has_next else ""
            # The body must contain 2+ occurrences shaped like "N. Title".
            entries = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", following)
            if len(entries) >= 2 and len(following) < 300:
                doomed.add(pos)
                if has_next:
                    doomed.add(pos + 1)
                pos += 2
                continue

        pos += 1

    if not doomed:
        return text, 0

    survivors = [chunk for idx, chunk in enumerate(chunks) if idx not in doomed]
    rebuilt = re.sub(r"\n{3,}", "\n\n", "\n\n".join(survivors))
    return rebuilt, len(doomed)
def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
    """Rewrite all-caps lines as headers via ``_allcaps_to_header``.

    The text is processed one paragraph at a time (paragraphs are separated
    by ``"\\n\\n"``). A paragraph that, once stripped, is a single line
    accepted by ``_is_allcaps_line`` is replaced whole. Otherwise each of
    its lines is checked individually and converted only when it is
    accepted by ``_is_allcaps_line`` and its stripped length exceeds 3.

    Args:
        text: Markdown text to transform.

    Returns:
        ``(new_text, count)`` where ``count`` is the number of conversions.
    """
    promoted = 0

    def _rewrite(paragraph: str) -> str:
        # Convert one paragraph, bumping the shared conversion counter.
        nonlocal promoted
        trimmed = paragraph.strip()
        if "\n" not in trimmed and _is_allcaps_line(trimmed):
            header = _allcaps_to_header(trimmed)
            promoted += 1
            return header

        rows = []
        for row in paragraph.split("\n"):
            if _is_allcaps_line(row) and len(row.strip()) > 3:
                rows.append(_allcaps_to_header(row))
                promoted += 1
            else:
                rows.append(row)
        return "\n".join(rows)

    rewritten = [_rewrite(paragraph) for paragraph in text.split("\n\n")]
    return "\n\n".join(rewritten), promoted
def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
    """Turn numbered lines and numbered list items into ``###`` headers.

    Four line-anchored rewrites are applied in order over the whole text:

    1. ``N. content`` becomes ``### N.``, a blank line, then the content.
       Left untouched when the content ends with ``.`` and is longer than
       40 characters, or when ``_BIB_MARKERS_RE`` matches inside it.
    2. ``Na. content`` / ``N a. content`` becomes ``### Na.``, a blank
       line, then the content.
    3. ``- N. content`` (content of 10+ characters) becomes a ``### N.``
       header preceded by a blank line; skipped when ``_BIB_MARKERS_RE``
       matches. Only applied when ``has_exercises`` is false.
    4. ``- N Title body`` becomes ``### N. Title`` followed by the body when
       a lowercase-to-uppercase word boundary splits off a body of 20+
       characters; otherwise the whole content goes on the header line.
       Skipped when ``_BIB_MARKERS_RE`` matches.

    Args:
        text: Markdown text to transform.
        has_exercises: When true, rewrite 3 is not applied.

    Returns:
        ``(new_text, count)`` where ``count`` is the number of rewrites made.
    """
    converted = 0

    def _looks_bibliographic(fragment: str) -> bool:
        # True when the fragment carries a bibliography marker.
        return _BIB_MARKERS_RE.search(fragment) is not None

    def _plain_number(match: re.Match) -> str:
        nonlocal converted
        body = match.group(2).strip()
        # A long fragment ending in a period is treated as a sentence, not a title.
        is_sentence = body.endswith(".") and len(body) > 40
        if is_sentence or _looks_bibliographic(body):
            return match.group(0)
        converted += 1
        return f"### {match.group(1)}.\n\n{body}"

    def _number_with_letter(match: re.Match) -> str:
        nonlocal converted
        converted += 1
        return f"### {match.group(1)}{match.group(2)}.\n\n{match.group(3).strip()}"

    def _bulleted_number(match: re.Match) -> str:
        nonlocal converted
        body = match.group(2).strip()
        if _looks_bibliographic(body):
            return match.group(0)
        converted += 1
        return f"\n\n### {match.group(1)}.\n\n{body}"

    def _bulleted_section(match: re.Match) -> str:
        nonlocal converted
        number = match.group(1)
        body = match.group(2).strip()
        if _looks_bibliographic(body):
            return match.group(0)
        converted += 1
        # First lowercase letter followed by whitespace and an uppercase
        # letter: taken as the boundary between the title and its body.
        boundary = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            body,
        )
        if boundary is not None and boundary.start() >= 3:
            heading = body[: boundary.start()].strip()
            remainder = body[boundary.end():].strip()
            if len(remainder) >= 20:
                return f"\n\n### {number}. {heading}\n\n{remainder}"
        return f"\n\n### {number}. {body}"

    text = re.sub(r"^(\d+)\.\s+(.+)$", _plain_number, text, flags=re.MULTILINE)
    text = re.sub(
        r"^(\d+)\s*([a-z])\.\s+(.+)$",
        _number_with_letter,
        text,
        flags=re.MULTILINE,
    )

    # NOTE(review): the source's indentation was lost, so the extent of this
    # guard is an assumption: only the "- N. text" rewrite is gated here and
    # the "- N Title" rewrite below always runs. Confirm against the repo.
    if not has_exercises:
        text = re.sub(
            r"^-\s+(\d{1,3})\.\s+(.{10,})$",
            _bulleted_number,
            text,
            flags=re.MULTILINE,
        )

    text = re.sub(
        r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",
        _bulleted_section,
        text,
        flags=re.MULTILINE,
    )
    return text, converted
def _t_extract_math(text: str) -> tuple[str, int]:
    """Pipeline step that delegates to ``_extract_math_environments``.

    Args:
        text: Markdown text to transform.

    Returns:
        The helper's result, passed through unchanged; per this wrapper's
        annotation, a ``(text, count)`` pair.
    """
    outcome = _extract_math_environments(text)
    return outcome
def _t_extract_articles(text: str) -> tuple[str, int]:
    """Pipeline step that delegates to ``_extract_article_headers``.

    Args:
        text: Markdown text to transform.

    Returns:
        The helper's result, passed through unchanged; per this wrapper's
        annotation, a ``(text, count)`` pair.
    """
    outcome = _extract_article_headers(text)
    return outcome