64dc403e80
- Unifica deps.py + checker.py + converter.py in extract.py (fronte PDF) - Sposta transforms/ in _pipeline/ (struttura piatta, no sottocartelle) - Aggiunge spinner animato (thread) durante conversione opendataloader-pdf - Aggiunge progresso step-by-step [i/37] per apply_transforms via callback - Mostra punteggio qualità (score/100 grade) a fine elaborazione - Fix: _DOTLEADER_RE spostata in _constants.py (non più definita inline) - Fix: report.py importa regex da _constants invece di ridefinirle - Fix: _t_remove_urls ora conta e ritorna le rimozioni effettive Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
185 lines
6.0 KiB
Python
185 lines
6.0 KiB
Python
"""Costruzione struttura: TOC, ALLCAPS→##, sezioni numerate, ambienti matematici, articoli."""
|
|
import re
|
|
|
|
from ._constants import (
|
|
_TOC_KEYWORDS, _BIB_MARKERS_RE,
|
|
_TOC_ITEM_RE, _TOC_HDR_WITH_PAGE_RE,
|
|
)
|
|
from ._helpers import (
|
|
_is_allcaps_line, _allcaps_to_header,
|
|
_extract_math_environments, _extract_article_headers,
|
|
)
|
|
|
|
|
|
def _t_remove_toc(text: str) -> tuple[str, int]:
    """Drop table-of-contents regions that start at a TOC keyword line.

    A line opens a TOC region when its text (leading ``#`` markers removed,
    cut at the first ``.``, stripped and lowercased) is a member of
    ``_TOC_KEYWORDS``; that opening line is discarded. While a region is
    open, blank lines, bullets whose text starts with a digit, and bullets
    whose 2-70 character text is followed by a 1-3 digit number are
    discarded as well. The first line of any other kind closes the region
    and is kept.

    Args:
        text: Document text, processed line by line (split on ``"\\n"``).

    Returns:
        The filtered text and 1 if at least one keyword line was found,
        otherwise 0. The second value is a flag, not a count of removed lines.
    """
    heading_marks = re.compile(r"^#+\s*")
    blank_line = re.compile(r"^\s*$")
    numbered_bullet = re.compile(r"^\s*[-*+]\s+\d")
    paged_bullet = re.compile(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$")

    kept: list[str] = []
    skipping = False
    found_keyword = False

    for raw in text.split("\n"):
        label = heading_marks.sub("", raw.strip()).split(".")[0].strip().lower()
        if label in _TOC_KEYWORDS:
            found_keyword = True
            skipping = True
            continue

        # The patterns are tested against the unstripped line on purpose:
        # each of them tolerates leading whitespace by itself.
        if skipping and (
            blank_line.match(raw)
            or numbered_bullet.match(raw)
            or paged_bullet.match(raw)
        ):
            continue

        # Any other line (long paragraph or not) ends the TOC region.
        skipping = False
        kept.append(raw)

    return "\n".join(kept), int(found_keyword)
|
|
|
|
|
|
def _t_remove_orphan_toc(text: str) -> tuple[str, int]:
    """Remove leftover table-of-contents blocks near the start of the text.

    The text is split into blocks on runs of two or more newlines. Only the
    leading blocks are scanned: a quarter of the block count, clamped to the
    range 10..40. Two shapes are dropped:

    (a) a run of 3 or more consecutive blocks matching ``_TOC_ITEM_RE``,
        together with the block just before the run when that block matches
        ``_TOC_HDR_WITH_PAGE_RE``;
    (b) a block matching ``_TOC_HDR_WITH_PAGE_RE`` whose following block is
        shorter than 300 characters and contains at least two
        "number, optional dot, whitespace, letter" occurrences; both blocks
        are dropped.

    Args:
        text: Document text.

    Returns:
        The text with the selected blocks removed and the number of blocks
        dropped. When nothing is dropped the input is returned unchanged;
        otherwise the surviving blocks are rejoined with blank lines.
    """
    chunks = re.split(r"\n{2,}", text)
    n_chunks = len(chunks)
    scan_limit = max(10, min(40, int(n_chunks * 0.25)))
    doomed: set[int] = set()

    pos = 0
    while pos < min(scan_limit, n_chunks):
        current = chunks[pos].strip()

        # (a) Run of 3+ consecutive TOC-item blocks.
        if _TOC_ITEM_RE.match(current):
            run_stop = min(scan_limit, pos + 60, n_chunks)
            run_end = pos
            while run_end < run_stop and _TOC_ITEM_RE.match(chunks[run_end].strip()):
                run_end += 1
            if run_end - pos >= 3:
                doomed.update(range(pos, run_end))
                # Also drop the preceding header when it carries a page number.
                if pos > 0 and _TOC_HDR_WITH_PAGE_RE.match(chunks[pos - 1].strip()):
                    doomed.add(pos - 1)
                pos = run_end
                continue

        # (b) Header with page number followed by a short list of numbered entries.
        if _TOC_HDR_WITH_PAGE_RE.match(current):
            following = chunks[pos + 1].strip() if pos + 1 < n_chunks else ""
            entries = re.findall(r"\d+\.?\s+[A-Za-zÀ-ú]", following)
            if len(entries) >= 2 and len(following) < 300:
                # Two hits imply a non-empty following block, so pos + 1 exists.
                doomed.update((pos, pos + 1))
                pos += 2
                continue

        pos += 1

    if not doomed:
        return text, 0

    survivors = [chunk for idx, chunk in enumerate(chunks) if idx not in doomed]
    rejoined = "\n\n".join(survivors)
    return re.sub(r"\n{3,}", "\n\n", rejoined), len(doomed)
|
|
|
|
|
|
def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
    """Rewrite lines accepted by ``_is_allcaps_line`` via ``_allcaps_to_header``.

    The text is split on ``"\\n\\n"``. A block that is a single line once
    stripped and passes ``_is_allcaps_line`` is replaced as a whole by
    ``_allcaps_to_header`` of the stripped block. Any other block is handled
    line by line: a line is converted only when it passes
    ``_is_allcaps_line`` and its stripped length exceeds 3 characters.

    Args:
        text: Document text.

    Returns:
        The rewritten text and the number of conversions performed.
    """
    promoted = 0
    rebuilt_blocks: list[str] = []

    for chunk in text.split("\n\n"):
        solo = chunk.strip()
        if "\n" not in solo and _is_allcaps_line(solo):
            rebuilt_blocks.append(_allcaps_to_header(solo))
            promoted += 1
            continue

        rows: list[str] = []
        for row in chunk.split("\n"):
            # Unlike the single-line case, the line is passed on unstripped
            # and must be longer than 3 characters once stripped.
            if _is_allcaps_line(row) and len(row.strip()) > 3:
                rows.append(_allcaps_to_header(row))
                promoted += 1
            else:
                rows.append(row)
        rebuilt_blocks.append("\n".join(rows))

    return "\n\n".join(rebuilt_blocks), promoted
|
|
|
|
|
|
def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
    """Turn numbered lines and numbered list items into ``###`` headers.

    Four multiline substitutions run in this order:

    1. ``N. text``   -> ``### N.`` followed by the text as its own paragraph.
       Skipped when the text ends with ``.`` and is longer than 40
       characters, or when ``_BIB_MARKERS_RE`` matches it.
    2. ``Na. text``  -> ``### Na.`` followed by the text (no exclusions).
    3. ``- N. text`` -> ``### N.`` followed by the text, unless
       ``_BIB_MARKERS_RE`` matches. Only run when ``has_exercises`` is false.
    4. ``- N Title...`` (text starting with an uppercase letter or ``'``)
       -> ``### N. Title``; when a lowercase-to-uppercase word boundary is
       found at offset 3 or later and at least 20 characters follow it, the
       remainder becomes a separate paragraph. Skipped when
       ``_BIB_MARKERS_RE`` matches.

    Args:
        text: Document text.
        has_exercises: When true, pass 3 is not applied.

    Returns:
        The rewritten text and the number of headers created.
    """
    converted = 0

    def _plain_number(match: re.Match) -> str:
        nonlocal converted
        body = match.group(2).strip()
        reads_like_sentence = body.endswith(".") and len(body) > 40
        if reads_like_sentence or _BIB_MARKERS_RE.search(body):
            return match.group(0)
        converted += 1
        return f"### {match.group(1)}.\n\n{body}"

    def _number_letter(match: re.Match) -> str:
        nonlocal converted
        converted += 1
        number, letter, body = match.groups()
        return f"### {number}{letter}.\n\n{body.strip()}"

    def _bullet_dotted(match: re.Match) -> str:
        nonlocal converted
        body = match.group(2).strip()
        if _BIB_MARKERS_RE.search(body):
            return match.group(0)
        converted += 1
        return f"\n\n### {match.group(1)}.\n\n{body}"

    def _bullet_titled(match: re.Match) -> str:
        nonlocal converted
        number = match.group(1)
        body = match.group(2).strip()
        if _BIB_MARKERS_RE.search(body):
            return match.group(0)
        converted += 1
        # First whitespace gap between a lowercase letter and an uppercase one.
        boundary = re.search(
            r"(?<=[a-z\xe0\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa])\s+"
            r"(?=[A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda])",
            body,
        )
        if boundary and boundary.start() >= 3:
            title = body[: boundary.start()].strip()
            remainder = body[boundary.end():].strip()
            if len(remainder) >= 20:
                return f"\n\n### {number}. {title}\n\n{remainder}"
        return f"\n\n### {number}. {body}"

    # Order matters: each pass sees the output of the previous one.
    passes = [
        (r"^(\d+)\.\s+(.+)$", _plain_number),
        (r"^(\d+)\s*([a-z])\.\s+(.+)$", _number_letter),
    ]
    if not has_exercises:
        passes.append((r"^-\s+(\d{1,3})\.\s+(.{10,})$", _bullet_dotted))
    passes.append(
        (
            r"^-\s+(\d{1,3})\s+([A-Z\xc0\xc8\xc9\xcc\xcd\xd2\xd3\xd9\xda\'L].{10,})$",
            _bullet_titled,
        )
    )

    for pattern, handler in passes:
        text = re.sub(pattern, handler, text, flags=re.MULTILINE)

    return text, converted
|
|
|
|
|
|
def _t_extract_math(text: str) -> tuple[str, int]:
    """Pipeline step that delegates to ``_extract_math_environments``.

    Args:
        text: Document text.

    Returns:
        Whatever ``_extract_math_environments`` returns for ``text``,
        passed through unchanged (annotated as text plus a count).
    """
    outcome = _extract_math_environments(text)
    return outcome
|
|
|
|
|
|
def _t_extract_articles(text: str) -> tuple[str, int]:
    """Pipeline step that delegates to ``_extract_article_headers``.

    Args:
        text: Document text.

    Returns:
        Whatever ``_extract_article_headers`` returns for ``text``,
        passed through unchanged (annotated as text plus a count).
    """
    outcome = _extract_article_headers(text)
    return outcome
|