feat: integra pipeline PDF→Markdown a 9 stadi e test suite

Porta da branch marker la riscrittura completa di conversione/_pipeline/
(9 stadi PyMuPDF) e la suite tests/ senza modificare il resto del progetto
RAG (ollama/, step-5/, step-6/, step-8/, rag.py, retrieve.py, config.py).

requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb,
rimuove opendataloader-pdf e pymupdf4llm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-11 14:44:16 +02:00
parent 5215f53ad0
commit e1b5298b20
38 changed files with 3691 additions and 169 deletions
View File
+96
View File
@@ -0,0 +1,96 @@
"""Fixture condivise per l'intera test suite."""
import pytest
from conversione._pipeline.models import Block, Section
@pytest.fixture
def make_block():
    """Provide a factory that builds ``Block`` test objects with sensible defaults."""

    def _make(
        text="testo di prova",
        page=1,
        font_size=12.0,
        font_name="Helvetica",
        is_bold=False,
        block_type="paragraph",
        space_before=5.0,
        bbox=(50.0, 100.0, 400.0, 114.0),
        level=0,
    ):
        # Gather every field by name, then forward them all as keywords.
        fields = {
            "text": text,
            "page": page,
            "bbox": bbox,
            "font_size": font_size,
            "font_name": font_name,
            "is_bold": is_bold,
            "block_type": block_type,
            "space_before": space_before,
            "level": level,
        }
        return Block(**fields)

    return _make
@pytest.fixture
def mock_fitz_page():
    """Dict mimicking what ``page.get_text('dict')`` returns for a single page."""

    def _text_block(text, font, size, flags, bbox, origin):
        # One block holding one line holding one span, all sharing the same bbox.
        span = {
            "text": text,
            "font": font,
            "size": size,
            "flags": flags,
            "bbox": bbox,
            "origin": origin,
            "color": 0,
        }
        line = {"bbox": bbox, "spans": [span]}
        return {"type": 0, "bbox": bbox, "lines": [line]}

    heading = _text_block(
        "1. Capitolo Primo", "Helvetica-Bold", 18.0, 16, (50, 50, 450, 70), (50, 68)
    )
    paragraph = _text_block(
        "Testo del primo paragrafo del capitolo.",
        "Helvetica",
        12.0,
        0,
        (50, 90, 500, 104),
        (50, 102),
    )
    return {"width": 595.0, "height": 842.0, "blocks": [heading, paragraph]}
@pytest.fixture
def simple_hierarchy_blocks(make_block):
    """List of Blocks forming a simple numbered H1→H2→H3 hierarchy.

    Each bold heading is immediately followed by one 12pt body paragraph.
    """
    # (heading text, heading font size, space before heading, body text)
    outline = [
        ("1. Introduzione", 18, 20.0, "Testo del paragrafo di introduzione."),
        ("1.1 Contesto", 15, 15.0, "Testo della sezione di contesto."),
        ("1.1.1 Dettaglio", 13, 10.0, "Testo del dettaglio specifico."),
        ("2. Conclusioni", 18, 20.0, "Testo conclusivo."),
    ]
    blocks = []
    for heading, size, gap, body in outline:
        blocks.append(make_block(heading, font_size=size, is_bold=True, space_before=gap))
        blocks.append(make_block(body, font_size=12))
    return blocks
@pytest.fixture
def sources_dir():
    """Return the ``sources`` directory two levels above this file, or None if absent."""
    from pathlib import Path

    candidate = Path(__file__).parent.parent / "sources"
    if candidate.exists():
        return candidate
    return None
View File
+68
View File
@@ -0,0 +1,68 @@
"""Test end-to-end: pipeline completa su PDF reali da sources/."""
import json
import shutil
import pytest
from pathlib import Path
from conversione._pipeline import run
# Directory three levels above this file; the tests below look for
# ``sources/<stem>.pdf`` underneath it.
# NOTE(review): presumably the repository root — confirm against this file's location.
PROJECT_ROOT = Path(__file__).parent.parent.parent
def _sources_available(stem: str) -> bool:
    """Tell whether ``sources/<stem>.pdf`` exists under PROJECT_ROOT."""
    pdf_path = PROJECT_ROOT / "sources" / f"{stem}.pdf"
    return pdf_path.exists()
@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
def test_bitcoin_produces_clean_md(tmp_path, monkeypatch):
    """Run the full pipeline on bitcoin.pdf and check the structured output."""
    # Everything happens under tmp_path so the repository is not polluted.
    # NOTE(review): the monkeypatch fixture is requested but never used here.
    pdf_dir = tmp_path / "sources"
    pdf_dir.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", pdf_dir / "bitcoin.pdf")

    out_dir = tmp_path / "conversione" / "bitcoin"
    out_dir.mkdir(parents=True)

    succeeded = run("bitcoin", tmp_path, force=True)
    assert succeeded, "La pipeline deve completare senza errori"

    clean_md = out_dir / "clean.md"
    assert clean_md.exists(), "clean.md deve essere creato"

    markdown = clean_md.read_text(encoding="utf-8")
    assert len(markdown) > 1000, "clean.md deve avere contenuto significativo"
    assert "#" in markdown, "clean.md deve avere almeno un header"

    report_path = out_dir / "report.json"
    report = json.loads(report_path.read_text(encoding="utf-8"))
    structure_level = report["structure"]["livello_struttura"]
    assert structure_level >= 1, "Struttura deve avere almeno livello 1"
@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
def test_determinism(tmp_path):
    """Two consecutive runs on the same PDF must produce an identical clean.md."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")
    clean_md = tmp_path / "conversione" / "bitcoin" / "clean.md"

    # Check the return value so a failed run is reported as such instead of
    # surfacing later as a missing-file error or a stale comparison.
    assert run("bitcoin", tmp_path, force=True), "La prima run deve completare senza errori"
    # Explicit UTF-8 keeps the read independent of the platform default encoding.
    first = clean_md.read_text(encoding="utf-8")

    assert run("bitcoin", tmp_path, force=True), "La seconda run deve completare senza errori"
    second = clean_md.read_text(encoding="utf-8")

    assert first == second, "Output deve essere deterministico tra due run"
@pytest.mark.skipif(not _sources_available("codice_civile"), reason="sources/codice_civile.pdf non disponibile")
def test_codice_civile_has_articles(tmp_path):
    """The Codice Civile conversion must contain 'Art.' in clean.md.

    Only the presence of the substring is checked, not that it appears in a
    header line.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / "codice_civile.pdf", sources_dir / "codice_civile.pdf")

    ok = run("codice_civile", tmp_path, force=True)
    assert ok, "La pipeline deve completare senza errori"

    clean_md = tmp_path / "conversione" / "codice_civile" / "clean.md"
    # Explicit UTF-8 keeps the read independent of the platform default encoding.
    text = clean_md.read_text(encoding="utf-8")
    assert "Art." in text, "clean.md del codice civile deve contenere articoli"
+40
View File
@@ -0,0 +1,40 @@
"""Test categoria 8: riparazione automatica gerarchia rotta (todo.md Cat.8)."""
from conversione._pipeline.stage8_normalize import normalize_hierarchy
def test_cat8_invalid_hierarchy_auto_repaired():
    """Category 8 from todo.md: a broken hierarchy is repaired automatically.

    Input:    ``# A`` followed by ``#### B``
    Expected: ``# A`` followed by ``## B`` (jump clamped to at most +1)
    """
    md_input = "# A\n\n#### B\n\nContenuto di B.\n"
    result, stats = normalize_hierarchy(md_input)

    assert "## B" in result, "#### deve diventare ## (salto +1 dal padre #)"
    # "## B" is also a substring of "### B" and "#### B"; excluding "### B"
    # (which covers every deeper level too) pins the header to exactly level 2.
    assert "### B" not in result, "B deve essere esattamente di livello 2"
    assert "#### B" not in result, "Il livello originale non deve restare"
    assert stats["n_level_jumps_repaired"] >= 1
def test_multiple_jumps_all_repaired():
    """Chain of jumps (# → #### → ######): no header may sit more than one level below the previous one."""
    md_input = "# Root\n\n#### Middle\n\nTesto\n\n###### Deep\n\nTesto\n"
    result, _stats = normalize_hierarchy(md_input)

    header_lines = [line for line in result.split("\n") if line.startswith("#")]
    # Header level = number of leading '#' characters.
    levels = [len(line) - len(line.lstrip("#")) for line in header_lines]

    # Guard against a vacuous pass: with no headers the loop below checks nothing.
    assert levels, "L'output deve contenere almeno un header"
    for previous, current in zip(levels, levels[1:]):
        # The separator keeps the two levels readable in the failure message.
        assert current <= previous + 1, f"Salto non riparato: {previous} → {current}"
def test_valid_hierarchy_not_touched():
    """A valid hierarchy must be left at its original header levels."""
    md_valid = "# H1\n\nTesto\n\n## H2\n\nTesto\n\n### H3\n\nTesto\n"
    result, stats = normalize_hierarchy(md_valid)

    assert stats["n_level_jumps_repaired"] == 0
    assert "# H1" in result
    assert "## H2" in result
    assert "### H3" in result
    # The positive checks are substring matches, so "# H1" would also match
    # "## H1". Rule out each header having been pushed one level deeper.
    assert "## H1" not in result
    assert "### H2" not in result
    assert "#### H3" not in result
View File
+47
View File
@@ -0,0 +1,47 @@
"""Test dataclass Block, Section, FontProfile."""
from conversione._pipeline.models import Block, Section, FontProfile
def test_block_creation():
    """A Block built with keywords exposes its fields, defaults and bbox accessors."""
    block = Block(
        text="Titolo",
        page=1,
        bbox=(0, 0, 100, 14),
        font_size=16.0,
        font_name="Arial-Bold",
        is_bold=True,
    )

    # Explicitly supplied fields.
    assert block.text == "Titolo"
    assert block.is_bold

    # Fields left at their defaults.
    assert block.block_type == "paragraph"
    assert block.level == 0

    # Coordinates read back through the bbox accessors.
    assert block.x0 == 0.0
    assert block.y1 == 14.0
def test_block_properties():
    """x0/y0/x1/y1 return the four bbox coordinates in order."""
    bbox = (10.0, 20.0, 110.0, 34.0)
    block = Block("x", 1, bbox, 12.0, "Helvetica", False)

    left, top, right, bottom = bbox
    assert block.x0 == left
    assert block.y0 == top
    assert block.x1 == right
    assert block.y1 == bottom
def test_section_defaults():
    """A Section built from title and level starts empty, with page_start 0."""
    section = Section(title="Intro", level=1)

    assert section.content == []
    assert section.children == []
    assert section.page_start == 0
def test_section_nesting():
    """A child appended to ``children`` is retrievable from the parent."""
    outer = Section("Parent", level=1)
    inner = Section("Child", level=2)

    outer.children.append(inner)

    assert len(outer.children) == 1
    first_child = outer.children[0]
    assert first_child.title == "Child"
def test_font_profile():
    """FontProfile stores body size, cluster map and header sizes as given."""
    size_to_level = {18.0: 1, 15.0: 2}
    sizes = [18.0, 15.0]

    profile = FontProfile(body_size=11.0, cluster_map=size_to_level, header_sizes=sizes)

    assert profile.body_size == 11.0
    assert profile.cluster_map[18.0] == 1
    assert len(profile.header_sizes) == 2
+44
View File
@@ -0,0 +1,44 @@
"""Test Stage 3: font analysis."""
from conversione._pipeline.models import Block
from conversione._pipeline.stage3_font import build_font_profile
def _make_block(font_size, n=1):
    """Build ``n`` non-bold Helvetica Blocks of ``font_size``, stacked in 14-unit rows on page 1."""
    row_height = 14.0
    blocks = []
    for idx in range(n):
        top = idx * row_height
        bottom = (idx + 1) * row_height
        blocks.append(
            Block(f"testo {idx}", 1, (0, top, 100, bottom), font_size, "Helvetica", False)
        )
    return blocks
def test_body_size_is_most_frequent():
    """With 20 blocks at 12pt versus a few larger ones, body_size is 12."""
    body = _make_block(12.0, 20)
    large = _make_block(18.0, 2)
    medium = _make_block(15.0, 3)

    profile = build_font_profile(body + large + medium)

    assert profile.body_size == 12.0
def test_header_sizes_above_body():
    """Every reported header size must be strictly larger than the body size."""
    body = _make_block(12.0, 20)
    large = _make_block(18.0, 2)
    medium = _make_block(15.0, 3)

    profile = build_font_profile(body + large + medium)

    oversized = [size > profile.body_size for size in profile.header_sizes]
    assert all(oversized)
def test_cluster_map_levels():
    """The first entry of header_sizes must map to level 1 in cluster_map."""
    samples = (
        _make_block(12.0, 20)
        + _make_block(24.0, 2)
        + _make_block(18.0, 3)
        + _make_block(14.0, 4)
    )
    profile = build_font_profile(samples)

    # NOTE(review): when header_sizes is empty nothing is asserted, so the
    # test passes vacuously — confirm that this is intended.
    if not profile.header_sizes:
        return
    # The largest size is expected to come first and to get level 1.
    top_size = profile.header_sizes[0]
    assert profile.cluster_map[top_size] == 1
def test_empty_blocks():
    """With no blocks the profile reports body size 11.0 and no header sizes."""
    no_blocks = []

    profile = build_font_profile(no_blocks)

    assert profile.body_size == 11.0
    assert profile.header_sizes == []
def test_single_font_size():
    """A document with one font size has that body size and no header clusters."""
    uniform = _make_block(11.0, 50)

    profile = build_font_profile(uniform)

    assert profile.body_size == 11.0
    assert profile.header_sizes == []
    assert profile.cluster_map == {}
+52
View File
@@ -0,0 +1,52 @@
"""Test Stage 4: header detection — segnali combinati."""
import pytest
from conversione._pipeline.models import Block, FontProfile
from conversione._pipeline.stage4_headers import classify_blocks
def _profile(body=12.0):
    """FontProfile with the given body size and header clusters 18pt→1, 15pt→2."""
    levels_by_size = {18.0: 1, 15.0: 2}
    return FontProfile(
        body_size=body,
        cluster_map=levels_by_size,
        header_sizes=[18.0, 15.0],
    )
def _block(text, font_size=12.0, is_bold=False, space_before=5.0, block_type="paragraph"):
    """Build a page-1 Helvetica Block with a fixed bbox and the given attributes."""
    page = 1
    bbox = (50, 100, 400, 114)
    return Block(
        text,
        page,
        bbox,
        font_size,
        "Helvetica",
        is_bold,
        block_type=block_type,
        space_before=space_before,
    )
def test_numbered_large_bold_short_becomes_header():
    """Numbered, large, bold, short text with wide spacing becomes a header candidate."""
    # Every positive signal is present at once.
    candidate = _block("1. Introduzione", font_size=18, is_bold=True, space_before=30.0)

    classify_blocks([candidate], _profile())

    assert candidate.block_type == "header_candidate"
def test_body_text_stays_paragraph():
    """A long body-size sentence keeps the paragraph type."""
    sentence = "Questo è un lungo paragrafo di testo normale che non deve diventare un header."
    body = _block(sentence, font_size=12)

    classify_blocks([body], _profile())

    assert body.block_type == "paragraph"
def test_bold_body_text_not_header():
    """Bold text at body size with a long sentence must not become a header."""
    # Per the original author's note, the bold signal requires size > body + 0.5,
    # so bold alone at body size is not enough.
    emphasized = _block(
        "Testo importante in grassetto nel corpo del documento.",
        font_size=12,
        is_bold=True,
    )

    classify_blocks([emphasized], _profile())

    assert emphasized.block_type == "paragraph"
def test_article_forced_header():
    """Text starting with "Art. N" is a header candidate even at body size."""
    article = _block("Art. 1423. Nullità del contratto.", font_size=12)

    classify_blocks([article], _profile())

    assert article.block_type == "header_candidate"
def test_table_preserved():
    """A block already typed as a table keeps that type after classification."""
    table = _block("Colonna A | Colonna B", font_size=12, block_type="table")

    classify_blocks([table], _profile())

    assert table.block_type == "table"
def test_list_item_detection():
    """Text beginning with "- " is classified as a list item."""
    bullet = _block("- primo elemento della lista", font_size=12)

    classify_blocks([bullet], _profile())

    assert bullet.block_type == "list_item"
+95
View File
@@ -0,0 +1,95 @@
"""Test Stage 5: hierarchy inference — numerazione, TOC, font fallback."""
from conversione._pipeline.models import Block, FontProfile
from conversione._pipeline.stage5_hierarchy import infer_hierarchy, _level_from_numbering
def _profile():
    """FontProfile with 12pt body and header clusters 18pt→1, 15pt→2, 13pt→3."""
    levels_by_size = {18.0: 1, 15.0: 2, 13.0: 3}
    return FontProfile(
        body_size=12.0,
        cluster_map=levels_by_size,
        header_sizes=[18.0, 15.0, 13.0],
    )
def _hblock(text, font_size=18.0, is_bold=True):
    """Build a page-1 Block and mark it as a header candidate."""
    header = Block(text, 1, (50, 100, 400, 114), font_size, "Helvetica-Bold", is_bold)
    # The type is set after construction instead of through the constructor.
    header.block_type = "header_candidate"
    return header
def _pblock(text):
    """Build a page-1, 12pt, non-bold Block typed as a paragraph."""
    paragraph = Block(text, 1, (50, 120, 400, 134), 12.0, "Helvetica", False)
    paragraph.block_type = "paragraph"
    return paragraph
# ── Test _level_from_numbering ────────────────────────────────────────────────
def test_numbering_level1():
    """A single-number prefix ("1.") maps to level 1."""
    level = _level_from_numbering("1. Titolo")
    assert level == 1
def test_numbering_level2():
    """A two-part prefix ("1.2") maps to level 2."""
    level = _level_from_numbering("1.2 Sottotitolo")
    assert level == 2
def test_numbering_level3():
    """A three-part prefix ("1.2.3") maps to level 3."""
    level = _level_from_numbering("1.2.3 Dettaglio")
    assert level == 3
def test_numbering_deep_capped_at_3():
    """A four-part prefix ("1.2.3.4") is capped at level 3."""
    level = _level_from_numbering("1.2.3.4 Troppo profondo")
    assert level == 3
def test_numbering_no_match():
    """Text with no numeric prefix yields level 0."""
    level = _level_from_numbering("Testo senza numero")
    assert level == 0
# ── Test infer_hierarchy con numerazione ─────────────────────────────────────
def test_numbered_sections_get_correct_levels():
    """Numbered headers get levels matching the depth of their numbering."""
    body = "Testo."
    blocks = [
        _hblock("1. Introduzione", font_size=18),
        _pblock(body),
        _hblock("1.1 Contesto", font_size=15),
        _pblock(body),
        _hblock("1.1.1 Dettaglio", font_size=13),
        _pblock(body),
        _hblock("2. Conclusioni", font_size=18),
    ]

    result = infer_hierarchy(blocks, _profile(), toc=[])

    headers = [blk for blk in result if blk.block_type == "header_candidate"]
    # Expected in order: "1." → 1, "1.1" → 2, "1.1.1" → 3, "2." → 1.
    observed = [hdr.level for hdr in headers[:4]]
    assert observed == [1, 2, 3, 1]
# ── Test infer_hierarchy con TOC ─────────────────────────────────────────────
def test_toc_alignment():
    """Headers whose text matches a TOC entry take that entry's level."""
    # NOTE(review): entries look like [level, title, page] — confirm against
    # what infer_hierarchy expects.
    toc = [
        [1, "Introduzione", 1],
        [2, "Contesto storico", 3],
        [1, "Conclusioni", 10],
    ]
    blocks = [
        _hblock("Introduzione", font_size=14),
        _hblock("Contesto storico", font_size=13),
        _hblock("Conclusioni", font_size=14),
    ]

    result = infer_hierarchy(blocks, _profile(), toc=toc)

    headers = [blk for blk in result if blk.block_type == "header_candidate"]
    observed = [hdr.level for hdr in headers[:3]]
    assert observed == [1, 2, 1]
# ── Test infer_hierarchy con font fallback ────────────────────────────────────
def test_font_fallback_no_numbering_no_toc():
    """With no numbering and no TOC, levels come from the font-size cluster map."""
    blocks = [
        _hblock("Capitolo Grande", font_size=18),
        _pblock("Testo."),
        _hblock("Sezione Media", font_size=15),
        _pblock("Testo."),
    ]

    result = infer_hierarchy(blocks, _profile(), toc=[])

    headers = [blk for blk in result if blk.block_type == "header_candidate"]
    # 18pt → cluster level 1, 15pt → cluster level 2.
    observed = [hdr.level for hdr in headers[:2]]
    assert observed == [1, 2]
def test_empty_cluster_map_defaults_to_2():
    """With an empty cluster map an un-numbered header falls back to level 2."""
    no_clusters = FontProfile(body_size=12.0, cluster_map={}, header_sizes=[])
    lone_header = _hblock("Titolo qualsiasi", font_size=18)

    result = infer_hierarchy([lone_header], no_clusters, toc=[])

    first = result[0]
    assert first.level == 2
+98
View File
@@ -0,0 +1,98 @@
"""Test Stage 6: document tree reconstruction."""
import pytest
from conversione._pipeline.models import Block, Section
from conversione._pipeline.stage6_tree import build_tree
def _hblock(text, level, page=1):
    """Build a bold 16pt Block marked as a header candidate at ``level``."""
    header = Block(text, page, (50, 100, 400, 114), 16.0, "Helvetica-Bold", True)
    # Type and level are assigned after construction.
    header.block_type = "header_candidate"
    header.level = level
    return header
def _pblock(text, page=1):
    """Build a 12pt, non-bold Block typed as a paragraph on ``page``."""
    paragraph = Block(text, page, (50, 120, 400, 134), 12.0, "Helvetica", False)
    paragraph.block_type = "paragraph"
    return paragraph
def test_simple_hierarchy():
    """H1 with a paragraph, then H2 with a paragraph: H2 nests under H1."""
    sequence = [_hblock("H1", 1), _pblock("p1"), _hblock("H2", 2), _pblock("p2")]

    roots = build_tree(sequence)

    assert len(roots) == 1
    top = roots[0]
    assert top.title == "H1"
    assert top.level == 1
    assert len(top.content) == 1
    assert top.content[0].text == "p1"
    assert len(top.children) == 1

    nested = top.children[0]
    assert nested.title == "H2"
    assert len(nested.content) == 1
def test_two_siblings():
    """Two level-1 headers become two separate roots, in document order."""
    sequence = [
        _hblock("Cap 1", 1),
        _pblock("testo 1"),
        _hblock("Cap 2", 1),
        _pblock("testo 2"),
    ]

    roots = build_tree(sequence)

    assert len(roots) == 2
    first, second = roots[0], roots[1]
    assert first.title == "Cap 1"
    assert second.title == "Cap 2"
def test_pre_header_text_gets_implicit_section():
    """Text before the first header is wrapped in an untitled level-0 section."""
    intro = _pblock("Testo introduttivo prima del primo header.")
    first_header = _hblock("Primo header", 1)

    roots = build_tree([intro, first_header])

    # The implicit section (level=0) is the root: it holds the pre-header text
    # and the first header becomes its child.
    assert len(roots) == 1
    implicit = roots[0]
    assert implicit.title == ""
    assert implicit.level == 0
    assert len(implicit.content) == 1
    assert len(implicit.children) == 1
    only_child = implicit.children[0]
    assert only_child.title == "Primo header"
def test_deep_nesting():
    """H1 → H2 → H3 nest one inside the other; the paragraph lands in H3."""
    sequence = [
        _hblock("H1", 1),
        _hblock("H2", 2),
        _hblock("H3", 3),
        _pblock("testo profondo"),
    ]

    roots = build_tree(sequence)

    assert len(roots) == 1
    level_one = roots[0]
    assert len(level_one.children) == 1
    level_two = level_one.children[0]
    assert len(level_two.children) == 1
    level_three = level_two.children[0]
    assert len(level_three.content) == 1
def test_ignore_blocks_skipped():
    """Blocks typed "ignore" are left out of the section content."""
    ignored = Block("superscript", 1, (0, 0, 10, 10), 8.0, "Helvetica", False, block_type="ignore")
    sequence = [_hblock("Titolo", 1), ignored, _pblock("paragrafo")]

    roots = build_tree(sequence)

    section = roots[0]
    # The ignore block must not appear among the section's content.
    kept_types = [blk.block_type for blk in section.content]
    assert "ignore" not in kept_types
    assert len(section.content) == 1
+62
View File
@@ -0,0 +1,62 @@
"""Test Stage 7: serializzazione Markdown."""
from conversione._pipeline.models import Block, Section
from conversione._pipeline.stage7_markdown import serialize_tree, _table_to_markdown
def _section(title, level, texts=None, children=None):
    """Build a Section whose content is one paragraph Block per entry in ``texts``."""
    paragraphs = [
        Block(txt, 1, (0, 0, 100, 14), 12.0, "Helvetica", False, block_type="paragraph")
        for txt in (texts or [])
    ]
    return Section(title=title, level=level, content=paragraphs, children=children or [])
def test_h1_header():
    """A level-1 section is emitted with a single '#' and its body text."""
    intro = _section("Introduzione", 1, ["Testo."])

    markdown = serialize_tree([intro], {})

    assert "# Introduzione" in markdown
    assert "Testo." in markdown
def test_h2_nested():
    """A level-2 child is emitted with '##' beneath its level-1 parent."""
    subsection = _section("Sezione 1.1", 2, ["Contenuto della sezione."])
    chapter = _section("Capitolo 1", 1, [], [subsection])

    markdown = serialize_tree([chapter], {})

    assert "# Capitolo 1" in markdown
    assert "## Sezione 1.1" in markdown
    assert "Contenuto della sezione." in markdown
def test_implicit_section_no_hash():
    """An implicit level-0 section is serialized without a '#' header line."""
    implicit = Section(title="", level=0)
    opening = Block("Testo iniziale.", 1, (0, 0, 100, 14), 12.0, "Helvetica", False)
    implicit.content.append(opening)

    markdown = serialize_tree([implicit], {})

    # No header marker at the very start of the output.
    assert not markdown.startswith("#")
    assert "Testo iniziale." in markdown
def test_ignore_blocks_not_serialized():
    """Content blocks typed "ignore" do not appear in the Markdown output."""
    section = Section("Titolo", 1)
    skipped = Block("superscript", 1, (0, 0, 10, 10), 8.0, "Helvetica", False, block_type="ignore")
    kept = Block("Paragrafo valido.", 1, (0, 0, 100, 14), 12.0, "Helvetica", False, block_type="paragraph")
    section.content.append(skipped)
    section.content.append(kept)

    markdown = serialize_tree([section], {})

    assert "superscript" not in markdown
    assert "Paragrafo valido." in markdown
def test_table_to_markdown():
    """A list-of-rows table becomes a pipe table with a separator row."""
    header_row = ["Nome", "Età"]
    data_rows = [["Alice", "30"], ["Bob", "25"]]

    markdown = _table_to_markdown([header_row] + data_rows)

    assert "| Nome | Età |" in markdown
    assert "| --- | --- |" in markdown
    assert "| Alice | 30 |" in markdown
def test_no_excessive_blank_lines():
    """The output never contains three consecutive newlines."""
    section = _section("A", 1, ["p1", "p2", "p3"])

    markdown = serialize_tree([section], {})

    triple_newline = "\n\n\n"
    assert triple_newline not in markdown
+49
View File
@@ -0,0 +1,49 @@
"""Test Stage 8: normalizzazione gerarchia Markdown."""
from conversione._pipeline.stage8_normalize import normalize_hierarchy
def test_level_jump_repaired():
    """A jump from '#' to '####' is repaired to '##' and counted once."""
    md = "# A\n\n#### B\n\nTesto\n"
    result, stats = normalize_hierarchy(md)

    assert "## B" in result
    # "## B" is also a substring of "### B" and "#### B"; excluding "### B"
    # (which covers every deeper level too) pins B to exactly level 2.
    assert "### B" not in result
    assert "#### B" not in result
    assert stats["n_level_jumps_repaired"] == 1
def test_valid_hierarchy_unchanged():
    """A valid '#', '##', '###' chain keeps its levels and reports no repairs."""
    md = "# A\n\n## B\n\nTesto\n\n### C\n\nTesto\n"
    result, stats = normalize_hierarchy(md)

    assert "# A" in result
    assert "## B" in result
    assert "### C" in result
    # The positive checks are substring matches, so "# A" would also match
    # "## A". Rule out each header having been pushed one level deeper.
    assert "## A" not in result
    assert "### B" not in result
    assert "#### C" not in result
    assert stats["n_level_jumps_repaired"] == 0
def test_empty_header_removed():
    """A header with no content before its next sibling is dropped and counted."""
    source = "# Titolo\n\n## Vuoto\n\n## Con contenuto\n\nTesto.\n"

    normalized, counters = normalize_hierarchy(source)

    assert "## Vuoto" not in normalized
    assert "## Con contenuto" in normalized
    assert counters["n_empty_headers_removed"] == 1
def test_duplicate_consecutive_header_collapsed():
    """Two identical consecutive headers collapse into one and are counted."""
    source = "# Titolo\n\n# Titolo\n\nTesto.\n"

    normalized, counters = normalize_hierarchy(source)

    occurrences = normalized.count("# Titolo")
    assert occurrences == 1
    assert counters["n_duplicate_headers_removed"] == 1
def test_multiple_jumps():
    """Two successive jumps ('#'→'###'→'#####') are both repaired to +1 steps."""
    md = "# A\n\n### B\n\nTesto B\n\n##### C\n\nTesto C\n"
    result, stats = normalize_hierarchy(md)

    assert stats["n_level_jumps_repaired"] == 2
    assert "## B" in result
    assert "### C" in result
    # The positive checks are substring matches ("## B" also matches "### B"),
    # so exclude the next-deeper variants to pin the exact levels.
    assert "### B" not in result
    assert "#### C" not in result
def test_no_false_positives():
    """A clean two-level document triggers no jump repairs and no header removals."""
    source = "# A\n\nTesto.\n\n## B\n\nTesto.\n"

    _normalized, counters = normalize_hierarchy(source)

    assert counters["n_level_jumps_repaired"] == 0
    assert counters["n_empty_headers_removed"] == 0
+36
View File
@@ -0,0 +1,36 @@
"""Test Stage 9: validazione strutturale Markdown."""
from conversione._pipeline.stage9_validate import validate_markdown
def test_valid_document():
    """A well-formed document validates with no errors."""
    document = "# Titolo\n\nTesto.\n\n## Sezione\n\nContenuto.\n"

    outcome = validate_markdown(document)

    assert outcome.is_valid
    assert not outcome.errors
def test_level_jump_detected():
    """A '#' → '###' jump makes the document invalid, with an error naming the jump."""
    document = "# A\n\n### B\n\nTesto.\n"

    outcome = validate_markdown(document)

    assert not outcome.is_valid
    # At least one error must mention "salto" or "livello", case-insensitively.
    lowered = [message.lower() for message in outcome.errors]
    assert any("salto" in message or "livello" in message for message in lowered)
def test_no_headers_warning():
    """A document with no headers produces a warning about missing structure."""
    document = "Testo senza nessun header.\n\nAltro paragrafo.\n"

    outcome = validate_markdown(document)

    # At least one warning must mention "header" or "strutturato", case-insensitively.
    lowered = [message.lower() for message in outcome.warnings]
    assert any("header" in message or "strutturato" in message for message in lowered)
def test_inconsistent_table_warning():
    """A table row with a different column count triggers a table warning."""
    document = "# Titolo\n\nTesto.\n\n| A | B |\n|---|---|\n| 1 | 2 | 3 |\n"

    outcome = validate_markdown(document)

    # At least one warning must mention "tabelle" or "colonne", case-insensitively.
    lowered = [message.lower() for message in outcome.warnings]
    assert any("tabelle" in message or "colonne" in message for message in lowered)
def test_to_dict():
    """to_dict() exposes the 'valid', 'errors' and 'warnings' keys."""
    document = "# A\n\nTesto.\n"

    as_dict = validate_markdown(document).to_dict()

    for key in ("valid", "errors", "warnings"):
        assert key in as_dict