feat: integra pipeline PDF→Markdown a 9 stadi e test suite

Porta dal branch marker la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare il resto del progetto RAG (ollama/, step-5/, step-6/, step-8/, rag.py, retrieve.py, config.py).

requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 14:44:16 +02:00
parent 5215f53ad0
commit e1b5298b20
38 changed files with 3691 additions and 169 deletions
@@ -0,0 +1,96 @@
+"""Fixture condivise per l'intera test suite."""
+import pytest
+from conversione._pipeline.models import Block, Section
+
+
+@pytest.fixture
+def make_block():
+    """Factory per Block di test con valori di default ragionevoli."""
+    def _make(
+        text="testo di prova",
+        page=1,
+        font_size=12.0,
+        font_name="Helvetica",
+        is_bold=False,
+        block_type="paragraph",
+        space_before=5.0,
+        bbox=(50.0, 100.0, 400.0, 114.0),
+        level=0,
+    ):
+        return Block(
+            text=text,
+            page=page,
+            bbox=bbox,
+            font_size=font_size,
+            font_name=font_name,
+            is_bold=is_bold,
+            block_type=block_type,
+            space_before=space_before,
+            level=level,
+        )
+    return _make
+
+
+@pytest.fixture
+def mock_fitz_page():
+    """Dizionario che simula l'output di page.get_text('dict') per una pagina."""
+    return {
+        "width": 595.0,
+        "height": 842.0,
+        "blocks": [
+            {
+                "type": 0,
+                "bbox": (50, 50, 450, 70),
+                "lines": [{
+                    "bbox": (50, 50, 450, 70),
+                    "spans": [{
+                        "text": "1. Capitolo Primo",
+                        "font": "Helvetica-Bold",
+                        "size": 18.0,
+                        "flags": 16,
+                        "bbox": (50, 50, 450, 70),
+                        "origin": (50, 68),
+                        "color": 0,
+                    }],
+                }],
+            },
+            {
+                "type": 0,
+                "bbox": (50, 90, 500, 104),
+                "lines": [{
+                    "bbox": (50, 90, 500, 104),
+                    "spans": [{
+                        "text": "Testo del primo paragrafo del capitolo.",
+                        "font": "Helvetica",
+                        "size": 12.0,
+                        "flags": 0,
+                        "bbox": (50, 90, 500, 104),
+                        "origin": (50, 102),
+                        "color": 0,
+                    }],
+                }],
+            },
+        ],
+    }
+
+
+@pytest.fixture
+def simple_hierarchy_blocks(make_block):
+    """Lista di Block con gerarchia semplice H1→H2→H3 numerata."""
+    return [
+        make_block("1. Introduzione", font_size=18, is_bold=True, space_before=20.0),
+        make_block("Testo del paragrafo di introduzione.", font_size=12),
+        make_block("1.1 Contesto", font_size=15, is_bold=True, space_before=15.0),
+        make_block("Testo della sezione di contesto.", font_size=12),
+        make_block("1.1.1 Dettaglio", font_size=13, is_bold=True, space_before=10.0),
+        make_block("Testo del dettaglio specifico.", font_size=12),
+        make_block("2. Conclusioni", font_size=18, is_bold=True, space_before=20.0),
+        make_block("Testo conclusivo.", font_size=12),
+    ]
+
+
+@pytest.fixture
+def sources_dir():
+    from pathlib import Path
+    d = Path(__file__).parent.parent / "sources"
+    return d if d.exists() else None
@@ -0,0 +1,68 @@
+"""Test end-to-end: pipeline completa su PDF reali da sources/."""
+import json
+import shutil
+import pytest
+from pathlib import Path
+
+from conversione._pipeline import run
+
+
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+
+
+def _sources_available(stem: str) -> bool:
+    return (PROJECT_ROOT / "sources" / f"{stem}.pdf").exists()
+
+
+@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
+def test_bitcoin_produces_clean_md(tmp_path):
+    """Run the full pipeline on bitcoin.pdf and check the structured output."""
+    # Use tmp_path as the project root so the run does not pollute the repo.
+    out_dir = tmp_path / "conversione" / "bitcoin"
+    out_dir.mkdir(parents=True)
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")
+
+    ok = run("bitcoin", tmp_path, force=True)
+    assert ok, "La pipeline deve completare senza errori"
+
+    clean_md = out_dir / "clean.md"
+    assert clean_md.exists(), "clean.md deve essere creato"
+
+    text = clean_md.read_text(encoding="utf-8")
+    assert len(text) > 1000, "clean.md deve avere contenuto significativo"
+    assert any(line.startswith("#") for line in text.splitlines()), "clean.md deve avere almeno un header"
+
+    report = json.loads((out_dir / "report.json").read_text(encoding="utf-8"))
+    assert report["structure"]["livello_struttura"] >= 1, "Struttura deve avere almeno livello 1"
+
+
+@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
+def test_determinism(tmp_path):
+    """Two consecutive runs on the same PDF must produce identical output."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")
+    # Decode clean.md as UTF-8 explicitly: the locale default differs across platforms.
+    assert run("bitcoin", tmp_path, force=True), "La prima run deve completare senza errori"
+    first = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text(encoding="utf-8")
+
+    assert run("bitcoin", tmp_path, force=True), "La seconda run deve completare senza errori"
+    second = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text(encoding="utf-8")
+
+    assert first == second, "Output deve essere deterministico tra due run"
+
+
+@pytest.mark.skipif(not _sources_available("codice_civile"), reason="sources/codice_civile.pdf non disponibile")
+def test_codice_civile_has_articles(tmp_path):
+    """The Civil Code PDF must produce output containing 'Art.' entries."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "codice_civile.pdf", sources_dir / "codice_civile.pdf")
+
+    ok = run("codice_civile", tmp_path, force=True)
+    assert ok
+    # Decode as UTF-8 explicitly: the locale default differs across platforms.
+    text = (tmp_path / "conversione" / "codice_civile" / "clean.md").read_text(encoding="utf-8")
+    assert "Art." in text, "clean.md del codice civile deve contenere articoli"
@@ -0,0 +1,40 @@
+"""Test categoria 8: riparazione automatica gerarchia rotta (todo.md Cat.8)."""
+from conversione._pipeline.stage8_normalize import normalize_hierarchy
+
+
+def test_cat8_invalid_hierarchy_auto_repaired():
+    """
+    Categoria 8 dal todo.md:
+    Input:  # A \\n\\n#### B
+    Atteso: # A \\n\\n## B   (salto riparato a max +1)
+    """
+    md_input = "# A\n\n#### B\n\nContenuto di B.\n"
+    result, stats = normalize_hierarchy(md_input)
+
+    assert "## B" in result, "#### deve diventare ## (salto +1 dal padre #)"
+    assert "#### B" not in result, "Il livello originale non deve restare"
+    assert stats["n_level_jumps_repaired"] >= 1
+
+
+def test_multiple_jumps_all_repaired():
+    """Chain of level jumps (# → #### → ######) must all be repaired."""
+    md_input = "# Root\n\n#### Middle\n\nTesto\n\n###### Deep\n\nTesto\n"
+    result, _stats = normalize_hierarchy(md_input)
+
+    headers = [line for line in result.split("\n") if line.startswith("#")]
+    levels = [len(h) - len(h.lstrip("#")) for h in headers]
+    assert levels, "L'output normalizzato deve contenere almeno un header"
+    # A header may sit at most one level deeper than the header before it.
+    for prev, cur in zip(levels, levels[1:]):
+        assert cur <= prev + 1, \
+            f"Salto non riparato: {prev} → {cur}"
+
+
+def test_valid_hierarchy_not_touched():
+    """Gerarchia valida non deve essere modificata."""
+    md_valid = "# H1\n\nTesto\n\n## H2\n\nTesto\n\n### H3\n\nTesto\n"
+    result, stats = normalize_hierarchy(md_valid)
+    assert stats["n_level_jumps_repaired"] == 0
+    assert "# H1" in result
+    assert "## H2" in result
+    assert "### H3" in result
@@ -0,0 +1,47 @@
+"""Test dataclass Block, Section, FontProfile."""
+from conversione._pipeline.models import Block, Section, FontProfile
+
+
+def test_block_creation():
+    b = Block(
+        text="Titolo", page=1,
+        bbox=(0, 0, 100, 14),
+        font_size=16.0, font_name="Arial-Bold",
+        is_bold=True,
+    )
+    assert b.text == "Titolo"
+    assert b.is_bold
+    assert b.block_type == "paragraph"
+    assert b.level == 0
+    assert b.x0 == 0.0
+    assert b.y1 == 14.0
+
+
+def test_block_properties():
+    b = Block("x", 1, (10.0, 20.0, 110.0, 34.0), 12.0, "Helvetica", False)
+    assert b.x0 == 10.0
+    assert b.y0 == 20.0
+    assert b.x1 == 110.0
+    assert b.y1 == 34.0
+
+
+def test_section_defaults():
+    s = Section(title="Intro", level=1)
+    assert s.content == []
+    assert s.children == []
+    assert s.page_start == 0
+
+
+def test_section_nesting():
+    parent = Section("Parent", level=1)
+    child  = Section("Child", level=2)
+    parent.children.append(child)
+    assert len(parent.children) == 1
+    assert parent.children[0].title == "Child"
+
+
+def test_font_profile():
+    fp = FontProfile(body_size=11.0, cluster_map={18.0: 1, 15.0: 2}, header_sizes=[18.0, 15.0])
+    assert fp.body_size == 11.0
+    assert fp.cluster_map[18.0] == 1
+    assert len(fp.header_sizes) == 2
@@ -0,0 +1,44 @@
+"""Test Stage 3: font analysis."""
+from conversione._pipeline.models import Block
+from conversione._pipeline.stage3_font import build_font_profile
+
+
+def _make_block(font_size, n=1):
+    return [
+        Block(f"testo {i}", 1, (0, i*14.0, 100, (i+1)*14.0), font_size, "Helvetica", False)
+        for i in range(n)
+    ]
+
+
+def test_body_size_is_most_frequent():
+    blocks = _make_block(12.0, 20) + _make_block(18.0, 2) + _make_block(15.0, 3)
+    profile = build_font_profile(blocks)
+    assert profile.body_size == 12.0
+
+
+def test_header_sizes_above_body():
+    blocks = _make_block(12.0, 20) + _make_block(18.0, 2) + _make_block(15.0, 3)
+    profile = build_font_profile(blocks)
+    assert all(s > profile.body_size for s in profile.header_sizes)
+
+
+def test_cluster_map_levels():
+    blocks = _make_block(12.0, 20) + _make_block(24.0, 2) + _make_block(18.0, 3) + _make_block(14.0, 4)
+    profile = build_font_profile(blocks)
+    # Taglia più grande deve avere livello 1
+    if profile.header_sizes:
+        assert profile.cluster_map[profile.header_sizes[0]] == 1
+
+
+def test_empty_blocks():
+    profile = build_font_profile([])
+    assert profile.body_size == 11.0
+    assert profile.header_sizes == []
+
+
+def test_single_font_size():
+    blocks = _make_block(11.0, 50)
+    profile = build_font_profile(blocks)
+    assert profile.body_size == 11.0
+    assert profile.header_sizes == []
+    assert profile.cluster_map == {}
@@ -0,0 +1,52 @@
+"""Test Stage 4: header detection — segnali combinati."""
+import pytest
+from conversione._pipeline.models import Block, FontProfile
+from conversione._pipeline.stage4_headers import classify_blocks
+
+
+def _profile(body=12.0):
+    return FontProfile(body_size=body, cluster_map={18.0: 1, 15.0: 2}, header_sizes=[18.0, 15.0])
+
+
+def _block(text, font_size=12.0, is_bold=False, space_before=5.0, block_type="paragraph"):
+    return Block(text, 1, (50, 100, 400, 114), font_size, "Helvetica", is_bold,
+                 block_type=block_type, space_before=space_before)
+
+
+def test_numbered_large_bold_short_becomes_header():
+    # Tutti i segnali positivi
+    b = _block("1. Introduzione", font_size=18, is_bold=True, space_before=30.0)
+    classify_blocks([b], _profile())
+    assert b.block_type == "header_candidate"
+
+
+def test_body_text_stays_paragraph():
+    b = _block("Questo è un lungo paragrafo di testo normale che non deve diventare un header.", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "paragraph"
+
+
+def test_bold_body_text_not_header():
+    # Bold ma stesso size del corpo e testo lungo → NON header (bold_signal richiede size > body+0.5)
+    b = _block("Testo importante in grassetto nel corpo del documento.", font_size=12, is_bold=True)
+    classify_blocks([b], _profile())
+    assert b.block_type == "paragraph"
+
+
+def test_article_forced_header():
+    # "Art. N" → sempre header candidate
+    b = _block("Art. 1423. Nullità del contratto.", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "header_candidate"
+
+
+def test_table_preserved():
+    b = _block("Colonna A | Colonna B", font_size=12, block_type="table")
+    classify_blocks([b], _profile())
+    assert b.block_type == "table"
+
+
+def test_list_item_detection():
+    b = _block("- primo elemento della lista", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "list_item"
@@ -0,0 +1,95 @@
+"""Test Stage 5: hierarchy inference — numerazione, TOC, font fallback."""
+from conversione._pipeline.models import Block, FontProfile
+from conversione._pipeline.stage5_hierarchy import infer_hierarchy, _level_from_numbering
+
+
+def _profile():
+    return FontProfile(body_size=12.0, cluster_map={18.0: 1, 15.0: 2, 13.0: 3}, header_sizes=[18.0, 15.0, 13.0])
+
+
+def _hblock(text, font_size=18.0, is_bold=True):
+    b = Block(text, 1, (50, 100, 400, 114), font_size, "Helvetica-Bold", is_bold)
+    b.block_type = "header_candidate"
+    return b
+
+
+def _pblock(text):
+    b = Block(text, 1, (50, 120, 400, 134), 12.0, "Helvetica", False)
+    b.block_type = "paragraph"
+    return b
+
+
+# ── Test _level_from_numbering ────────────────────────────────────────────────
+
+def test_numbering_level1():
+    assert _level_from_numbering("1. Titolo") == 1
+
+def test_numbering_level2():
+    assert _level_from_numbering("1.2 Sottotitolo") == 2
+
+def test_numbering_level3():
+    assert _level_from_numbering("1.2.3 Dettaglio") == 3
+
+def test_numbering_deep_capped_at_3():
+    assert _level_from_numbering("1.2.3.4 Troppo profondo") == 3
+
+def test_numbering_no_match():
+    assert _level_from_numbering("Testo senza numero") == 0
+
+
+# ── Test infer_hierarchy con numerazione ─────────────────────────────────────
+
+def test_numbered_sections_get_correct_levels():
+    blocks = [
+        _hblock("1. Introduzione", font_size=18),
+        _pblock("Testo."),
+        _hblock("1.1 Contesto", font_size=15),
+        _pblock("Testo."),
+        _hblock("1.1.1 Dettaglio", font_size=13),
+        _pblock("Testo."),
+        _hblock("2. Conclusioni", font_size=18),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=[])
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1  # "1."
+    assert headers[1].level == 2  # "1.1"
+    assert headers[2].level == 3  # "1.1.1"
+    assert headers[3].level == 1  # "2."
+
+
+# ── Test infer_hierarchy con TOC ─────────────────────────────────────────────
+
+def test_toc_alignment():
+    toc = [[1, "Introduzione", 1], [2, "Contesto storico", 3], [1, "Conclusioni", 10]]
+    blocks = [
+        _hblock("Introduzione", font_size=14),
+        _hblock("Contesto storico", font_size=13),
+        _hblock("Conclusioni", font_size=14),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=toc)
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1
+    assert headers[1].level == 2
+    assert headers[2].level == 1
+
+
+# ── Test infer_hierarchy con font fallback ────────────────────────────────────
+
+def test_font_fallback_no_numbering_no_toc():
+    blocks = [
+        _hblock("Capitolo Grande", font_size=18),
+        _pblock("Testo."),
+        _hblock("Sezione Media", font_size=15),
+        _pblock("Testo."),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=[])
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1  # 18pt → cluster level 1
+    assert headers[1].level == 2  # 15pt → cluster level 2
+
+
+def test_empty_cluster_map_defaults_to_2():
+    profile_empty = FontProfile(body_size=12.0, cluster_map={}, header_sizes=[])
+    blocks = [_hblock("Titolo qualsiasi", font_size=18)]
+    result = infer_hierarchy(blocks, profile_empty, toc=[])
+    assert result[0].level == 2
@@ -0,0 +1,98 @@
+"""Test Stage 6: document tree reconstruction."""
+import pytest
+from conversione._pipeline.models import Block, Section
+from conversione._pipeline.stage6_tree import build_tree
+
+
+def _hblock(text, level, page=1):
+    b = Block(text, page, (50, 100, 400, 114), 16.0, "Helvetica-Bold", True)
+    b.block_type = "header_candidate"
+    b.level = level
+    return b
+
+
+def _pblock(text, page=1):
+    b = Block(text, page, (50, 120, 400, 134), 12.0, "Helvetica", False)
+    b.block_type = "paragraph"
+    return b
+
+
+def test_simple_hierarchy():
+    blocks = [
+        _hblock("H1", 1),
+        _pblock("p1"),
+        _hblock("H2", 2),
+        _pblock("p2"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 1
+    h1 = roots[0]
+    assert h1.title == "H1"
+    assert h1.level == 1
+    assert len(h1.content) == 1
+    assert h1.content[0].text == "p1"
+    assert len(h1.children) == 1
+    h2 = h1.children[0]
+    assert h2.title == "H2"
+    assert len(h2.content) == 1
+
+
+def test_two_siblings():
+    blocks = [
+        _hblock("Cap 1", 1),
+        _pblock("testo 1"),
+        _hblock("Cap 2", 1),
+        _pblock("testo 2"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 2
+    assert roots[0].title == "Cap 1"
+    assert roots[1].title == "Cap 2"
+
+
+def test_pre_header_text_gets_implicit_section():
+    blocks = [
+        _pblock("Testo introduttivo prima del primo header."),
+        _hblock("Primo header", 1),
+    ]
+    roots = build_tree(blocks)
+    # La sezione implicita (level=0) è la radice; contiene il testo pre-header
+    # e il primo header diventa suo figlio.
+    assert len(roots) == 1
+    implicit = roots[0]
+    assert implicit.title == ""
+    assert implicit.level == 0
+    assert len(implicit.content) == 1
+    assert len(implicit.children) == 1
+    assert implicit.children[0].title == "Primo header"
+
+
+def test_deep_nesting():
+    blocks = [
+        _hblock("H1", 1),
+        _hblock("H2", 2),
+        _hblock("H3", 3),
+        _pblock("testo profondo"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 1
+    h1 = roots[0]
+    assert len(h1.children) == 1
+    h2 = h1.children[0]
+    assert len(h2.children) == 1
+    h3 = h2.children[0]
+    assert len(h3.content) == 1
+
+
+def test_ignore_blocks_skipped():
+    b_ignore = Block("superscript", 1, (0,0,10,10), 8.0, "Helvetica", False, block_type="ignore")
+    blocks = [
+        _hblock("Titolo", 1),
+        b_ignore,
+        _pblock("paragrafo"),
+    ]
+    roots = build_tree(blocks)
+    h1 = roots[0]
+    # Il blocco ignore non deve essere nel content
+    assert all(b.block_type != "ignore" for b in h1.content)
+    assert len(h1.content) == 1
@@ -0,0 +1,62 @@
+"""Test Stage 7: serializzazione Markdown."""
+from conversione._pipeline.models import Block, Section
+from conversione._pipeline.stage7_markdown import serialize_tree, _table_to_markdown
+
+
+def _section(title, level, texts=None, children=None):
+    blocks = []
+    for t in (texts or []):
+        b = Block(t, 1, (0,0,100,14), 12.0, "Helvetica", False, block_type="paragraph")
+        blocks.append(b)
+    s = Section(title=title, level=level, content=blocks, children=children or [])
+    return s
+
+
+def test_h1_header():
+    roots = [_section("Introduzione", 1, ["Testo."])]
+    md = serialize_tree(roots, {})
+    assert "# Introduzione" in md
+    assert "Testo." in md
+
+
+def test_h2_nested():
+    child = _section("Sezione 1.1", 2, ["Contenuto della sezione."])
+    root  = _section("Capitolo 1", 1, [], [child])
+    md = serialize_tree([root], {})
+    assert "# Capitolo 1" in md
+    assert "## Sezione 1.1" in md
+    assert "Contenuto della sezione." in md
+
+
+def test_implicit_section_no_hash():
+    # Sezione implicita level=0 → nessun # header
+    s = Section(title="", level=0)
+    b = Block("Testo iniziale.", 1, (0,0,100,14), 12.0, "Helvetica", False)
+    s.content.append(b)
+    md = serialize_tree([s], {})
+    assert not md.startswith("#")
+    assert "Testo iniziale." in md
+
+
+def test_ignore_blocks_not_serialized():
+    s = Section("Titolo", 1)
+    b_ignore = Block("superscript", 1, (0,0,10,10), 8.0, "Helvetica", False, block_type="ignore")
+    b_para   = Block("Paragrafo valido.", 1, (0,0,100,14), 12.0, "Helvetica", False, block_type="paragraph")
+    s.content.extend([b_ignore, b_para])
+    md = serialize_tree([s], {})
+    assert "superscript" not in md
+    assert "Paragrafo valido." in md
+
+
+def test_table_to_markdown():
+    table = [["Nome", "Età"], ["Alice", "30"], ["Bob", "25"]]
+    md = _table_to_markdown(table)
+    assert "| Nome | Età |" in md
+    assert "| --- | --- |" in md
+    assert "| Alice | 30 |" in md
+
+
+def test_no_excessive_blank_lines():
+    roots = [_section("A", 1, ["p1", "p2", "p3"])]
+    md = serialize_tree(roots, {})
+    assert "\n\n\n" not in md
@@ -0,0 +1,49 @@
+"""Test Stage 8: normalizzazione gerarchia Markdown."""
+from conversione._pipeline.stage8_normalize import normalize_hierarchy
+
+
+def test_level_jump_repaired():
+    md = "# A\n\n#### B\n\nTesto\n"
+    result, stats = normalize_hierarchy(md)
+    assert "## B" in result
+    assert "#### B" not in result
+    assert stats["n_level_jumps_repaired"] == 1
+
+
+def test_valid_hierarchy_unchanged():
+    md = "# A\n\n## B\n\nTesto\n\n### C\n\nTesto\n"
+    result, stats = normalize_hierarchy(md)
+    assert "# A" in result
+    assert "## B" in result
+    assert "### C" in result
+    assert stats["n_level_jumps_repaired"] == 0
+
+
+def test_empty_header_removed():
+    md = "# Titolo\n\n## Vuoto\n\n## Con contenuto\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert "## Vuoto" not in result
+    assert "## Con contenuto" in result
+    assert stats["n_empty_headers_removed"] == 1
+
+
+def test_duplicate_consecutive_header_collapsed():
+    md = "# Titolo\n\n# Titolo\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert result.count("# Titolo") == 1
+    assert stats["n_duplicate_headers_removed"] == 1
+
+
+def test_multiple_jumps():
+    md = "# A\n\n### B\n\nTesto B\n\n##### C\n\nTesto C\n"
+    result, stats = normalize_hierarchy(md)
+    assert stats["n_level_jumps_repaired"] == 2
+    assert "## B" in result
+    assert "### C" in result
+
+
+def test_no_false_positives():
+    md = "# A\n\nTesto.\n\n## B\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert stats["n_level_jumps_repaired"] == 0
+    assert stats["n_empty_headers_removed"] == 0
@@ -0,0 +1,36 @@
+"""Test Stage 9: validazione strutturale Markdown."""
+from conversione._pipeline.stage9_validate import validate_markdown
+
+
+def test_valid_document():
+    md = "# Titolo\n\nTesto.\n\n## Sezione\n\nContenuto.\n"
+    result = validate_markdown(md)
+    assert result.is_valid
+    assert not result.errors
+
+
+def test_level_jump_detected():
+    md = "# A\n\n### B\n\nTesto.\n"
+    result = validate_markdown(md)
+    assert not result.is_valid
+    assert any("salto" in e.lower() or "livello" in e.lower() for e in result.errors)
+
+
+def test_no_headers_warning():
+    md = "Testo senza nessun header.\n\nAltro paragrafo.\n"
+    result = validate_markdown(md)
+    assert any("header" in w.lower() or "strutturato" in w.lower() for w in result.warnings)
+
+
+def test_inconsistent_table_warning():
+    md = "# Titolo\n\nTesto.\n\n| A | B |\n|---|---|\n| 1 | 2 | 3 |\n"
+    result = validate_markdown(md)
+    assert any("tabelle" in w.lower() or "colonne" in w.lower() for w in result.warnings)
+
+
+def test_to_dict():
+    md = "# A\n\nTesto.\n"
+    d = validate_markdown(md).to_dict()
+    assert "valid" in d
+    assert "errors" in d
+    assert "warnings" in d