feat: integra pipeline PDF→Markdown a 9 stadi e test suite

Porta da main la riscrittura completa di conversione/_pipeline/ (9 stadi basati su PyMuPDF) e la suite tests/, senza modificare chunks/, step-8/, rag.py, ollama/, retrieve.py, config.py.

requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 14:46:16 +02:00
parent a7b71fa508
commit ebd2a43f84
39 changed files with 3688 additions and 153 deletions
@@ -0,0 +1,47 @@
+"""Test dataclass Block, Section, FontProfile."""
+from conversione._pipeline.models import Block, Section, FontProfile
+
+
+def test_block_creation():
+    b = Block(
+        text="Titolo", page=1,
+        bbox=(0, 0, 100, 14),
+        font_size=16.0, font_name="Arial-Bold",
+        is_bold=True,
+    )
+    assert b.text == "Titolo"
+    assert b.is_bold
+    assert b.block_type == "paragraph"
+    assert b.level == 0
+    assert b.x0 == 0.0
+    assert b.y1 == 14.0
+
+
+def test_block_properties():
+    b = Block("x", 1, (10.0, 20.0, 110.0, 34.0), 12.0, "Helvetica", False)
+    assert b.x0 == 10.0
+    assert b.y0 == 20.0
+    assert b.x1 == 110.0
+    assert b.y1 == 34.0
+
+
+def test_section_defaults():
+    s = Section(title="Intro", level=1)
+    assert s.content == []
+    assert s.children == []
+    assert s.page_start == 0
+
+
+def test_section_nesting():
+    parent = Section("Parent", level=1)
+    child  = Section("Child", level=2)
+    parent.children.append(child)
+    assert len(parent.children) == 1
+    assert parent.children[0].title == "Child"
+
+
+def test_font_profile():
+    fp = FontProfile(body_size=11.0, cluster_map={18.0: 1, 15.0: 2}, header_sizes=[18.0, 15.0])
+    assert fp.body_size == 11.0
+    assert fp.cluster_map[18.0] == 1
+    assert len(fp.header_sizes) == 2
@@ -0,0 +1,44 @@
+"""Test Stage 3: font analysis."""
+from conversione._pipeline.models import Block
+from conversione._pipeline.stage3_font import build_font_profile
+
+
+def _make_block(font_size, n=1):
+    return [
+        Block(f"testo {i}", 1, (0, i*14.0, 100, (i+1)*14.0), font_size, "Helvetica", False)
+        for i in range(n)
+    ]
+
+
+def test_body_size_is_most_frequent():
+    blocks = _make_block(12.0, 20) + _make_block(18.0, 2) + _make_block(15.0, 3)
+    profile = build_font_profile(blocks)
+    assert profile.body_size == 12.0
+
+
+def test_header_sizes_above_body():
+    blocks = _make_block(12.0, 20) + _make_block(18.0, 2) + _make_block(15.0, 3)
+    profile = build_font_profile(blocks)
+    assert all(s > profile.body_size for s in profile.header_sizes)
+
+
+def test_cluster_map_levels():
+    blocks = _make_block(12.0, 20) + _make_block(24.0, 2) + _make_block(18.0, 3) + _make_block(14.0, 4)
+    profile = build_font_profile(blocks)
+    # Taglia più grande deve avere livello 1
+    if profile.header_sizes:
+        assert profile.cluster_map[profile.header_sizes[0]] == 1
+
+
+def test_empty_blocks():
+    profile = build_font_profile([])
+    assert profile.body_size == 11.0
+    assert profile.header_sizes == []
+
+
+def test_single_font_size():
+    blocks = _make_block(11.0, 50)
+    profile = build_font_profile(blocks)
+    assert profile.body_size == 11.0
+    assert profile.header_sizes == []
+    assert profile.cluster_map == {}
@@ -0,0 +1,52 @@
+"""Test Stage 4: header detection — segnali combinati."""
+import pytest
+from conversione._pipeline.models import Block, FontProfile
+from conversione._pipeline.stage4_headers import classify_blocks
+
+
+def _profile(body=12.0):
+    return FontProfile(body_size=body, cluster_map={18.0: 1, 15.0: 2}, header_sizes=[18.0, 15.0])
+
+
+def _block(text, font_size=12.0, is_bold=False, space_before=5.0, block_type="paragraph"):
+    return Block(text, 1, (50, 100, 400, 114), font_size, "Helvetica", is_bold,
+                 block_type=block_type, space_before=space_before)
+
+
+def test_numbered_large_bold_short_becomes_header():
+    # Tutti i segnali positivi
+    b = _block("1. Introduzione", font_size=18, is_bold=True, space_before=30.0)
+    classify_blocks([b], _profile())
+    assert b.block_type == "header_candidate"
+
+
+def test_body_text_stays_paragraph():
+    b = _block("Questo è un lungo paragrafo di testo normale che non deve diventare un header.", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "paragraph"
+
+
+def test_bold_body_text_not_header():
+    # Bold ma stesso size del corpo e testo lungo → NON header (bold_signal richiede size > body+0.5)
+    b = _block("Testo importante in grassetto nel corpo del documento.", font_size=12, is_bold=True)
+    classify_blocks([b], _profile())
+    assert b.block_type == "paragraph"
+
+
+def test_article_forced_header():
+    # "Art. N" → sempre header candidate
+    b = _block("Art. 1423. Nullità del contratto.", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "header_candidate"
+
+
+def test_table_preserved():
+    b = _block("Colonna A | Colonna B", font_size=12, block_type="table")
+    classify_blocks([b], _profile())
+    assert b.block_type == "table"
+
+
+def test_list_item_detection():
+    b = _block("- primo elemento della lista", font_size=12)
+    classify_blocks([b], _profile())
+    assert b.block_type == "list_item"
@@ -0,0 +1,95 @@
+"""Test Stage 5: hierarchy inference — numerazione, TOC, font fallback."""
+from conversione._pipeline.models import Block, FontProfile
+from conversione._pipeline.stage5_hierarchy import infer_hierarchy, _level_from_numbering
+
+
+def _profile():
+    return FontProfile(body_size=12.0, cluster_map={18.0: 1, 15.0: 2, 13.0: 3}, header_sizes=[18.0, 15.0, 13.0])
+
+
+def _hblock(text, font_size=18.0, is_bold=True):
+    b = Block(text, 1, (50, 100, 400, 114), font_size, "Helvetica-Bold", is_bold)
+    b.block_type = "header_candidate"
+    return b
+
+
+def _pblock(text):
+    b = Block(text, 1, (50, 120, 400, 134), 12.0, "Helvetica", False)
+    b.block_type = "paragraph"
+    return b
+
+
+# ── Test _level_from_numbering ────────────────────────────────────────────────
+
+def test_numbering_level1():
+    assert _level_from_numbering("1. Titolo") == 1
+
+def test_numbering_level2():
+    assert _level_from_numbering("1.2 Sottotitolo") == 2
+
+def test_numbering_level3():
+    assert _level_from_numbering("1.2.3 Dettaglio") == 3
+
+def test_numbering_deep_capped_at_3():
+    assert _level_from_numbering("1.2.3.4 Troppo profondo") == 3
+
+def test_numbering_no_match():
+    assert _level_from_numbering("Testo senza numero") == 0
+
+
+# ── Test infer_hierarchy con numerazione ─────────────────────────────────────
+
+def test_numbered_sections_get_correct_levels():
+    blocks = [
+        _hblock("1. Introduzione", font_size=18),
+        _pblock("Testo."),
+        _hblock("1.1 Contesto", font_size=15),
+        _pblock("Testo."),
+        _hblock("1.1.1 Dettaglio", font_size=13),
+        _pblock("Testo."),
+        _hblock("2. Conclusioni", font_size=18),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=[])
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1  # "1."
+    assert headers[1].level == 2  # "1.1"
+    assert headers[2].level == 3  # "1.1.1"
+    assert headers[3].level == 1  # "2."
+
+
+# ── Test infer_hierarchy con TOC ─────────────────────────────────────────────
+
+def test_toc_alignment():
+    toc = [[1, "Introduzione", 1], [2, "Contesto storico", 3], [1, "Conclusioni", 10]]
+    blocks = [
+        _hblock("Introduzione", font_size=14),
+        _hblock("Contesto storico", font_size=13),
+        _hblock("Conclusioni", font_size=14),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=toc)
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1
+    assert headers[1].level == 2
+    assert headers[2].level == 1
+
+
+# ── Test infer_hierarchy con font fallback ────────────────────────────────────
+
+def test_font_fallback_no_numbering_no_toc():
+    blocks = [
+        _hblock("Capitolo Grande", font_size=18),
+        _pblock("Testo."),
+        _hblock("Sezione Media", font_size=15),
+        _pblock("Testo."),
+    ]
+    result = infer_hierarchy(blocks, _profile(), toc=[])
+    headers = [b for b in result if b.block_type == "header_candidate"]
+    assert headers[0].level == 1  # 18pt → cluster level 1
+    assert headers[1].level == 2  # 15pt → cluster level 2
+
+
+def test_empty_cluster_map_defaults_to_2():
+    profile_empty = FontProfile(body_size=12.0, cluster_map={}, header_sizes=[])
+    blocks = [_hblock("Titolo qualsiasi", font_size=18)]
+    result = infer_hierarchy(blocks, profile_empty, toc=[])
+    assert result[0].level == 2
@@ -0,0 +1,98 @@
+"""Test Stage 6: document tree reconstruction."""
+import pytest
+from conversione._pipeline.models import Block, Section
+from conversione._pipeline.stage6_tree import build_tree
+
+
+def _hblock(text, level, page=1):
+    b = Block(text, page, (50, 100, 400, 114), 16.0, "Helvetica-Bold", True)
+    b.block_type = "header_candidate"
+    b.level = level
+    return b
+
+
+def _pblock(text, page=1):
+    b = Block(text, page, (50, 120, 400, 134), 12.0, "Helvetica", False)
+    b.block_type = "paragraph"
+    return b
+
+
+def test_simple_hierarchy():
+    blocks = [
+        _hblock("H1", 1),
+        _pblock("p1"),
+        _hblock("H2", 2),
+        _pblock("p2"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 1
+    h1 = roots[0]
+    assert h1.title == "H1"
+    assert h1.level == 1
+    assert len(h1.content) == 1
+    assert h1.content[0].text == "p1"
+    assert len(h1.children) == 1
+    h2 = h1.children[0]
+    assert h2.title == "H2"
+    assert len(h2.content) == 1
+
+
+def test_two_siblings():
+    blocks = [
+        _hblock("Cap 1", 1),
+        _pblock("testo 1"),
+        _hblock("Cap 2", 1),
+        _pblock("testo 2"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 2
+    assert roots[0].title == "Cap 1"
+    assert roots[1].title == "Cap 2"
+
+
+def test_pre_header_text_gets_implicit_section():
+    blocks = [
+        _pblock("Testo introduttivo prima del primo header."),
+        _hblock("Primo header", 1),
+    ]
+    roots = build_tree(blocks)
+    # La sezione implicita (level=0) è la radice; contiene il testo pre-header
+    # e il primo header diventa suo figlio.
+    assert len(roots) == 1
+    implicit = roots[0]
+    assert implicit.title == ""
+    assert implicit.level == 0
+    assert len(implicit.content) == 1
+    assert len(implicit.children) == 1
+    assert implicit.children[0].title == "Primo header"
+
+
+def test_deep_nesting():
+    blocks = [
+        _hblock("H1", 1),
+        _hblock("H2", 2),
+        _hblock("H3", 3),
+        _pblock("testo profondo"),
+    ]
+    roots = build_tree(blocks)
+    assert len(roots) == 1
+    h1 = roots[0]
+    assert len(h1.children) == 1
+    h2 = h1.children[0]
+    assert len(h2.children) == 1
+    h3 = h2.children[0]
+    assert len(h3.content) == 1
+
+
+def test_ignore_blocks_skipped():
+    b_ignore = Block("superscript", 1, (0,0,10,10), 8.0, "Helvetica", False, block_type="ignore")
+    blocks = [
+        _hblock("Titolo", 1),
+        b_ignore,
+        _pblock("paragrafo"),
+    ]
+    roots = build_tree(blocks)
+    h1 = roots[0]
+    # Il blocco ignore non deve essere nel content
+    assert all(b.block_type != "ignore" for b in h1.content)
+    assert len(h1.content) == 1
@@ -0,0 +1,62 @@
+"""Test Stage 7: serializzazione Markdown."""
+from conversione._pipeline.models import Block, Section
+from conversione._pipeline.stage7_markdown import serialize_tree, _table_to_markdown
+
+
+def _section(title, level, texts=None, children=None):
+    blocks = []
+    for t in (texts or []):
+        b = Block(t, 1, (0,0,100,14), 12.0, "Helvetica", False, block_type="paragraph")
+        blocks.append(b)
+    s = Section(title=title, level=level, content=blocks, children=children or [])
+    return s
+
+
+def test_h1_header():
+    roots = [_section("Introduzione", 1, ["Testo."])]
+    md = serialize_tree(roots, {})
+    assert "# Introduzione" in md
+    assert "Testo." in md
+
+
+def test_h2_nested():
+    child = _section("Sezione 1.1", 2, ["Contenuto della sezione."])
+    root  = _section("Capitolo 1", 1, [], [child])
+    md = serialize_tree([root], {})
+    assert "# Capitolo 1" in md
+    assert "## Sezione 1.1" in md
+    assert "Contenuto della sezione." in md
+
+
+def test_implicit_section_no_hash():
+    # Sezione implicita level=0 → nessun # header
+    s = Section(title="", level=0)
+    b = Block("Testo iniziale.", 1, (0,0,100,14), 12.0, "Helvetica", False)
+    s.content.append(b)
+    md = serialize_tree([s], {})
+    assert not md.startswith("#")
+    assert "Testo iniziale." in md
+
+
+def test_ignore_blocks_not_serialized():
+    s = Section("Titolo", 1)
+    b_ignore = Block("superscript", 1, (0,0,10,10), 8.0, "Helvetica", False, block_type="ignore")
+    b_para   = Block("Paragrafo valido.", 1, (0,0,100,14), 12.0, "Helvetica", False, block_type="paragraph")
+    s.content.extend([b_ignore, b_para])
+    md = serialize_tree([s], {})
+    assert "superscript" not in md
+    assert "Paragrafo valido." in md
+
+
+def test_table_to_markdown():
+    table = [["Nome", "Età"], ["Alice", "30"], ["Bob", "25"]]
+    md = _table_to_markdown(table)
+    assert "| Nome | Età |" in md
+    assert "| --- | --- |" in md
+    assert "| Alice | 30 |" in md
+
+
+def test_no_excessive_blank_lines():
+    roots = [_section("A", 1, ["p1", "p2", "p3"])]
+    md = serialize_tree(roots, {})
+    assert "\n\n\n" not in md
@@ -0,0 +1,49 @@
+"""Test Stage 8: normalizzazione gerarchia Markdown."""
+from conversione._pipeline.stage8_normalize import normalize_hierarchy
+
+
+def test_level_jump_repaired():
+    md = "# A\n\n#### B\n\nTesto\n"
+    result, stats = normalize_hierarchy(md)
+    assert "## B" in result
+    assert "#### B" not in result
+    assert stats["n_level_jumps_repaired"] == 1
+
+
+def test_valid_hierarchy_unchanged():
+    md = "# A\n\n## B\n\nTesto\n\n### C\n\nTesto\n"
+    result, stats = normalize_hierarchy(md)
+    assert "# A" in result
+    assert "## B" in result
+    assert "### C" in result
+    assert stats["n_level_jumps_repaired"] == 0
+
+
+def test_empty_header_removed():
+    md = "# Titolo\n\n## Vuoto\n\n## Con contenuto\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert "## Vuoto" not in result
+    assert "## Con contenuto" in result
+    assert stats["n_empty_headers_removed"] == 1
+
+
+def test_duplicate_consecutive_header_collapsed():
+    md = "# Titolo\n\n# Titolo\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert result.count("# Titolo") == 1
+    assert stats["n_duplicate_headers_removed"] == 1
+
+
+def test_multiple_jumps():
+    md = "# A\n\n### B\n\nTesto B\n\n##### C\n\nTesto C\n"
+    result, stats = normalize_hierarchy(md)
+    assert stats["n_level_jumps_repaired"] == 2
+    assert "## B" in result
+    assert "### C" in result
+
+
+def test_no_false_positives():
+    md = "# A\n\nTesto.\n\n## B\n\nTesto.\n"
+    result, stats = normalize_hierarchy(md)
+    assert stats["n_level_jumps_repaired"] == 0
+    assert stats["n_empty_headers_removed"] == 0
@@ -0,0 +1,36 @@
+"""Test Stage 9: validazione strutturale Markdown."""
+from conversione._pipeline.stage9_validate import validate_markdown
+
+
+def test_valid_document():
+    md = "# Titolo\n\nTesto.\n\n## Sezione\n\nContenuto.\n"
+    result = validate_markdown(md)
+    assert result.is_valid
+    assert not result.errors
+
+
+def test_level_jump_detected():
+    md = "# A\n\n### B\n\nTesto.\n"
+    result = validate_markdown(md)
+    assert not result.is_valid
+    assert any("salto" in e.lower() or "livello" in e.lower() for e in result.errors)
+
+
+def test_no_headers_warning():
+    md = "Testo senza nessun header.\n\nAltro paragrafo.\n"
+    result = validate_markdown(md)
+    assert any("header" in w.lower() or "strutturato" in w.lower() for w in result.warnings)
+
+
+def test_inconsistent_table_warning():
+    md = "# Titolo\n\nTesto.\n\n| A | B |\n|---|---|\n| 1 | 2 | 3 |\n"
+    result = validate_markdown(md)
+    assert any("tabelle" in w.lower() or "colonne" in w.lower() for w in result.warnings)
+
+
+def test_to_dict():
+    md = "# A\n\nTesto.\n"
+    d = validate_markdown(md).to_dict()
+    assert "valid" in d
+    assert "errors" in d
+    assert "warnings" in d