feat: integra pipeline PDF→Markdown a 9 stadi e test suite

Porta da main la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare chunks/, step-8/, rag.py, ollama/, retrieve.py, config.py. requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 14:46:16 +02:00
parent a7b71fa508
commit ebd2a43f84
39 changed files with 3688 additions and 153 deletions
@@ -0,0 +1,68 @@
+"""Test end-to-end: pipeline completa su PDF reali da sources/."""
+import json
+import shutil
+import pytest
+from pathlib import Path
+
+from conversione._pipeline import run
+
+
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+
+
+def _sources_available(stem: str) -> bool:
+    return (PROJECT_ROOT / "sources" / f"{stem}.pdf").exists()
+
+
+@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
+def test_bitcoin_produces_clean_md(tmp_path, monkeypatch):
+    """Pipeline completa su bitcoin.pdf — verifica output strutturato."""
+    # Usa tmp_path come output per non inquinare il repo
+    out_dir = tmp_path / "conversione" / "bitcoin"
+    out_dir.mkdir(parents=True)
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")
+
+    ok = run("bitcoin", tmp_path, force=True)
+    assert ok, "La pipeline deve completare senza errori"
+
+    clean_md = out_dir / "clean.md"
+    assert clean_md.exists(), "clean.md deve essere creato"
+
+    text = clean_md.read_text(encoding="utf-8")
+    assert len(text) > 1000, "clean.md deve avere contenuto significativo"
+    assert "#" in text, "clean.md deve avere almeno un header"
+
+    report = json.loads((out_dir / "report.json").read_text(encoding="utf-8"))
+    assert report["structure"]["livello_struttura"] >= 1, "Struttura deve avere almeno livello 1"
+
+
+@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
+def test_determinism(tmp_path):
+    """Due run consecutive sullo stesso PDF producono output identico."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")
+
+    run("bitcoin", tmp_path, force=True)
+    first = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text()
+
+    run("bitcoin", tmp_path, force=True)
+    second = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text()
+
+    assert first == second, "Output deve essere deterministico tra due run"
+
+
+@pytest.mark.skipif(not _sources_available("codice_civile"), reason="sources/codice_civile.pdf non disponibile")
+def test_codice_civile_has_articles(tmp_path):
+    """Il Codice Civile deve produrre header con 'Art.'."""
+    sources_dir = tmp_path / "sources"
+    sources_dir.mkdir()
+    shutil.copy(PROJECT_ROOT / "sources" / "codice_civile.pdf", sources_dir / "codice_civile.pdf")
+
+    ok = run("codice_civile", tmp_path, force=True)
+    assert ok
+
+    text = (tmp_path / "conversione" / "codice_civile" / "clean.md").read_text()
+    assert "Art." in text, "clean.md del codice civile deve contenere articoli"
@@ -0,0 +1,40 @@
+"""Test categoria 8: riparazione automatica gerarchia rotta (todo.md Cat.8)."""
+from conversione._pipeline.stage8_normalize import normalize_hierarchy
+
+
+def test_cat8_invalid_hierarchy_auto_repaired():
+    """
+    Categoria 8 dal todo.md:
+    Input:  # A \\n\\n#### B
+    Atteso: # A \\n\\n## B   (salto riparato a max +1)
+    """
+    md_input = "# A\n\n#### B\n\nContenuto di B.\n"
+    result, stats = normalize_hierarchy(md_input)
+
+    assert "## B" in result, "#### deve diventare ## (salto +1 dal padre #)"
+    assert "#### B" not in result, "Il livello originale non deve restare"
+    assert stats["n_level_jumps_repaired"] >= 1
+
+
+def test_multiple_jumps_all_repaired():
+    """Catena di salti: # → #### → ######."""
+    md_input = "# Root\n\n#### Middle\n\nTesto\n\n###### Deep\n\nTesto\n"
+    result, stats = normalize_hierarchy(md_input)
+
+    lines = [l for l in result.split("\n") if l.startswith("#")]
+    levels = [len(l) - len(l.lstrip("#")) for l in lines]
+
+    # Verifica che non ci siano salti > 1
+    for i in range(1, len(levels)):
+        assert levels[i] <= levels[i - 1] + 1, \
+            f"Salto non riparato: {levels[i-1]} → {levels[i]}"
+
+
+def test_valid_hierarchy_not_touched():
+    """Gerarchia valida non deve essere modificata."""
+    md_valid = "# H1\n\nTesto\n\n## H2\n\nTesto\n\n### H3\n\nTesto\n"
+    result, stats = normalize_hierarchy(md_valid)
+    assert stats["n_level_jumps_repaired"] == 0
+    assert "# H1" in result
+    assert "## H2" in result
+    assert "### H3" in result