"""Test end-to-end: pipeline completa su PDF reali da sources/.""" import json import shutil import pytest from pathlib import Path from conversione._pipeline import run PROJECT_ROOT = Path(__file__).parent.parent.parent def _sources_available(stem: str) -> bool: return (PROJECT_ROOT / "sources" / f"{stem}.pdf").exists() @pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile") def test_bitcoin_produces_clean_md(tmp_path, monkeypatch): """Pipeline completa su bitcoin.pdf — verifica output strutturato.""" # Usa tmp_path come output per non inquinare il repo out_dir = tmp_path / "conversione" / "bitcoin" out_dir.mkdir(parents=True) sources_dir = tmp_path / "sources" sources_dir.mkdir() shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf") ok = run("bitcoin", tmp_path, force=True) assert ok, "La pipeline deve completare senza errori" clean_md = out_dir / "clean.md" assert clean_md.exists(), "clean.md deve essere creato" text = clean_md.read_text(encoding="utf-8") assert len(text) > 1000, "clean.md deve avere contenuto significativo" assert "#" in text, "clean.md deve avere almeno un header" report = json.loads((out_dir / "report.json").read_text(encoding="utf-8")) assert report["structure"]["livello_struttura"] >= 1, "Struttura deve avere almeno livello 1" @pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile") def test_determinism(tmp_path): """Due run consecutive sullo stesso PDF producono output identico.""" sources_dir = tmp_path / "sources" sources_dir.mkdir() shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf") run("bitcoin", tmp_path, force=True) first = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text() run("bitcoin", tmp_path, force=True) second = (tmp_path / "conversione" / "bitcoin" / "clean.md").read_text() assert first == second, "Output deve essere deterministico tra due run" 
@pytest.mark.skipif(
    not _sources_available("codice_civile"),
    reason="sources/codice_civile.pdf non disponibile",
)
def test_codice_civile_has_articles(tmp_path):
    """Check that converting the Codice Civile PDF yields 'Art.' in clean.md.

    The PDF is copied to ``tmp_path/sources`` and ``run`` is pointed at
    ``tmp_path``; the output is read from
    ``tmp_path/conversione/codice_civile/clean.md``. Only the presence of
    the substring ``Art.`` is verified, not the header structure.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    shutil.copy(
        PROJECT_ROOT / "sources" / "codice_civile.pdf",
        sources_dir / "codice_civile.pdf",
    )

    ok = run("codice_civile", tmp_path, force=True)
    assert ok

    clean_md = tmp_path / "conversione" / "codice_civile" / "clean.md"
    # Decode explicitly as UTF-8 rather than the platform's default encoding,
    # so the check does not depend on the locale of the machine running it.
    text = clean_md.read_text(encoding="utf-8")
    assert "Art." in text, "clean.md del codice civile deve contenere articoli"