e1b5298b20
Porta da branch marker la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare il resto del progetto RAG (ollama/, step-5/, step-6/, step-8/, rag.py, retrieve.py, config.py). requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
"""Test end-to-end: pipeline completa su PDF reali da sources/."""
|
|
import json
|
|
import shutil
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from conversione._pipeline import run
|
|
|
|
|
|
# Directory two levels above the one containing this file; the reference PDFs
# used by these tests are looked up under its "sources" subdirectory.
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
|
|
|
|
|
def _sources_available(stem: str) -> bool:
    """Return True when ``sources/<stem>.pdf`` exists under PROJECT_ROOT.

    Used by the ``skipif`` markers in this module so that tests are skipped
    when the corresponding reference PDF is not present.
    """
    pdf_path = PROJECT_ROOT / "sources" / f"{stem}.pdf"
    return pdf_path.exists()
|
|
|
|
|
|
@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
def test_bitcoin_produces_clean_md(tmp_path, monkeypatch):
    """Run the full pipeline on bitcoin.pdf and check its structured output.

    The PDF is copied into a throw-away ``sources`` directory under
    ``tmp_path`` and the pipeline is pointed at ``tmp_path``, so nothing is
    written into the repository. The test then checks that ``clean.md`` exists,
    is longer than 1000 characters and contains a ``#``, and that
    ``report.json`` reports a structure level of at least 1.

    ``monkeypatch`` is requested as a fixture but is not used in the body.
    """
    pdf_name = "bitcoin.pdf"

    # Stage the input PDF inside tmp_path instead of reading from the repo.
    staged_sources = tmp_path / "sources"
    staged_sources.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / pdf_name, staged_sources / pdf_name)

    # Pre-create the output directory the assertions below read from.
    out_dir = tmp_path / "conversione" / "bitcoin"
    out_dir.mkdir(parents=True)

    succeeded = run("bitcoin", tmp_path, force=True)
    assert succeeded, "La pipeline deve completare senza errori"

    markdown_path = out_dir / "clean.md"
    assert markdown_path.exists(), "clean.md deve essere creato"

    markdown = markdown_path.read_text(encoding="utf-8")
    assert len(markdown) > 1000, "clean.md deve avere contenuto significativo"
    assert "#" in markdown, "clean.md deve avere almeno un header"

    report_path = out_dir / "report.json"
    report = json.loads(report_path.read_text(encoding="utf-8"))
    structure_level = report["structure"]["livello_struttura"]
    assert structure_level >= 1, "Struttura deve avere almeno livello 1"
|
|
|
|
|
|
@pytest.mark.skipif(not _sources_available("bitcoin"), reason="sources/bitcoin.pdf non disponibile")
def test_determinism(tmp_path):
    """Check that two consecutive runs on the same PDF yield identical output.

    bitcoin.pdf is copied into ``tmp_path / "sources"``, the pipeline is run
    twice with ``force=True``, and the ``clean.md`` produced by each run is
    compared for equality.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / "bitcoin.pdf", sources_dir / "bitcoin.pdf")

    clean_md = tmp_path / "conversione" / "bitcoin" / "clean.md"

    # Assert each run succeeded so a pipeline failure is reported as such,
    # rather than as a missing file or a comparison against stale output.
    assert run("bitcoin", tmp_path, force=True), "La prima run deve completare senza errori"
    # Explicit UTF-8 keeps decoding independent of the platform locale and
    # matches how the other tests in this module read clean.md.
    first = clean_md.read_text(encoding="utf-8")

    assert run("bitcoin", tmp_path, force=True), "La seconda run deve completare senza errori"
    second = clean_md.read_text(encoding="utf-8")

    assert first == second, "Output deve essere deterministico tra due run"
|
|
|
|
|
|
@pytest.mark.skipif(not _sources_available("codice_civile"), reason="sources/codice_civile.pdf non disponibile")
def test_codice_civile_has_articles(tmp_path):
    """Check that converting the Codice Civile yields text containing 'Art.'.

    codice_civile.pdf is copied into ``tmp_path / "sources"``, the pipeline is
    run against ``tmp_path`` with ``force=True``, and the resulting
    ``clean.md`` is searched for the substring ``"Art."``.
    """
    pdf_name = "codice_civile.pdf"
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    shutil.copy(PROJECT_ROOT / "sources" / pdf_name, sources_dir / pdf_name)

    ok = run("codice_civile", tmp_path, force=True)
    assert ok, "La pipeline deve completare senza errori"

    clean_md = tmp_path / "conversione" / "codice_civile" / "clean.md"
    # Explicit UTF-8: the document is Italian text, so relying on the
    # platform's default encoding could fail or garble accented characters.
    text = clean_md.read_text(encoding="utf-8")
    assert "Art." in text, "clean.md del codice civile deve contenere articoli"
|