# rag-from-scratch/conversione/_pipeline/converter.py

from pathlib import Path


def _is_tagged_pdf(pdf_path: Path) -> bool:
    try:
        import fitz
        doc = fitz.open(str(pdf_path))
        tagged = "StructTreeRoot" in doc.pdf_catalog()
        doc.close()
        return tagged
    except Exception:
        return False


def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
    """Convert a PDF to Markdown with opendataloader-pdf and return the file.

    The output directory is created if needed, the converter writes its
    result there, and the produced ``.md`` file is located and sanity-checked.

    Converter options, chosen by the original author for RAG-friendly output:
      - keep_line_breaks=False   -> flowing text, no PDF hard wraps
      - reading_order="xycut"    -> fixes multi-column reading order (XY-Cut++)
      - sanitize=False           -> keeps the original text
      - image_output="off"       -> no images extracted or referenced
      - table_method="cluster"   -> detects tables without visible borders
      - content_safety_off       -> avoids filtering footnotes (tiny) and OCG layers
      - use_struct_tree          -> enabled only when the PDF is tagged

    Args:
        pdf_path: Path of the source PDF.
        out_dir: Directory that receives the Markdown output.

    Returns:
        Path of the Markdown file found in ``out_dir``.

    Raises:
        RuntimeError: If no ``.md`` file exists in ``out_dir`` after the
            conversion, or if its stripped content is shorter than 100
            characters.
    """
    import opendataloader_pdf

    out_dir.mkdir(parents=True, exist_ok=True)
    tagged = _is_tagged_pdf(pdf_path)

    opendataloader_pdf.convert(
        input_path=str(pdf_path),
        output_dir=str(out_dir),
        format="markdown",
        keep_line_breaks=False,
        reading_order="xycut",
        sanitize=False,
        image_output="off",
        table_method="cluster",
        content_safety_off=["tiny", "hidden-ocg"],
        use_struct_tree=tagged,
        quiet=True,
    )

    md_file = out_dir / f"{pdf_path.stem}.md"
    if not md_file.exists():
        # The converter did not use the expected name: fall back to whatever
        # Markdown is present in the output directory.
        candidates = list(out_dir.glob("*.md"))
        if not candidates:
            raise RuntimeError(f"Nessun file .md prodotto in {out_dir}")
        # Directory listing order is arbitrary, so taking the first entry
        # could return a leftover from an earlier conversion. The file just
        # written by the converter is the most recently modified one.
        md_file = max(candidates, key=lambda path: path.stat().st_mtime)

    # errors="replace" keeps the length check from failing on stray bytes.
    content = md_file.read_text(encoding="utf-8", errors="replace").strip()
    if len(content) < 100:
        raise RuntimeError(
            f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) "
            f"— il PDF potrebbe essere corrotto o non supportato"
        )

    return md_file