# rag-from-scratch/conversione/__main__.py

#!/usr/bin/env python3
"""
Pipeline PDF → clean Markdown per vettorizzazione RAG.

Uso:
    # Converti
    python conversione/ --stem <nome>
    python conversione/ --stem <nome> --force
    python conversione/                          # tutti i PDF in sources/

    # Valida
    python conversione/ validate
    python conversione/ validate <stem> [<stem> ...] --detail

Prerequisiti:
    pip install opendataloader-pdf pdfplumber
    Java 11+ sul PATH (https://adoptium.net/)
"""

import argparse
import sys
from pathlib import Path

# Rende _pipeline importabile da conversione/
sys.path.insert(0, str(Path(__file__).parent))

from _pipeline import run, validate


def _build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line parser for the conversion tool.

    The parser exposes two modes:

    * default (no subcommand): conversion, controlled by ``--stem`` and
      ``--force``;
    * ``validate``: takes zero or more positional stems plus ``--detail``.

    Returns:
        The fully configured ``argparse.ArgumentParser``; the chosen
        subcommand (or ``None``) is stored on the namespace as ``cmd``.
    """
    # Usage examples shown verbatim at the bottom of --help.
    examples = "\n".join(
        (
            "Esempi:",
            "  python conversione/ --stem manuale",
            "  python conversione/ --stem manuale --force",
            "  python conversione/ validate",
            "  python conversione/ validate manuale --detail",
        )
    )
    cli = argparse.ArgumentParser(
        prog="conversione",
        description="PDF → clean Markdown strutturato, pronto per chunking RAG",
        # Raw formatter keeps the line breaks of the examples intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples,
    )

    # "validate" subcommand and its arguments.
    commands = cli.add_subparsers(dest="cmd", metavar="comando")
    validate_cmd = commands.add_parser(
        "validate",
        help="valida i report.json prodotti dalla conversione",
        description="Legge i report.json e assegna un voto 0-100 (A/B/C/D/F).",
    )
    validate_cmd.add_argument(
        "stems", nargs="*", metavar="STEM",
        help="stem da validare. Ometti per tutti.",
    )
    validate_cmd.add_argument(
        "--detail", "-d", action="store_true",
        help="mostra il dettaglio delle penalità per ogni documento",
    )

    # Top-level options used by the default (convert) mode.
    cli.add_argument(
        "--stem", metavar="NOME",
        help="nome del PDF in sources/ (senza estensione). Ometti per tutti.",
    )
    cli.add_argument(
        "--force", action="store_true",
        help="riesegui anche se clean.md è già presente",
    )

    return cli


def main() -> None:
    """Run the command-line interface: validate reports or convert PDFs.

    With the ``validate`` subcommand, delegates to ``validate`` and returns.
    Otherwise converts either the single document named by ``--stem`` or every
    ``*.pdf`` found in ``sources/`` under the project root (the parent of this
    file's directory), then prints a summary line on stdout.

    Error diagnostics are written to stderr so that they do not mix with
    normal output when stdout is piped or redirected.

    Raises:
        SystemExit: with status 1 when ``sources/`` is missing or contains no
            PDF, or when any ``run`` result is falsy; with status 0 when all
            ``run`` results are truthy.
    """
    parser = _build_parser()
    args = parser.parse_args()
    root = Path(__file__).parent.parent

    # ── Validate ─────────────────────────────────────────────────────────
    if args.cmd == "validate":
        validate(args.stems, root, detail=args.detail)
        return

    # ── Convert (default) ────────────────────────────────────────────────
    if args.stem:
        stems = [args.stem]
    else:
        sources_dir = root / "sources"
        if not sources_dir.exists():
            print("Errore: cartella sources/ non trovata.", file=sys.stderr)
            sys.exit(1)
        # Sorted so documents are processed in a deterministic order.
        stems = sorted(p.stem for p in sources_dir.glob("*.pdf"))
        if not stems:
            print("Errore: nessun PDF trovato in sources/.", file=sys.stderr)
            sys.exit(1)

    # NOTE(review): assumes run() returns a bool (True on success), since the
    # results are summed to count successes — confirm against _pipeline.run.
    results = [run(s, root, args.force) for s in stems]
    ok = sum(results)
    total = len(results)
    all_ok = all(results)
    print(f"\n{'✅' if all_ok else '⚠️ '} {ok}/{total} documenti convertiti")
    sys.exit(0 if all_ok else 1)


# Run the CLI only when this file is executed as a script (e.g. via
# `python conversione/`), not when it is imported as a module.
if __name__ == "__main__":
    main()