feat(step-7,8): leggi modello da config.py, allinea EMBED_MODELS al README

- step-8/ingest.py: rimuove EMBED_MODEL e OLLAMA_URL hardcoded; li importa da step-9/config.py (fonte di verità unica)
- step-7/check_env.py: aggiorna EMBED_MODELS con tutti i modelli del README (aggiunge qwen3-embedding, nomic-embed-text-v2-moe, paraphrase-multilingual); mostra il modello configurato in config.py e verifica proprio quello, non un qualsiasi modello embedding
- step-8/README.md: creato
2026-04-14 18:22:05 +02:00
parent f62b5bc871
commit 6594033673
3 changed files with 145 additions and 37 deletions
@@ -18,11 +18,36 @@ Uso:
 import shutil
 import subprocess
 import sys
+from pathlib import Path


-# Modelli riconosciuti come embedding (basta che uno sia presente)
-# Tutto il resto viene considerato LLM
-EMBED_MODELS = ["bge-m3", "nomic-embed-text", "mxbai-embed-large", "all-minilm"]
+# ─── Lista canonica di modelli embedding supportati ───────────────────────────
+# Ordine: prima scelta → ultima scelta (come da README step-7)
+EMBED_MODELS = [
+    "qwen3-embedding",
+    "nomic-embed-text-v2-moe",
+    "bge-m3",
+    "nomic-embed-text",
+    "mxbai-embed-large",
+    "paraphrase-multilingual",
+    "all-minilm",
+]
+
+
+def _is_embed(model_name: str) -> bool:
+    """True se il modello è riconosciuto come embedding (lista canonica o keyword)."""
+    base = model_name.split(":")[0].lower()
+    return any(base == e or base.startswith(e) for e in EMBED_MODELS) or "embed" in base
+
+
+# ─── Modelli configurati in step-9/config.py ─────────────────────────────────
+# Per spostare config.py alla root: cambia solo la riga qui sotto.
+sys.path.insert(0, str(Path(__file__).parent.parent / "step-9"))
+try:
+    from config import EMBED_MODEL as CONFIGURED_EMBED, OLLAMA_MODEL as CONFIGURED_LLM
+except Exception:
+    CONFIGURED_EMBED = None
+    CONFIGURED_LLM = None

 REQUIRED_LIBS = ["chromadb"]

@@ -83,25 +108,39 @@ def _match(model_name: str, available: list[str]) -> str | None:


 def check_embed_model(available: list[str]) -> bool:
-    """Verifica che almeno un modello di embedding sia presente."""
-    for candidate in EMBED_MODELS:
-        found = _match(candidate, available)
+    """Verifica che il modello di embedding configurato sia disponibile."""
+    if CONFIGURED_EMBED:
+        print(f"   modello configurato (step-9/config.py): {CONFIGURED_EMBED}")
+        found = _match(CONFIGURED_EMBED, available)
        if found:
-            print(f"✅ modello embedding trovato: {found}")
+            print(f"✅ embedding disponibile: {found}")
            return True
+        print(f"❌ {CONFIGURED_EMBED} non trovato in Ollama")
+        print(f"   → ollama pull {CONFIGURED_EMBED}")
+        return False
+    # fallback: config.py non leggibile
+    found = next((m for m in available if _is_embed(m)), None)
+    if found:
+        print(f"✅ modello embedding trovato: {found}")
+        return True
    print("❌ nessun modello di embedding trovato")
-    print(f"   → Consigliato per italiano: ollama pull bge-m3")
-    print(f"   → Alternativa leggera:      ollama pull nomic-embed-text")
+    print(f"   → Prima scelta: ollama pull qwen3-embedding:0.6b")
    return False


 def check_llm_model(available: list[str]) -> bool:
-    """Verifica che almeno un modello non-embedding sia presente."""
-    llm_candidates = [
-        m for m in available
-        if not any(m == e or m.startswith(e + ":") or m.startswith(e + "-")
-                   for e in EMBED_MODELS)
-    ]
+    """Verifica che il modello LLM configurato sia disponibile."""
+    if CONFIGURED_LLM:
+        print(f"   modello configurato (step-9/config.py): {CONFIGURED_LLM}")
+        found = _match(CONFIGURED_LLM, available)
+        if found:
+            print(f"✅ LLM disponibile: {found}")
+            return True
+        print(f"❌ {CONFIGURED_LLM} non trovato in Ollama")
+        print(f"   → ollama pull {CONFIGURED_LLM}")
+        return False
+    # fallback: config.py non leggibile
+    llm_candidates = [m for m in available if not _is_embed(m)]
    if llm_candidates:
        print(f"✅ modello LLM trovato: {llm_candidates[0]}")
        return True
@@ -0,0 +1,59 @@
+# Step 8 — Vettorizzazione
+
+Legge i chunk prodotti da step-6, genera gli embedding tramite Ollama e li
+salva in ChromaDB (vector store persistente su disco).
+
+---
+
+## Prerequisiti
+
+- Step-6 completato (esiste `step-6/<stem>/chunks.json`)
+- Ollama attivo con il modello di embedding scaricato
+- `chromadb` installato (`pip install -r requirements.txt`)
+
+---
+
+## Configurazione modello
+
+Il modello di embedding viene letto da **`step-9/config.py`**:
+
+```python
+# step-9/config.py
+EMBED_MODEL = "nomic-embed-text"   # ← cambia qui
+```
+
+> Il modello scelto qui deve corrispondere a quello usato in step-9.
+> Se lo cambi dopo aver già vettorizzato, devi rieseguire step-8 con `--force`.
+
+---
+
+## Uso
+
+```bash
+# Vettorizza un singolo documento
+python step-8/ingest.py --stem <nome>
+
+# Vettorizza tutti i documenti trovati in step-6/
+python step-8/ingest.py
+
+# Sovrascrive una collection già esistente
+python step-8/ingest.py --stem <nome> --force
+
+# Override modello (senza modificare config.py)
+python step-8/ingest.py --stem <nome> --model bge-m3
+```
+
+---
+
+## Output
+
+I vettori vengono salvati in `chroma_db/<stem>/` come collection ChromaDB con
+distanza coseno. La directory è ignorata da git (generata automaticamente).
+
+---
+
+## Modelli supportati
+
+Stessi modelli raccomandati nel [README di step-7](../step-7/README.md).
+Il modello deve essere scaricato in Ollama prima di eseguire questo script
+(`ollama pull <modello>`).
@@ -3,15 +3,20 @@
 Step 8 — Vettorizzazione

 Legge i chunk prodotti da step-6, genera gli embedding tramite Ollama
-(nomic-embed-text) e li indicizza in ChromaDB (persistente).
+e li indicizza in ChromaDB (persistente).
+
+Il modello di embedding viene letto da step-9/config.py (EMBED_MODEL).
+Puoi sovrascriverlo con --model, ma deve essere lo stesso modello usato in
+step-9: se lo cambi dopo aver già vettorizzato, riesegui con --force.

 Input:  step-6/<stem>/chunks.json
 Output: chroma_db/<stem> (collection ChromaDB)

 Uso:
-    python step-8/ingest.py --stem <nome>      # singolo documento
-    python step-8/ingest.py                    # tutti gli stem trovati
-    python step-8/ingest.py --stem <nome> --force   # sovrascrive collection
+    python step-8/ingest.py --stem <nome>            # singolo documento
+    python step-8/ingest.py                          # tutti gli stem trovati
+    python step-8/ingest.py --stem <nome> --force    # sovrascrive collection
+    python step-8/ingest.py --model bge-m3           # override modello
 """

 import argparse
@@ -24,23 +29,26 @@ from pathlib import Path

 import chromadb

-# ─── Costanti ─────────────────────────────────────────────────────────────────
+# ─── Configurazione ────────────────────────────────────────────────────────────

 project_root = Path(__file__).parent.parent

-CHUNKS_DIR   = project_root / "step-6"
-CHROMA_DIR   = project_root / "chroma_db"
+CHUNKS_DIR = project_root / "step-6"
+CHROMA_DIR = project_root / "chroma_db"

-OLLAMA_URL        = "http://localhost:11434"
-EMBED_MODEL       = "nomic-embed-text"
-EMBED_ENDPOINT    = f"{OLLAMA_URL}/api/embeddings"
+# Legge EMBED_MODEL e OLLAMA_URL da step-9/config.py (fonte di verità).
+# Per spostare config.py alla root: cambia solo la riga qui sotto.
+sys.path.insert(0, str(project_root / "step-9"))
+from config import EMBED_MODEL, OLLAMA_URL  # noqa: E402
+
+EMBED_ENDPOINT = f"{OLLAMA_URL}/api/embeddings"


 # ─── Ollama ────────────────────────────────────────────────────────────────────

-def embed(text: str) -> list[float]:
+def embed(text: str, model: str) -> list[float]:
    """Chiama Ollama /api/embeddings e ritorna il vettore."""
-    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
+    payload = json.dumps({"model": model, "prompt": text}).encode()
    req = urllib.request.Request(
        EMBED_ENDPOINT,
        data=payload,
@@ -52,22 +60,22 @@ def embed(text: str) -> list[float]:
    return data["embedding"]


-def check_ollama() -> bool:
-    """Verifica che Ollama sia attivo e che nomic-embed-text sia disponibile."""
+def check_ollama(model: str) -> bool:
+    """Verifica che Ollama sia attivo e che il modello di embedding sia disponibile."""
    try:
        req = urllib.request.Request(f"{OLLAMA_URL}/api/tags", method="GET")
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        models = [m["name"] for m in data.get("models", [])]
        found = any(
-            m == EMBED_MODEL or m.startswith(EMBED_MODEL + ":")
+            m == model or m.startswith(model + ":")
            for m in models
        )
        if found:
-            print(f"✅ Ollama OK — {EMBED_MODEL} disponibile")
+            print(f"✅ Ollama OK — {model} disponibile")
            return True
-        print(f"❌ Modello {EMBED_MODEL} non trovato in Ollama")
-        print(f"   → ollama pull {EMBED_MODEL}")
+        print(f"❌ Modello {model} non trovato in Ollama")
+        print(f"   → ollama pull {model}")
        return False
    except (urllib.error.URLError, OSError):
        print("❌ Ollama non raggiungibile — assicurati che sia in esecuzione")
@@ -88,7 +96,7 @@ def collection_exists(client: chromadb.PersistentClient, stem: str) -> bool:

 # ─── Ingestione ───────────────────────────────────────────────────────────────

-def ingest(stem: str, force: bool) -> bool:
+def ingest(stem: str, force: bool, model: str = EMBED_MODEL) -> bool:
    """
    Legge step-6/<stem>/chunks.json, genera embedding e popola ChromaDB.
    Ritorna True se completato con successo, False altrimenti.
@@ -133,7 +141,7 @@ def ingest(stem: str, force: bool) -> bool:

    for i, chunk in enumerate(chunks, start=1):
        t0 = time.monotonic()
-        vector = embed(chunk["text"])
+        vector = embed(chunk["text"], model)
        t1 = time.monotonic()
        durations.append(t1 - t0)

@@ -196,11 +204,13 @@ def main() -> int:
    parser.add_argument("--stem", help="Nome del documento (senza --stem = tutti)")
    parser.add_argument("--force", action="store_true",
                        help="Sovrascrive la collection se già esistente")
+    parser.add_argument("--model", default=EMBED_MODEL,
+                        help=f"Modello embedding Ollama (default da step-9/config.py: {EMBED_MODEL})")
    args = parser.parse_args()

    print("─── Step 8 — Vettorizzazione ─────────────────────────────────────────\n")

-    if not check_ollama():
+    if not check_ollama(args.model):
        return 1

    stems = [args.stem] if args.stem else find_stems()
@@ -213,7 +223,7 @@ def main() -> int:
    for stem in stems:
        if len(stems) > 1:
            print(f"── {stem} ──")
-        results.append(ingest(stem, force=args.force))
+        results.append(ingest(stem, force=args.force, model=args.model))
        if len(stems) > 1:
            print()