From 65940336733064dc768fb16c8239bbefa7689156 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Tue, 14 Apr 2026 18:22:05 +0200
Subject: [PATCH] feat(step-7,8): leggi modello da config.py, allinea
 EMBED_MODELS al README

- step-8/ingest.py: rimuove EMBED_MODEL e OLLAMA_URL hardcoded;
  li importa da step-9/config.py (fonte di verità unica)
- step-7/check_env.py: aggiorna EMBED_MODELS con tutti i modelli
  del README (aggiunge qwen3-embedding, nomic-embed-text-v2-moe,
  paraphrase-multilingual); mostra il modello configurato in config.py
  e verifica proprio quello, non un qualsiasi modello embedding
- step-8/README.md: creato
---
 step-7/check_env.py | 69 +++++++++++++++++++++++++++++++++++----------
 step-8/README.md    | 59 ++++++++++++++++++++++++++++++++++++++
 step-8/ingest.py    | 54 ++++++++++++++++++++---------------
 3 files changed, 145 insertions(+), 37 deletions(-)
 create mode 100644 step-8/README.md

diff --git a/step-7/check_env.py b/step-7/check_env.py
index cdda95b..071c48d 100644
--- a/step-7/check_env.py
+++ b/step-7/check_env.py
@@ -18,11 +18,36 @@ Uso:
 import shutil
 import subprocess
 import sys
+from pathlib import Path
 
 
-# Modelli riconosciuti come embedding (basta che uno sia presente)
-# Tutto il resto viene considerato LLM
-EMBED_MODELS = ["bge-m3", "nomic-embed-text", "mxbai-embed-large", "all-minilm"]
+# ─── Lista canonica di modelli embedding supportati ───────────────────────────
+# Ordine: prima scelta → ultima scelta (come da README step-7)
+EMBED_MODELS = [
+    "qwen3-embedding",
+    "nomic-embed-text-v2-moe",
+    "bge-m3",
+    "nomic-embed-text",
+    "mxbai-embed-large",
+    "paraphrase-multilingual",
+    "all-minilm",
+]
+
+
+def _is_embed(model_name: str) -> bool:
+    """True se il modello è riconosciuto come embedding (lista canonica o keyword)."""
+    base = model_name.split(":")[0].lower()
+    return any(base == e or base.startswith(e) for e in EMBED_MODELS) or "embed" in base
+
+
+# ─── Modelli configurati in step-9/config.py ─────────────────────────────────
+# Per spostare config.py alla root: cambia solo la riga qui sotto.
+sys.path.insert(0, str(Path(__file__).parent.parent / "step-9"))
+try:
+    from config import EMBED_MODEL as CONFIGURED_EMBED, OLLAMA_MODEL as CONFIGURED_LLM
+except Exception:
+    CONFIGURED_EMBED = None
+    CONFIGURED_LLM = None
 
 REQUIRED_LIBS = ["chromadb"]
 
@@ -83,25 +108,39 @@ def _match(model_name: str, available: list[str]) -> str | None:
 
 
 def check_embed_model(available: list[str]) -> bool:
-    """Verifica che almeno un modello di embedding sia presente."""
-    for candidate in EMBED_MODELS:
-        found = _match(candidate, available)
+    """Verifica che il modello di embedding configurato sia disponibile."""
+    if CONFIGURED_EMBED:
+        print(f"   modello configurato (step-9/config.py): {CONFIGURED_EMBED}")
+        found = _match(CONFIGURED_EMBED, available)
         if found:
-            print(f"✅ modello embedding trovato: {found}")
+            print(f"✅ embedding disponibile: {found}")
             return True
+        print(f"❌ {CONFIGURED_EMBED} non trovato in Ollama")
+        print(f"   → ollama pull {CONFIGURED_EMBED}")
+        return False
+    # fallback: config.py non leggibile
+    found = next((m for m in available if _is_embed(m)), None)
+    if found:
+        print(f"✅ modello embedding trovato: {found}")
+        return True
     print("❌ nessun modello di embedding trovato")
-    print(f"   → Consigliato per italiano: ollama pull bge-m3")
-    print(f"   → Alternativa leggera:      ollama pull nomic-embed-text")
+    print(f"   → Prima scelta: ollama pull qwen3-embedding:0.6b")
     return False
 
 
 def check_llm_model(available: list[str]) -> bool:
-    """Verifica che almeno un modello non-embedding sia presente."""
-    llm_candidates = [
-        m for m in available
-        if not any(m == e or m.startswith(e + ":") or m.startswith(e + "-")
-                   for e in EMBED_MODELS)
-    ]
+    """Verifica che il modello LLM configurato sia disponibile."""
+    if CONFIGURED_LLM:
+        print(f"   modello configurato (step-9/config.py): {CONFIGURED_LLM}")
+        found = _match(CONFIGURED_LLM, available)
+        if found:
+            print(f"✅ LLM disponibile: {found}")
+            return True
+        print(f"❌ {CONFIGURED_LLM} non trovato in Ollama")
+        print(f"   → ollama pull {CONFIGURED_LLM}")
+        return False
+    # fallback: config.py non leggibile
+    llm_candidates = [m for m in available if not _is_embed(m)]
     if llm_candidates:
         print(f"✅ modello LLM trovato: {llm_candidates[0]}")
         return True
diff --git a/step-8/README.md b/step-8/README.md
new file mode 100644
index 0000000..626c7eb
--- /dev/null
+++ b/step-8/README.md
@@ -0,0 +1,59 @@
+# Step 8 — Vettorizzazione
+
+Legge i chunk prodotti da step-6, genera gli embedding tramite Ollama e li
+salva in ChromaDB (vector store persistente su disco).
+
+---
+
+## Prerequisiti
+
+- Step-6 completato (esiste `step-6/<stem>/chunks.json`)
+- Ollama attivo con il modello di embedding scaricato
+- `chromadb` installato (`pip install -r requirements.txt`)
+
+---
+
+## Configurazione modello
+
+Il modello di embedding viene letto da **`step-9/config.py`**:
+
+```python
+# step-9/config.py
+EMBED_MODEL = "nomic-embed-text"   # ← cambia qui
+```
+
+> Il modello scelto qui deve corrispondere a quello usato in step-9.
+> Se lo cambi dopo aver già vettorizzato, devi rieseguire step-8 con `--force`.
+
+---
+
+## Uso
+
+```bash
+# Vettorizza un singolo documento
+python step-8/ingest.py --stem <nome>
+
+# Vettorizza tutti i documenti trovati in step-6/
+python step-8/ingest.py
+
+# Sovrascrive una collection già esistente
+python step-8/ingest.py --stem <nome> --force
+
+# Override modello (senza modificare config.py)
+python step-8/ingest.py --stem <nome> --model bge-m3
+```
+
+---
+
+## Output
+
+I vettori vengono salvati in `chroma_db/<stem>/` come collection ChromaDB con
+distanza coseno. La directory è ignorata da git (generata automaticamente).
+
+---
+
+## Modelli supportati
+
+Stessi modelli raccomandati nel [README di step-7](../step-7/README.md).
+Il modello deve essere scaricato in Ollama prima di eseguire questo script
+(`ollama pull <modello>`).
diff --git a/step-8/ingest.py b/step-8/ingest.py
index 76b1a61..8db0329 100644
--- a/step-8/ingest.py
+++ b/step-8/ingest.py
@@ -3,15 +3,20 @@
 Step 8 — Vettorizzazione
 
 Legge i chunk prodotti da step-6, genera gli embedding tramite Ollama
-(nomic-embed-text) e li indicizza in ChromaDB (persistente).
+e li indicizza in ChromaDB (persistente).
+
+Il modello di embedding viene letto da step-9/config.py (EMBED_MODEL).
+Puoi sovrascriverlo con --model, ma deve corrispondere al modello che
+userai in step-9 — altrimenti riesegui con --force dopo aver cambiato.
 
 Input:  step-6/<stem>/chunks.json
 Output: chroma_db/<stem> (collection ChromaDB)
 
 Uso:
-    python step-8/ingest.py --stem <nome>      # singolo documento
-    python step-8/ingest.py                    # tutti gli stem trovati
-    python step-8/ingest.py --stem <nome> --force   # sovrascrive collection
+    python step-8/ingest.py --stem <nome>            # singolo documento
+    python step-8/ingest.py                          # tutti gli stem trovati
+    python step-8/ingest.py --stem <nome> --force    # sovrascrive collection
+    python step-8/ingest.py --model bge-m3           # override modello
 """
 
 import argparse
@@ -24,23 +29,26 @@ from pathlib import Path
 
 import chromadb
 
-# ─── Costanti ─────────────────────────────────────────────────────────────────
+# ─── Configurazione ────────────────────────────────────────────────────────────
 
 project_root = Path(__file__).parent.parent
 
-CHUNKS_DIR   = project_root / "step-6"
-CHROMA_DIR   = project_root / "chroma_db"
+CHUNKS_DIR = project_root / "step-6"
+CHROMA_DIR = project_root / "chroma_db"
 
-OLLAMA_URL        = "http://localhost:11434"
-EMBED_MODEL       = "nomic-embed-text"
-EMBED_ENDPOINT    = f"{OLLAMA_URL}/api/embeddings"
+# Legge EMBED_MODEL e OLLAMA_URL da step-9/config.py (fonte di verità).
+# Per spostare config.py alla root: cambia solo la riga qui sotto.
+sys.path.insert(0, str(project_root / "step-9"))
+from config import EMBED_MODEL, OLLAMA_URL  # noqa: E402
+
+EMBED_ENDPOINT = f"{OLLAMA_URL}/api/embeddings"
 
 
 # ─── Ollama ────────────────────────────────────────────────────────────────────
 
-def embed(text: str) -> list[float]:
+def embed(text: str, model: str) -> list[float]:
     """Chiama Ollama /api/embeddings e ritorna il vettore."""
-    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
+    payload = json.dumps({"model": model, "prompt": text}).encode()
     req = urllib.request.Request(
         EMBED_ENDPOINT,
         data=payload,
@@ -52,22 +60,22 @@ def embed(text: str) -> list[float]:
     return data["embedding"]
 
 
-def check_ollama() -> bool:
-    """Verifica che Ollama sia attivo e che nomic-embed-text sia disponibile."""
+def check_ollama(model: str) -> bool:
+    """Verifica che Ollama sia attivo e che il modello di embedding sia disponibile."""
     try:
         req = urllib.request.Request(f"{OLLAMA_URL}/api/tags", method="GET")
         with urllib.request.urlopen(req, timeout=10) as resp:
             data = json.loads(resp.read())
         models = [m["name"] for m in data.get("models", [])]
         found = any(
-            m == EMBED_MODEL or m.startswith(EMBED_MODEL + ":")
+            m == model or m.startswith(model + ":")
             for m in models
         )
         if found:
-            print(f"✅ Ollama OK — {EMBED_MODEL} disponibile")
+            print(f"✅ Ollama OK — {model} disponibile")
             return True
-        print(f"❌ Modello {EMBED_MODEL} non trovato in Ollama")
-        print(f"   → ollama pull {EMBED_MODEL}")
+        print(f"❌ Modello {model} non trovato in Ollama")
+        print(f"   → ollama pull {model}")
         return False
     except (urllib.error.URLError, OSError):
         print("❌ Ollama non raggiungibile — assicurati che sia in esecuzione")
@@ -88,7 +96,7 @@ def collection_exists(client: chromadb.PersistentClient, stem: str) -> bool:
 
 # ─── Ingestione ───────────────────────────────────────────────────────────────
 
-def ingest(stem: str, force: bool) -> bool:
+def ingest(stem: str, force: bool, model: str = EMBED_MODEL) -> bool:
     """
     Legge step-6/<stem>/chunks.json, genera embedding e popola ChromaDB.
     Ritorna True se completato con successo, False altrimenti.
@@ -133,7 +141,7 @@ def ingest(stem: str, force: bool) -> bool:
 
     for i, chunk in enumerate(chunks, start=1):
         t0 = time.monotonic()
-        vector = embed(chunk["text"])
+        vector = embed(chunk["text"], model)
         t1 = time.monotonic()
         durations.append(t1 - t0)
 
@@ -196,11 +204,13 @@ def main() -> int:
     parser.add_argument("--stem", help="Nome del documento (senza --stem = tutti)")
     parser.add_argument("--force", action="store_true",
                         help="Sovrascrive la collection se già esistente")
+    parser.add_argument("--model", default=EMBED_MODEL,
+                        help=f"Modello embedding Ollama (default da step-9/config.py: {EMBED_MODEL})")
     args = parser.parse_args()
 
     print("─── Step 8 — Vettorizzazione ─────────────────────────────────────────\n")
 
-    if not check_ollama():
+    if not check_ollama(args.model):
         return 1
 
     stems = [args.stem] if args.stem else find_stems()
@@ -213,7 +223,7 @@ def main() -> int:
     for stem in stems:
         if len(stems) > 1:
             print(f"── {stem} ──")
-        results.append(ingest(stem, force=args.force))
+        results.append(ingest(stem, force=args.force, model=args.model))
         if len(stems) > 1:
             print()