refactor: rinomina step-8 → ingestion

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 15:58:54 +02:00
parent 70b304e1d4
commit 9e1a72a9e6
7 changed files with 25 additions and 25 deletions
@@ -50,7 +50,7 @@ except Exception as e: print(f'ERRORE lettura report: {e}')

 ```
 ✅ Chunk pronti — procedi con la vettorizzazione:
-   python step-8/ingest.py --stem $ARGUMENTS
+   python ingestion/ingest.py --stem $ARGUMENTS
 ```

 Se ci sono solo 🟡, spiega brevemente i warning e chiedi se l'utente vuole risolverli prima o procedere.
@@ -105,7 +105,7 @@ Se verdict finale è `ok` o `warnings_only` senza 🔴:
 ```
 ✅ Chunk pronti in chunks/$ARGUMENTS/chunks.json
   Procedi con la vettorizzazione:
-   python step-8/ingest.py --stem $ARGUMENTS
+   python ingestion/ingest.py --stem $ARGUMENTS
 ```

 Se rimangono 🔴 dopo il fix (testo non spezzabile o struttura anomala nel sorgente):
@@ -263,11 +263,11 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -

    if not blockers and not warnings:
        print(f"  ✅ Tutto OK — procedi alla vettorizzazione:")
-        print(f"       python step-8/ingest.py --stem {stem}")
+        print(f"       python ingestion/ingest.py --stem {stem}")

    elif not blockers:
        print(f"  🟡 Solo avvisi minori — puoi procedere alla vettorizzazione:")
-        print(f"       python step-8/ingest.py --stem {stem}")
+        print(f"       python ingestion/ingest.py --stem {stem}")
        print()
        print(f"  Oppure, per ottimizzare prima:")
        if too_short:
@@ -23,7 +23,7 @@ EMBED_MODEL = "nomic-embed-text"   # ← cambia qui
 ```

 > Il modello scelto qui deve corrispondere a quello usato in rag.py.
-> Se lo cambi dopo aver già vettorizzato, devi rieseguire step-8 con `--force`.
+> Se lo cambi dopo aver già vettorizzato, devi rieseguire l'ingestion (`ingestion/ingest.py`) con `--force`.

 ---

@@ -31,16 +31,16 @@ EMBED_MODEL = "nomic-embed-text"   # ← cambia qui

 ```bash
 # Vettorizza un singolo documento
-python step-8/ingest.py --stem <nome>
+python ingestion/ingest.py --stem <nome>

 # Vettorizza tutti i documenti trovati in step-6/
-python step-8/ingest.py
+python ingestion/ingest.py

 # Sovrascrive una collection già esistente
-python step-8/ingest.py --stem <nome> --force
+python ingestion/ingest.py --stem <nome> --force

 # Override modello (senza modificare config.py)
-python step-8/ingest.py --stem <nome> --model bge-m3
+python ingestion/ingest.py --stem <nome> --model bge-m3
 ```

 ---
@@ -94,7 +94,7 @@ Senza `--force` lo script salta la collection già esistente — i vecchi vettor

 ```bash
 # Cambio modello → ricrea sempre la collection
-python step-8/ingest.py --stem <nome> --force
+python ingestion/ingest.py --stem <nome> --force
 ```

 ### Quando usare `--force`
@@ -13,10 +13,10 @@ Input:  step-6/<stem>/chunks.json
 Output: chroma_db/<stem> (collection ChromaDB)

 Uso:
-    python step-8/ingest.py --stem <nome>            # singolo documento
-    python step-8/ingest.py                          # tutti gli stem trovati
-    python step-8/ingest.py --stem <nome> --force    # sovrascrive collection
-    python step-8/ingest.py --model bge-m3           # override modello
+    python ingestion/ingest.py --stem <nome>            # singolo documento
+    python ingestion/ingest.py                          # tutti gli stem trovati
+    python ingestion/ingest.py --stem <nome> --force    # sovrascrive collection
+    python ingestion/ingest.py --model bge-m3           # override modello
 """

 import argparse
@@ -57,7 +57,7 @@ Alternative supportate:
 - `bge-m3`
 - `nomic-embed-text`

-Se cambi embedding model rispetto a quello usato in step-8, riesegui ingest con `--force` e aggiorna `EMBED_MODEL` in `config.py`.
+Se cambi il modello di embedding rispetto a quello usato in fase di ingestion, riesegui `ingestion/ingest.py` con `--force` e aggiorna `EMBED_MODEL` in `config.py`.

 ### Modello LLM (consigliato per 8 GB RAM)

@@ -101,7 +101,7 @@ Output atteso (esempio):
 ✅ LLM disponibile: qwen3.5:4b
 ✅ chromadb importabile
 ✅ Ambiente pronto — procedi con la vettorizzazione:
-   python step-8/ingest.py --stem <nome>
+   python ingestion/ingest.py --stem <nome>
 ```

 ---
@@ -109,5 +109,5 @@ Output atteso (esempio):
 ## Prossimo step

 ```bash
-python step-8/ingest.py --stem <nome>
+python ingestion/ingest.py --stem <nome>
 ```
@@ -197,7 +197,7 @@ def _build_epilog() -> str:
            if names:
                lines += ["", f"Collection disponibili: {', '.join(names)}"]
            else:
-                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
+                lines += ["", "Nessuna collection trovata — eseguire prima: python ingestion/ingest.py"]
        except Exception:
            pass
    return "\n".join(lines)
@@ -208,7 +208,7 @@ def main() -> int:
        description=(
            "Pipeline RAG interattiva\n\n"
            "Risponde a domande in linguaggio naturale su un documento\n"
-            "indicizzato in ChromaDB da step-8/ingest.py."
+            "indicizzato in ChromaDB da ingestion/ingest.py."
        ),
        epilog=_build_epilog(),
        formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -218,7 +218,7 @@ def main() -> int:
        required=True,
        help=(
            "Nome della collection ChromaDB da interrogare. "
-            "Le collection vengono create da: python step-8/ingest.py --stem <nome>"
+            "Le collection vengono create da: python ingestion/ingest.py --stem <nome>"
        ),
    )
    args = parser.parse_args()
@@ -231,14 +231,14 @@ def main() -> int:
    print()

    if not CHROMA_DIR.exists():
-        print("❌ chroma_db/ non trovata — esegui prima step-8")
+        print("❌ chroma_db/ non trovata — esegui prima ingestion")
        return 1

    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    collections = [c.name for c in client.list_collections()]
    if args.stem not in collections:
        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/")
-        print(f"   → python step-8/ingest.py --stem {args.stem}")
+        print(f"   → python ingestion/ingest.py --stem {args.stem}")
        return 1

    collection = client.get_collection(args.stem)
@@ -159,7 +159,7 @@ def _build_epilog() -> str:
            if names:
                lines += ["", f"Collection disponibili: {', '.join(names)}"]
            else:
-                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
+                lines += ["", "Nessuna collection trovata — eseguire prima: python ingestion/ingest.py"]
        except Exception:
            pass
    return "\n".join(lines)
@@ -196,14 +196,14 @@ def main() -> int:
    print()

    if not CHROMA_DIR.exists():
-        print("❌ chroma_db/ non trovata — esegui prima step-8", file=sys.stderr)
+        print("❌ chroma_db/ non trovata — esegui prima ingestion", file=sys.stderr)
        return 1

    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    collections = [c.name for c in client.list_collections()]
    if args.stem not in collections:
        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/", file=sys.stderr)
-        print(f"   → python step-8/ingest.py --stem {args.stem}", file=sys.stderr)
+        print(f"   → python ingestion/ingest.py --stem {args.stem}", file=sys.stderr)
        return 1

    collection = client.get_collection(args.stem)