fix: aggiorna path step-4/ → conversione/ e riferimenti step-X

- chunker.py: input da conversione/<stem>/ (era step-4/, non esistente)
- verify_chunks.py: messaggi di errore aggiornati a conversione/
- config.py: commenti step-8 → ingest.py
2026-04-19 00:03:43 +02:00
parent e4dc0856bb
commit c8167d4f01
3 changed files with 22 additions and 22 deletions
@@ -28,8 +28,8 @@ NO_THINK = True
 # ── Embedding ─────────────────────────────────────────────────────────────────

 # Modello di embedding usato da Ollama.
-# Deve corrispondere al modello usato durante la vettorizzazione (step-8).
-# Se cambi questo, devi rieseguire step-8 con --force.
+# Deve corrispondere al modello usato durante la vettorizzazione (ingest.py).
+# Se cambi questo, devi rieseguire ingest.py con --force.
 EMBED_MODEL = "nomic-embed-text"

 # ── Ollama ────────────────────────────────────────────────────────────────────
@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
 """
-Step 5 — Chunking adattivo
+Chunking adattivo

-Divide il Markdown revisionato (step 4) in chunk semantici pronti per la
+Divide il Markdown revisionato in chunk semantici pronti per la
 vettorizzazione. La strategia dipende dal profilo strutturale del documento.

-Input:  step-4/<stem>/clean.md + step-4/<stem>/structure_profile.json
+Input:  conversione/<stem>/clean.md + conversione/<stem>/structure_profile.json
 Output: step-5/<stem>/chunks.json

 Uso:
-    python step-5/chunker.py                    # tutti i documenti in step-4/
+    python step-5/chunker.py                    # tutti i documenti in conversione/
    python step-5/chunker.py --stem documento   # un solo documento
    python step-5/chunker.py --stem documento --force
 """
@@ -375,19 +375,19 @@ def chunk_document(clean_md: Path, profile: dict, stem: str) -> list[dict]:
 # ─── Per-document processing ──────────────────────────────────────────────────

 def process_stem(stem: str, project_root: Path, force: bool) -> bool:
-    step4_dir = project_root / "step-4" / stem
+    conv_dir = project_root / "conversione" / stem
    out_dir = project_root / "step-5" / stem
-    clean_md = step4_dir / "clean.md"
-    profile_path = step4_dir / "structure_profile.json"
+    clean_md = conv_dir / "clean.md"
+    profile_path = conv_dir / "structure_profile.json"
    out_file = out_dir / "chunks.json"

    print(f"\nDocumento: {stem}")

    if not clean_md.exists():
-        print(f"  ✗ clean.md non trovato in step-4/{stem}/ — skip")
+        print(f"  ✗ clean.md non trovato in conversione/{stem}/ — skip")
        return False
    if not profile_path.exists():
-        print(f"  ✗ structure_profile.json non trovato in step-4/{stem}/ — skip")
+        print(f"  ✗ structure_profile.json non trovato in conversione/{stem}/ — skip")
        return False

    if out_file.exists() and not force:
@@ -432,21 +432,21 @@ def process_stem(stem: str, project_root: Path, force: bool) -> bool:
 if __name__ == "__main__":
    project_root = Path(__file__).parent.parent

-    parser = argparse.ArgumentParser(description="Step 5 — Chunking adattivo")
-    parser.add_argument("--stem", help="Nome del documento (sottocartella di step-4/)")
+    parser = argparse.ArgumentParser(description="Chunking adattivo")
+    parser.add_argument("--stem", help="Nome del documento (sottocartella di conversione/)")
    parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
    args = parser.parse_args()

    if args.stem:
        stems = [args.stem]
    else:
-        step4_dir = project_root / "step-4"
-        if not step4_dir.exists():
-            print(f"Errore: cartella step-4/ non trovata in {project_root}")
+        conv_dir = project_root / "conversione"
+        if not conv_dir.exists():
+            print(f"Errore: cartella conversione/ non trovata in {project_root}")
            sys.exit(1)
-        stems = sorted(p.name for p in step4_dir.iterdir() if p.is_dir())
+        stems = sorted(p.name for p in conv_dir.iterdir() if p.is_dir() and (p / "clean.md").exists())
        if not stems:
-            print(f"Errore: nessun documento trovato in step-4/")
+            print(f"Errore: nessun documento trovato in conversione/")
            sys.exit(1)

    results = [process_stem(s, project_root, args.force) for s in stems]
@@ -179,8 +179,8 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
            print(f"  [{c.get('chunk_id', '?')}] ...{last_line!r}")
        if len(incomplete) > 5:
            print(f"  ... e altri {len(incomplete) - 5}")
-        print(f"  → Causa probabile: paragrafo spezzato nel MD (step 4)")
-        print(f"  → Soluzione: correggi le righe spezzate in step-4/{stem}/clean.md")
+        print(f"  → Causa probabile: paragrafo spezzato nel MD")
+        print(f"  → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")

    # ── Costruisci e salva report.json ───────────────────────────────────────

@@ -263,10 +263,10 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
        print()
        if empty_chunks:
            print(f"    • {len(empty_chunks)} chunk vuoti")
-            print(f"      → Controlla step-4/{stem}/clean.md per sezioni prive di testo")
+            print(f"      → Controlla conversione/{stem}/clean.md per sezioni prive di testo")
        if no_prefix:
            print(f"    • {len(no_prefix)} chunk senza prefisso di contesto")
-            print(f"      → Controlla che gli header ### siano corretti in step-4/{stem}/clean.md")
+            print(f"      → Controlla che gli header ### siano corretti in conversione/{stem}/clean.md")
        if incomplete:
            print(f"    • {len(incomplete)} chunk con frase spezzata")
            print(f"      → Esegui: python step-6/fix_chunks.py --stem {stem}")