From b7994100e72a6c7a08e7434da58fdcd57fb63315 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 16 Apr 2026 15:27:45 +0200
Subject: [PATCH 01/22] =?UTF-8?q?feat(pdf-to-md):=20aggiungi=20pipeline=20?=
 =?UTF-8?q?automatica=20PDF=20=E2=86=92=20Markdown=20pulito?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Nuova cartella conversione/ con pipeline.py che sostituisce
step-0+1+2+3+4 in un singolo comando senza operazioni manuali.
Usa opendataloader-pdf (algoritmo XY-Cut++ per ordine di lettura).

Trasformazioni strutturali:
- correzione accenti resi come backtick nei PDF LaTeX (es. `e→è, puo`→può)
- rimozione dot-leader TOC e numerali romani pagina (i, ii, iii)
- normalizzazione header a gerarchia uniforme h1/h2/h3
- conversione sezioni numerate e aforismi → header ###
- rilevamento sezioni Esercizi → disabilita la conversione di "- N. testo"
  in header ### (sono numerazioni di esercizi, non sezioni)
- watermark URL rimossi, header vuoti scartati
---
 .gitignore              |   3 +
 conversione/pipeline.py | 690 ++++++++++++++++++++++++++++++++++++++++
 requirements.txt        |   4 +
 3 files changed, 697 insertions(+)
 create mode 100644 conversione/pipeline.py
diff --git a/.gitignore b/.gitignore
index 69458fa..0b18250 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,6 @@ step-5/*/
 # Output step-6 — report generati da verify_chunks.py
 step-6/*/
 
+# Output conversione/ — generati da conversione/pipeline.py
+conversione/*/
+
diff --git a/conversione/pipeline.py b/conversione/pipeline.py
new file mode 100644
index 0000000..5ed919d
--- /dev/null
+++ b/conversione/pipeline.py
@@ -0,0 +1,690 @@
+#!/usr/bin/env python3
+"""
+conversion/pipeline.py — PDF → clean Markdown (pipeline automatica)
+
+Sostituisce step-0 + step-1 + step-2 + step-3 + step-4 in un solo comando,
+senza operazioni manuali.
+
+Usa opendataloader-pdf (algoritmo XY-Cut++ per ordine di lettura corretto,
+testo fluente, struttura preservata) al posto di pymupdf4llm.
+
+Output (compatibile con step-5+):
+  conversion/<stem>/raw.md                — output grezzo opendataloader (immutabile)
+  conversion/<stem>/clean.md              — MD pulito e strutturato
+  conversion/<stem>/structure_profile.json
+
+Uso:
+    python conversion/pipeline.py --stem <nome>
+    python conversion/pipeline.py                       # tutti i PDF in sources/
+    python conversion/pipeline.py --stem <nome> --force # forza riesecuzione
+
+Prerequisiti:
+    pip install opendataloader-pdf
+    Java 11+ sul PATH (https://adoptium.net/)
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+
+# ─── Verifica dipendenze ──────────────────────────────────────────────────────
+
+def _check_deps() -> None:
+    try:
+        import opendataloader_pdf  # noqa: F401
+    except ImportError:
+        print("Errore: opendataloader-pdf non installato.")
+        print("       pip install opendataloader-pdf")
+        sys.exit(1)
+
+    try:
+        result = subprocess.run(
+            ["java", "-version"],
+            capture_output=True, text=True,
+        )
+        if result.returncode != 0:
+            raise FileNotFoundError
+    except FileNotFoundError:
+        print("Errore: Java 11+ non trovato sul PATH.")
+        print("       Installa da https://adoptium.net/")
+        sys.exit(1)
+
+
+# ─── [1] Validazione PDF (step-0 + step-1) ────────────────────────────────────
+
+def check_pdf(pdf_path: Path) -> tuple[bool, str]:
+    """
+    Validazione rapida: esistenza, leggibilità, testo estraibile.
+    Restituisce (ok, messaggio).
+    """
+    if not pdf_path.exists():
+        return False, f"File non trovato: {pdf_path}"
+    if pdf_path.suffix.lower() != ".pdf":
+        return False, f"Non è un PDF: {pdf_path.name}"
+    if pdf_path.stat().st_size == 0:
+        return False, "File vuoto"
+
+    try:
+        import pdfplumber
+        with pdfplumber.open(pdf_path) as pdf:
+            n_pages = len(pdf.pages)
+            if n_pages == 0:
+                return False, "PDF senza pagine"
+            sample = min(5, n_pages)
+            pages_with_text = sum(
+                1 for i in range(sample)
+                if len((pdf.pages[i].extract_text() or "").strip()) > 50
+            )
+            if pages_with_text == 0:
+                return False, (
+                    f"Nessun testo nelle prime {sample} pagine "
+                    f"— probabilmente scansionato (usa modalità hybrid)"
+                )
+        return True, f"{n_pages} pagine, testo digitale confermato"
+    except Exception as e:
+        msg = str(e).lower()
+        if "password" in msg or "encrypted" in msg:
+            return False, "PDF protetto da password"
+        return False, f"Impossibile aprire: {e}"
+
+
+# ─── [2] Conversione PDF → Markdown (step-2) ─────────────────────────────────
+
+def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
+    """
+    Converte il PDF in Markdown tramite opendataloader-pdf.
+    Scrive il file nella out_dir e restituisce il percorso.
+
+    Parametri scelti per output RAG-ottimale:
+      - keep_line_breaks=False  → testo fluente, no hard-wrap PDF
+      - reading_order="xycut"   → corregge ordine multi-colonna (XY-Cut++)
+      - sanitize=False          → preserva il testo originale (no anonimizzazione PII)
+    """
+    import opendataloader_pdf
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    opendataloader_pdf.convert(
+        input_path=str(pdf_path),
+        output_dir=str(out_dir),
+        format="markdown",
+        keep_line_breaks=False,
+        reading_order="xycut",
+        sanitize=False,
+        image_output="off",   # nessuna immagine estratta né referenziata
+        quiet=True,           # sopprime i log Java
+    )
+
+    # Il file output si chiama <stem>.md
+    md_file = out_dir / f"{pdf_path.stem}.md"
+    if not md_file.exists():
+        candidates = list(out_dir.glob("*.md"))
+        if not candidates:
+            raise RuntimeError(f"Nessun file .md prodotto in {out_dir}")
+        md_file = candidates[0]
+
+    return md_file
+
+
+# ─── [3] Pulizia strutturale (step-4 / revise.py) ────────────────────────────
+#
+# Logica identica a step-4/revise.py — mantenuta sincronizzata.
+
+_TOC_KEYWORDS = frozenset([
+    "indice", "index", "contents", "table of contents",
+    "sommario", "inhaltsverzeichnis", "inhalt",
+])
+
+_ORDINALS_IT = {
+    "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",
+    "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",
+    "NONO": "IX", "DECIMO": "X",
+}
+_ORDINALS_EN = {
+    "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",
+    "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",
+}
+
+
+def _sentence_case(s: str) -> str:
+    if not s:
+        return s
+    lower = s.lower()
+    return lower[0].upper() + lower[1:]
+
+
+def _is_allcaps_line(line: str) -> bool:
+    stripped = line.strip()
+    letters = [c for c in stripped if c.isalpha()]
+    return (
+        len(letters) >= 3
+        and all(c.isupper() for c in letters)
+        and not stripped.startswith("#")
+    )
+
+
+def _allcaps_to_header(raw_line: str) -> str:
+    text = raw_line.strip().rstrip(".").rstrip("?").strip()
+
+    _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys())
+    m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text)
+    if m:
+        roman = _ORDINALS_IT[m.group(1)]
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Capitolo {roman} — {_sentence_case(titolo)}"
+
+    _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys())
+    m = re.match(rf"^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)", text)
+    if m:
+        n = _ORDINALS_EN.get(m.group(1), m.group(1))
+        titolo = m.group(2).rstrip(".").rstrip("?").strip()
+        return f"## Chapter {n} — {_sentence_case(titolo)}"
+
+    m = re.match(r"^([IVXLCDM]+|[0-9]+)\. (.+)", text)
+    if m:
+        return f"## {m.group(1)}. {_sentence_case(m.group(2).rstrip('.').strip())}"
+
+    return f"## {_sentence_case(text)}"
+
+
+def apply_transforms(text: str) -> tuple[str, dict]:
+    """
+    Applica le trasformazioni strutturali al Markdown grezzo.
+    Restituisce (testo_modificato, statistiche).
+    """
+    stats = {
+        "toc_rimosso": False,
+        "n_immagini_rimosse": 0,
+        "n_accenti_corretti": 0,
+        "n_dotleader_rimossi": 0,
+        "n_header_concat_fixati": 0,
+        "n_header_allcaps": 0,
+        "n_sezioni_numerate": 0,
+        "n_paragrafi_uniti": 0,
+    }
+
+    # 0. Rimuovi riferimenti immagini (artefatti opendataloader-pdf)
+    stats["n_immagini_rimosse"] = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
+    text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
+
+    # 0a. Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc.
+    #     I PDF prodotti da LaTeX estraggono gli accenti gravi come backtick separati
+    #     dalla vocale accentata. Esempi: "`e" → "è", "puo`" → "può", "sar`a" → "sarà"
+    _ACCENT_MAP = {
+        "e": "è", "E": "È", "a": "à", "A": "À",
+        "u": "ù", "U": "Ù", "i": "ì", "I": "Ì", "o": "ò", "O": "Ò",
+    }
+    n_bt_before = text.count("`")
+    text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text)
+    text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
+    stats["n_accenti_corretti"] = n_bt_before - text.count("`")
+
+    # 0b_pre. Rimuovi righe con dot-leader (voci di indice/sommario)
+    #     Esempi: "- 1.1 Alfabeto greco . . . . . . 1", "3.4 Continuità . . . . 205"
+    #     Pattern: almeno 3 occorrenze di ". " consecutive nella riga
+    stats["n_dotleader_rimossi"] = len(
+        re.findall(r"^[^\n]*(?:\. ){3,}[^\n]*$", text, re.MULTILINE)
+    )
+    text = re.sub(r"^[^\n]*(?:\. ){3,}[^\n]*$", "", text, flags=re.MULTILINE)
+
+    # 0b_pre2. Rimuovi righe che sono solo numerali romani (indicatori di pagina TOC)
+    #     Esempi: "i", "ii", "iii", "iv", "v" su riga isolata (footer pagine indice LaTeX)
+    #     Questi impedirebbero al transform 9 di rimuovere le entry TOC rimaste senza corpo.
+    text = re.sub(
+        r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$",
+        "",
+        text,
+        flags=re.IGNORECASE,
+    )
+
+    # Flag documento: rilevamento sezioni esercizi (es. libri di testo accademici)
+    # Usato per disabilitare transform 4b che convertirebbe i numeri degli esercizi in header.
+    _has_exercise_sections = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
+
+    # 0b. Fix header + body concatenati senza separatore
+    #     "##### 11 TitoloCorpodel testo..." → "##### 11 Titolo\n\nCorpo del testo..."
+    def _fix_header_concat(m: re.Match) -> str:
+        hashes = m.group(1)
+        full = m.group(2).strip()
+        if len(full) < 60:
+            return m.group(0)
+        # Cerca split: lettera minuscola (incluse accentate) seguita da maiuscola
+        # Salta i primi ~10 char per non spezzare il numero della sezione
+        skip = min(10, len(full) // 3)
+        split = re.search(r"(?<=[a-zàèéìíòóùúä])(?=[A-ZÀÈÉÌÍÒÓÙÚ])", full[skip:])
+        if split:
+            pos = skip + split.start()
+            title = full[:pos].strip()
+            body = full[pos:].strip()
+            if len(title) >= 5 and len(body) >= 15:
+                stats["n_header_concat_fixati"] += 1
+                return f"{hashes} {title}\n\n{body}"
+        return m.group(0)
+
+    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix_header_concat, text, flags=re.MULTILINE)
+
+    # 0c. Estrai "Capitolo N: TITOLO" inline nel corpo del testo → ## header separato
+    #     "Capitolo 3: IL TITOLO DEL CAPITOLO - 16 Primo..."  → "## Capitolo 3: ..."
+    #     "Capitolo 1 : TITOLO CAPITOLO"                      → "## Capitolo 1: ..."
+    def _extract_capitolo(m: re.Match) -> str:
+        num = m.group(1)
+        titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
+        return f"\n\n## Capitolo {num}: {titolo}\n\n"
+
+    text = re.sub(
+        r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-ZÀÈÉÌÍÒÓÙÚ\'L][A-ZÀÈÉÌÍÒÓÙÚ\s\'\.,\(\)]{5,80}?)"
+        r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
+        _extract_capitolo,
+        text,
+    )
+
+    # 0d. Normalizza header di sezione a livello uniforme ###
+    #     "#### N Titolo"  → "### N. Titolo"  (numerati: aggiunge punto)
+    #     "#### B) Titolo" → "### B) Titolo"  (lettera: solo cambio livello)
+    #     "#### "          → rimosso           (vuoti)
+    text = re.sub(
+        r"^#{3,6}\s*$",
+        "",
+        text,
+        flags=re.MULTILINE,
+    )
+    text = re.sub(
+        r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
+        lambda m: f"### {m.group(2)}. {m.group(3)}",
+        text,
+        flags=re.MULTILINE,
+    )
+    text = re.sub(
+        r"^#{4,6}\s+(.+)$",
+        r"### \1",
+        text,
+        flags=re.MULTILINE,
+    )
+
+    # 1. Rimuovi **bold** negli header esistenti: ## **Titolo** → ## Titolo
+    text = re.sub(
+        r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
+        r"\1 \2",
+        text, flags=re.MULTILINE,
+    )
+
+    # 1b. Normalizza header ALL-CAPS → sentence-case
+    def _norm_allcaps_header(m: re.Match) -> str:
+        hashes, content = m.group(1), m.group(2).strip()
+        letters = [c for c in content if c.isalpha()]
+        if letters and all(c.isupper() for c in letters):
+            return f"{hashes} {_sentence_case(content)}"
+        return m.group(0)
+
+    text = re.sub(r"^(#{1,6}) (.+)$", _norm_allcaps_header, text, flags=re.MULTILINE)
+
+    # 2. Rimuovi righe TOC: header "# Indice", "# Contents", ecc.
+    #     Rimuove la riga stessa; le voci subordinate (dot-leader) sono già rimosse da 0b_pre.
+    #     L'header rimasto senza corpo viene poi eliminato dal transform 9.
+    lines = text.split("\n")
+    new_lines = []
+    for line in lines:
+        # Stripping del prefisso markdown (##, #, ecc.) prima del confronto keyword
+        bare = re.sub(r"^#+\s*", "", line.strip())
+        first_word = bare.split(".")[0].strip().lower()
+        if first_word in _TOC_KEYWORDS:
+            stats["toc_rimosso"] = True
+        else:
+            new_lines.append(line)
+    text = "\n".join(new_lines)
+
+    # 3. Converti righe ALL-CAPS standalone → ## header
+    blocks = text.split("\n\n")
+    new_blocks = []
+    for block in blocks:
+        stripped = block.strip()
+        if "\n" not in stripped and _is_allcaps_line(stripped):
+            new_blocks.append(_allcaps_to_header(stripped))
+            stats["n_header_allcaps"] += 1
+        else:
+            sub_lines = block.split("\n")
+            converted = []
+            for ln in sub_lines:
+                if _is_allcaps_line(ln) and len(ln.strip()) > 3:
+                    converted.append(_allcaps_to_header(ln))
+                    stats["n_header_allcaps"] += 1
+                else:
+                    converted.append(ln)
+            new_blocks.append("\n".join(converted))
+    text = "\n\n".join(new_blocks)
+
+    # 4. Converti sezioni numerate "N. testo" → "### N.\n\ntesto"
+    #     Guarda che il testo non sia una frase completa (es. esercizi numerati):
+    #     se termina con "." ed è più lungo di 40 caratteri, è probabilmente una frase,
+    #     non un titolo di sezione → lascia invariato.
+    def _num_repl(m: re.Match) -> str:
+        content = m.group(2).strip()
+        if content.endswith(".") and len(content) > 40:
+            return m.group(0)
+        stats["n_sezioni_numerate"] += 1
+        return f"### {m.group(1)}.\n\n{content}"
+
+    text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)
+
+    def _num_letter_repl(m: re.Match) -> str:
+        stats["n_sezioni_numerate"] += 1
+        return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"
+
+    text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)
+
+    # 4b. Converti "- N. testo" sezioni con punto → "### N.\n\ntesto"
+    #     "- 1. Testo del primo punto..."  → "### 1.\n\nTesto del primo punto..."
+    #     Deve precedere 4c: "- N." ha il punto, "- N testo" no.
+    #     Disabilitato se il documento contiene sezioni "Esercizi": in quel caso i
+    #     "- N. testo" sono numerazioni di esercizi, non header di sezione.
+    if not _has_exercise_sections:
+        def _aphorism_repl(m: re.Match) -> str:
+            stats["n_sezioni_numerate"] += 1
+            return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}"
+
+        text = re.sub(
+            r"^-\s+(\d{1,3})\.\s+(.{10,})$",
+            _aphorism_repl,
+            text,
+            flags=re.MULTILINE,
+        )
+
+    # 4c. Converti "- N testo" list item numerati → "### N.\n\ntesto"
+    #     "- 12 Titolo sezione Corpo della sezione..." → "### 12. Titolo sezione\n\nCorpo..."
+    #     Non tocca "- a) testo", "- 1) testo" (già gestiti come liste)
+    def _list_section_repl(m: re.Match) -> str:
+        num = m.group(1)
+        content = m.group(2).strip()
+        stats["n_sezioni_numerate"] += 1
+        # Separa titolo da corpo: il titolo finisce dove una lettera minuscola
+        # è seguita da spazio e maiuscola (confine fine-titolo / inizio-corpo)
+        split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content)
+        if split and split.start() >= 3:
+            title = content[: split.start()].strip()
+            body = content[split.end() :].strip()
+            if len(body) >= 20:
+                return f"\n\n### {num}. {title}\n\n{body}"
+        # Nessun body inline: il content è solo il titolo
+        return f"\n\n### {num}. {content}"
+
+    text = re.sub(
+        r"^-\s+(\d{1,3})\s+([A-ZÀÈÉÌÍÒÓÙÚ\'L].{10,})$",
+        _list_section_repl,
+        text,
+        flags=re.MULTILINE,
+    )
+
+    # 5. Unisci paragrafi spezzati da salti pagina PDF
+    _SENTENCE_END = set(".?!»)\"'")
+    blocks = text.split("\n\n")
+    merged = []
+    i = 0
+    while i < len(blocks):
+        b = blocks[i]
+        stripped = b.strip()
+        while (
+            i + 1 < len(blocks)
+            and stripped
+            and not stripped.startswith("#")
+            and stripped[-1] not in _SENTENCE_END
+        ):
+            nxt = blocks[i + 1].strip()
+            if not nxt or nxt.startswith("#") or re.match(r"^\d+\.", nxt):
+                break
+            b = stripped + " " + nxt
+            stripped = b.strip()
+            stats["n_paragrafi_uniti"] += 1
+            i += 1
+        merged.append(b)
+        i += 1
+    text = "\n\n".join(merged)
+
+    # 6. Normalizza whitespace multiplo interno alle righe
+    lines = text.split("\n")
+    text = "\n".join(
+        re.sub(r"  +", " ", line) if line.strip() else line
+        for line in lines
+    )
+
+    # 7. Riduci righe vuote multiple a doppie
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    # 8. Rimuovi righe che sono solo URL (watermark, footer di piattaforme)
+    text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
+
+    # 9. Rimuovi header senza corpo: header seguito solo da righe vuote e poi
+    #    da un altro header o dalla fine del testo (sezioni vuote / watermark)
+    blocks = re.split(r"\n{2,}", text)
+    cleaned = []
+    for i, block in enumerate(blocks):
+        stripped = block.strip()
+        if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped:
+            next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
+            if not next_stripped or re.match(r"^#{1,6} ", next_stripped):
+                continue  # header senza corpo → scarta
+        cleaned.append(block)
+    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+
+    return text, stats
+
+
+# ─── [4] Rilevamento struttura (step-3 / detect_structure.py) ────────────────
+#
+# Logica identica a step-3/detect_structure.py — mantenuta sincronizzata.
+
+_IT_WORDS = frozenset([
+    "il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
+    "con", "da", "del", "della", "dei", "in", "ma", "se", "lo", "le",
+    "gli", "al", "alla", "ai", "alle", "sono", "ha", "hanno", "era",
+    "erano", "nel", "nella", "nei", "nelle", "questo", "questa", "così",
+])
+_EN_WORDS = frozenset([
+    "the", "of", "and", "to", "in", "is", "that", "it", "was", "for",
+    "on", "are", "as", "with", "his", "they", "at", "be", "this", "have",
+    "from", "or", "an", "but", "not", "by", "he", "she", "we", "you",
+    "which", "their", "been", "has", "would", "there", "when", "will",
+])
+
+
+def _detect_language(text: str) -> str:
+    words = re.findall(r"\b[a-zA-Z]{2,}\b", text.lower())
+    sample = words[:2000]
+    it = sum(1 for w in sample if w in _IT_WORDS)
+    en = sum(1 for w in sample if w in _EN_WORDS)
+    if it == 0 and en == 0:
+        return "unknown"
+    return "it" if it >= en else "en"
+
+
+def _count_headers(text: str, level: int) -> int:
+    prefix = "#" * level + " "
+    return len(re.findall(rf"(?m)^{re.escape(prefix)}", text))
+
+
+def _count_paragraphs(text: str) -> int:
+    blocks = re.split(r"\n{2,}", text)
+    return sum(1 for b in blocks if b.strip() and not re.match(r"^#+\s", b.strip()))
+
+
+def _split_sections(text: str, level: int) -> list[str]:
+    prefix = "#" * level + " "
+    parts = re.split(rf"(?m)^{re.escape(prefix)}.+", text)
+    return [p for p in parts[1:] if p.strip()]
+
+
+def analyze(md_path: Path) -> dict:
+    text = md_path.read_text(encoding="utf-8")
+    n_h1 = _count_headers(text, 1)
+    n_h2 = _count_headers(text, 2)
+    n_h3 = _count_headers(text, 3)
+    n_paragrafi = _count_paragraphs(text)
+
+    if n_h3 >= 5:
+        livello, boundary, strategia = 3, "h3", "h3_aware"
+        section_bodies = _split_sections(text, 3)
+    elif n_h2 >= 3:
+        livello, boundary, strategia = 2, "h2", "h2_paragraph_split"
+        section_bodies = _split_sections(text, 2)
+    elif n_h1 + n_h2 + n_h3 >= 1:
+        livello, boundary, strategia = 1, "paragrafo", "paragraph"
+        section_bodies = [b for b in re.split(r"\n{2,}", text) if b.strip()]
+    elif n_paragrafi >= 3:
+        livello, boundary, strategia = 1, "paragrafo", "paragraph"
+        section_bodies = [b for b in re.split(r"\n{2,}", text) if b.strip()]
+    else:
+        livello, boundary, strategia = 0, "nessuno", "sliding_window"
+        section_bodies = [text] if text.strip() else []
+
+    lengths = [len(b) for b in section_bodies if b.strip()]
+    lunghezza_media = int(sum(lengths) / len(lengths)) if lengths else 0
+    lingua = _detect_language(text)
+
+    avvertenze = []
+    short = sum(1 for l in lengths if l < 200)
+    long_ = sum(1 for l in lengths if l > 800)
+    if short:
+        avvertenze.append(f"{short} sezioni sotto i 200 caratteri — verranno accorpate")
+    if long_:
+        avvertenze.append(f"{long_} sezioni sopra i 800 caratteri — verranno divise")
+
+    return {
+        "livello_struttura": livello,
+        "n_h1": n_h1,
+        "n_h2": n_h2,
+        "n_h3": n_h3,
+        "n_paragrafi": n_paragrafi,
+        "boundary_primario": boundary,
+        "lingua_rilevata": lingua,
+        "lunghezza_media_sezione": lunghezza_media,
+        "strategia_chunking": strategia,
+        "avvertenze": avvertenze,
+    }
+
+
+# ─── Pipeline principale ──────────────────────────────────────────────────────
+
+def run(stem: str, project_root: Path, force: bool) -> bool:
+    pdf_path = project_root / "sources" / f"{stem}.pdf"
+    out_dir = project_root / "conversion" / stem
+    raw_out = out_dir / "raw.md"
+    clean_out = out_dir / "clean.md"
+    profile_out = out_dir / "structure_profile.json"
+
+    print(f"\n{'─' * 52}")
+    print(f"  {stem}")
+    print(f"{'─' * 52}")
+
+    if clean_out.exists() and not force:
+        print(f"  ⚠️  conversion/{stem}/clean.md già presente — skip")
+        print(f"      (usa --force per rieseguire)")
+        return True
+
+    # ── [1] Validazione ────────────────────────────────────────────────────
+    print("  [1/4] Validazione PDF...")
+    ok, msg = check_pdf(pdf_path)
+    if not ok:
+        print(f"  ✗ {msg}")
+        return False
+    print(f"  ✅ {msg}")
+
+    # ── [2] Conversione ────────────────────────────────────────────────────
+    print("  [2/4] Conversione PDF → Markdown (opendataloader-pdf)...")
+    with tempfile.TemporaryDirectory() as tmp:
+        try:
+            md_file = convert_pdf(pdf_path, Path(tmp))
+        except Exception as e:
+            print(f"  ✗ Conversione fallita: {e}")
+            return False
+        raw_text = md_file.read_text(encoding="utf-8")
+
+    size_kb = len(raw_text.encode()) // 1024
+    n_lines = raw_text.count("\n")
+    print(f"  ✅ Markdown grezzo: {size_kb} KB, {n_lines} righe")
+
+    # ── [3] Pulizia strutturale ────────────────────────────────────────────
+    print("  [3/4] Pulizia strutturale...")
+    clean_text, t_stats = apply_transforms(raw_text)
+    reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
+    print(f"  ✅ Immagini rimosse:      {t_stats['n_immagini_rimosse']}")
+    print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
+    print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
+    print(f"     Header concat fixati:  {t_stats['n_header_concat_fixati']}")
+    print(f"     TOC rimosso:           {'sì' if t_stats['toc_rimosso'] else 'no'}")
+    print(f"     ALL-CAPS → ##:         {t_stats['n_header_allcaps']}")
+    print(f"     Sezioni → ###:         {t_stats['n_sezioni_numerate']}")
+    print(f"     Paragrafi uniti:       {t_stats['n_paragrafi_uniti']}")
+    print(f"     Riduzione testo:       {reduction:.0f}%")
+
+    # ── [4] Profilo strutturale ────────────────────────────────────────────
+    print("  [4/4] Analisi struttura...")
+    out_dir.mkdir(parents=True, exist_ok=True)
+    raw_out.write_text(raw_text, encoding="utf-8")
+    clean_out.write_text(clean_text, encoding="utf-8")
+    profile = analyze(clean_out)
+    profile_out.write_text(json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
+    print(f"  ✅ Struttura: livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")
+    print(f"     h1={profile['n_h1']}  h2={profile['n_h2']}  h3={profile['n_h3']}  "
+          f"paragrafi={profile['n_paragrafi']}")
+    print(f"     Strategia chunking: {profile['strategia_chunking']}")
+    print(f"     Lingua rilevata:    {profile['lingua_rilevata']}")
+    for w in profile["avvertenze"]:
+        print(f"     ⚠️  {w}")
+
+    print(f"\n  Output:")
+    print(f"    conversion/{stem}/raw.md               (immutabile)")
+    print(f"    conversion/{stem}/clean.md")
+    print(f"    conversion/{stem}/structure_profile.json")
+    print(f"\n  Prossimo passo: python step-5/chunker.py --stem {stem}")
+    return True
+
+
+# ─── Entry point ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    project_root = Path(__file__).parent.parent
+
+    parser = argparse.ArgumentParser(
+        description="Pipeline PDF → clean Markdown (sostituisce step 0+1+2+3+4)",
+        epilog=(
+            "Output compatibile con step-5+.\n"
+            "Prerequisiti: pip install opendataloader-pdf  +  Java 11+ sul PATH"
+        ),
+    )
+    parser.add_argument(
+        "--stem",
+        help="Nome del documento (PDF in sources/<stem>.pdf). "
+             "Se omesso, elabora tutti i PDF in sources/.",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Riesegui anche se clean.md è già presente",
+    )
+    args = parser.parse_args()
+
+    _check_deps()
+
+    if args.stem:
+        stems = [args.stem]
+    else:
+        sources_dir = project_root / "sources"
+        if not sources_dir.exists():
+            print("Errore: cartella sources/ non trovata")
+            sys.exit(1)
+        stems = sorted(p.stem for p in sources_dir.glob("*.pdf"))
+        if not stems:
+            print("Errore: nessun PDF trovato in sources/")
+            sys.exit(1)
+
+    results = [run(s, project_root, args.force) for s in stems]
+    ok = sum(results)
+    total = len(results)
+    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti convertiti")
+    sys.exit(0 if all(results) else 1)
diff --git a/requirements.txt b/requirements.txt
index a30e6e4..6cc5bce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,9 @@ pdfplumber==0.11.9
 # Step 2 — Conversione PDF → Markdown
 pymupdf4llm
 
+# conversione/ — Pipeline automatica PDF → clean Markdown (alternativa a step 0+1+2+3+4)
+# Richiede anche Java 11+ sul PATH: https://adoptium.net/
+opendataloader-pdf
+
 # Step 8 — Vettorizzazione
 chromadb

From 2545d834a9d02d28a241f3419ae9a16598dbea0a Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 16 Apr 2026 15:30:59 +0200
Subject: [PATCH 02/22] refactor(pdf-to-md): rimuovi riferimenti agli step
 interni da conversione/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pipeline.py è una pipeline autonoma e non deve nominare la suddivisione
interna del progetto (step-0..4). Aggiornati docstring, commenti sezione,
messaggi di output e argparse description.
---
 conversione/pipeline.py | 52 ++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 5ed919d..8b185e7 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -1,22 +1,23 @@
 #!/usr/bin/env python3
 """
-conversion/pipeline.py — PDF → clean Markdown (pipeline automatica)
+conversione/pipeline.py — PDF → clean Markdown (pipeline automatica)
 
-Sostituisce step-0 + step-1 + step-2 + step-3 + step-4 in un solo comando,
-senza operazioni manuali.
+Converte un PDF grezzo in Markdown strutturato e pulito, pronto per la
+suddivisione in chunk. Gestisce validazione, estrazione testo, pulizia
+strutturale e rilevamento automatico della struttura del documento.
 
 Usa opendataloader-pdf (algoritmo XY-Cut++ per ordine di lettura corretto,
-testo fluente, struttura preservata) al posto di pymupdf4llm.
+testo fluente, struttura preservata).
 
-Output (compatibile con step-5+):
-  conversion/<stem>/raw.md                — output grezzo opendataloader (immutabile)
-  conversion/<stem>/clean.md              — MD pulito e strutturato
-  conversion/<stem>/structure_profile.json
+Output per ciascuno stem:
+  conversione/<stem>/raw.md                — Markdown grezzo (immutabile)
+  conversione/<stem>/clean.md              — Markdown pulito e strutturato
+  conversione/<stem>/structure_profile.json
 
 Uso:
-    python conversion/pipeline.py --stem <nome>
-    python conversion/pipeline.py                       # tutti i PDF in sources/
-    python conversion/pipeline.py --stem <nome> --force # forza riesecuzione
+    python conversione/pipeline.py --stem <nome>
+    python conversione/pipeline.py                       # tutti i PDF in sources/
+    python conversione/pipeline.py --stem <nome> --force # forza riesecuzione
 
 Prerequisiti:
     pip install opendataloader-pdf
@@ -55,7 +56,7 @@ def _check_deps() -> None:
         sys.exit(1)
 
 
-# ─── [1] Validazione PDF (step-0 + step-1) ────────────────────────────────────
+# ─── [1] Validazione PDF ─────────────────────────────────────────────────────
 
 def check_pdf(pdf_path: Path) -> tuple[bool, str]:
     """
@@ -93,7 +94,7 @@ def check_pdf(pdf_path: Path) -> tuple[bool, str]:
         return False, f"Impossibile aprire: {e}"
 
 
-# ─── [2] Conversione PDF → Markdown (step-2) ─────────────────────────────────
+# ─── [2] Conversione PDF → Markdown ─────────────────────────────────────────
 
 def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
     """
@@ -131,9 +132,7 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
     return md_file
 
 
-# ─── [3] Pulizia strutturale (step-4 / revise.py) ────────────────────────────
-#
-# Logica identica a step-4/revise.py — mantenuta sincronizzata.
+# ─── [3] Pulizia strutturale ─────────────────────────────────────────────────
 
 _TOC_KEYWORDS = frozenset([
     "indice", "index", "contents", "table of contents",
@@ -473,9 +472,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     return text, stats
 
 
-# ─── [4] Rilevamento struttura (step-3 / detect_structure.py) ────────────────
-#
-# Logica identica a step-3/detect_structure.py — mantenuta sincronizzata.
+# ─── [4] Rilevamento struttura ───────────────────────────────────────────────
 
 _IT_WORDS = frozenset([
     "il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
@@ -580,7 +577,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     print(f"{'─' * 52}")
 
     if clean_out.exists() and not force:
-        print(f"  ⚠️  conversion/{stem}/clean.md già presente — skip")
+        print(f"  ⚠️  conversione/{stem}/clean.md già presente — skip")
         print(f"      (usa --force per rieseguire)")
         return True
 
@@ -638,10 +635,10 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
         print(f"     ⚠️  {w}")
 
     print(f"\n  Output:")
-    print(f"    conversion/{stem}/raw.md               (immutabile)")
-    print(f"    conversion/{stem}/clean.md")
-    print(f"    conversion/{stem}/structure_profile.json")
-    print(f"\n  Prossimo passo: python step-5/chunker.py --stem {stem}")
+    print(f"    conversione/{stem}/raw.md               (immutabile)")
+    print(f"    conversione/{stem}/clean.md")
+    print(f"    conversione/{stem}/structure_profile.json")
+    print(f"\n  clean.md pronto per la suddivisione in chunk.")
     return True
 
 
@@ -651,11 +648,8 @@ if __name__ == "__main__":
     project_root = Path(__file__).parent.parent
 
     parser = argparse.ArgumentParser(
-        description="Pipeline PDF → clean Markdown (sostituisce step 0+1+2+3+4)",
-        epilog=(
-            "Output compatibile con step-5+.\n"
-            "Prerequisiti: pip install opendataloader-pdf  +  Java 11+ sul PATH"
-        ),
+        description="Pipeline PDF → clean Markdown strutturato, pronto per chunking",
+        epilog="Prerequisiti: pip install opendataloader-pdf  +  Java 11+ sul PATH",
     )
     parser.add_argument(
         "--stem",

From 6ec54c8616497aa09b4a104769d3c5a77f319f33 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 16 Apr 2026 15:35:42 +0200
Subject: [PATCH 03/22] docs(pdf-to-md): aggiungi README per conversione/

Spiega requisiti (Java 11+, opendataloader-pdf), setup, utilizzo,
output prodotti, tutte le trasformazioni strutturali e i tipi di
documento supportati.
---
 conversione/README.md | 175 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 conversione/README.md

diff --git a/conversione/README.md b/conversione/README.md
new file mode 100644
index 0000000..bb8e983
--- /dev/null
+++ b/conversione/README.md
@@ -0,0 +1,175 @@
+# conversione — PDF → Markdown pulito
+
+Pipeline automatica che trasforma un PDF grezzo in Markdown strutturato e
+pronto per la suddivisione in chunk. Gestisce l'intero processo: validazione
+del PDF, estrazione del testo, pulizia strutturale e analisi della struttura
+del documento.
+
+## Requisiti
+
+### Python
+```
+pip install opendataloader-pdf pdfplumber
+```
+
+### Java 11+
+`opendataloader-pdf` richiede Java sul PATH. Se non è installato:
+
+```bash
+# Ubuntu / Debian / WSL
+sudo apt install default-jdk
+
+# Verifica
+java -version
+```
+
+Download alternativo: https://adoptium.net/
+
+---
+
+## Utilizzo
+
+Posiziona il PDF in `sources/<nome>.pdf`, poi:
+
+```bash
+# Singolo documento
+python conversione/pipeline.py --stem <nome>
+
+# Tutti i PDF in sources/
+python conversione/pipeline.py
+
+# Forza la riesecuzione (sovrascrive output esistente)
+python conversione/pipeline.py --stem <nome> --force
+```
+
+Il parametro `--stem` è il nome del file PDF senza estensione.  
+Esempio: `sources/analisi1.pdf` → `--stem analisi1`
+
+---
+
+## Output
+
+Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
+
+| File | Descrizione |
+|------|-------------|
+| `raw.md` | Markdown grezzo estratto dal PDF — **non modificare** |
+| `clean.md` | Markdown pulito e strutturato — input per il chunker |
+| `structure_profile.json` | Profilo strutturale del documento |
+
+### structure_profile.json
+
+```json
+{
+  "livello_struttura": 3,
+  "n_h1": 1,
+  "n_h2": 6,
+  "n_h3": 163,
+  "n_paragrafi": 213,
+  "boundary_primario": "h3",
+  "lingua_rilevata": "it",
+  "lunghezza_media_sezione": 520,
+  "strategia_chunking": "h3_aware",
+  "avvertenze": []
+}
+```
+
+**`strategia_chunking`** indica come il chunker dovrebbe suddividere il documento:
+
+| Valore | Significato |
+|--------|-------------|
+| `h3_aware` | Documento ricco di sezioni `###` — usa i `###` come boundary |
+| `h2_paragraph_split` | Struttura parziale `##` — suddividi per paragrafo dentro ogni `##` |
+| `paragraph` | Nessuna gerarchia chiara — suddividi per paragrafo |
+| `sliding_window` | Testo piatto — usa finestra scorrevole |
+
+---
+
+## Cosa fa la pipeline
+
+La pipeline esegue quattro fasi in sequenza.
+
+### Fase 1 — Validazione
+
+Verifica che il PDF esista, non sia vuoto, non sia protetto da password e
+contenga testo digitale estraibile. I PDF scansionati (immagini) non sono
+supportati.
+
+### Fase 2 — Estrazione testo
+
+Usa `opendataloader-pdf` con l'algoritmo **XY-Cut++** per ricostruire il
+corretto ordine di lettura anche in documenti multi-colonna. Le immagini
+vengono ignorate completamente — il `clean.md` non contiene mai riferimenti
+a immagini.
+
+### Fase 3 — Pulizia strutturale
+
+Serie di trasformazioni applicate al Markdown grezzo:
+
+| Trasformazione | Problema risolto |
+|----------------|-----------------|
+| Rimozione riferimenti immagini | Artefatti `![...]()` lasciati dal convertitore |
+| Fix accenti backtick LaTeX | `` `e``→`è`, ``puo` ``→`può`, ``sar`a``→`sarà` |
+| Rimozione dot-leader TOC | `- 1.1 Titolo . . . . . 42` (voci indice) |
+| Rimozione numerali romani pagina | `i`, `ii`, `iii` su riga isolata (footer LaTeX) |
+| Fix header + body concatenati | `### 11 TitoloCorpo testo...` → header + paragrafo separati |
+| Estrazione header Capitolo inline | `Capitolo 3: IL TITOLO` nel corpo → `## Capitolo 3: ...` |
+| Normalizzazione livelli header | `####`, `#####` → `###` (gerarchia uniforme a 3 livelli) |
+| Rimozione bold negli header | `## **Titolo**` → `## Titolo` |
+| Normalizzazione ALL-CAPS header | `## IL TITOLO` → `## Il titolo` |
+| Rimozione TOC | Blocchi indice/sommario rilevati per keyword |
+| ALL-CAPS standalone → header | Righe in maiuscolo isolate → `## Titolo` |
+| Sezioni numerate → header | `N. Titolo sezione` → `### N.` + corpo |
+| Sezioni con punto → header | `- N. Testo aforisma...` → `### N.` + corpo |
+| Sezioni lista numerate → header | `- N Titolo Corpo testo...` → `### N. Titolo` + corpo |
+| Unione paragrafi spezzati | Paragrafi tagliati dal salto pagina PDF ricongiunti |
+| Normalizzazione whitespace | Spazi multipli ridotti a singoli |
+| Riduzione righe vuote | Tre o più righe vuote consecutive → due |
+| Rimozione URL watermark | `www.piattaforma.com`, `https://...` su riga isolata |
+| Rimozione header senza corpo | Sezioni vuote e header watermark scartati |
+
+> **Rilevamento automatico tipo documento**: se il documento contiene sezioni
+> "Esercizi" (libri di testo accademici), la conversione dei numeri di esercizio
+> in header viene disabilitata automaticamente.
+
+### Fase 4 — Analisi struttura
+
+Rileva la gerarchia del documento (conteggio `#`/`##`/`###`), la lingua
+(italiano / inglese / sconosciuta), la lunghezza media delle sezioni e
+suggerisce la strategia di chunking ottimale. I risultati sono scritti in
+`structure_profile.json`.
+
+---
+
+## Tipi di documento supportati
+
+| Tipo | Esempi | Note |
+|------|--------|------|
+| Testo giuridico / accademico | Manuali, dispense, codici | Header numerati `N.` e `N.N` |
+| Filosofia / saggistica | Aforismi numerati, capitoli | Pattern `- N. testo` |
+| Matematica / LaTeX | Analisi, algebra, fisica | Fix accenti, TOC, numerali romani |
+| Testo generico strutturato | Qualsiasi PDF digitale | Paragrafi e header standard |
+
+**Non supportati**: PDF scansionati (solo immagini), PDF protetti da password.
+
+---
+
+## Log di esecuzione
+
+Durante l'esecuzione la pipeline stampa le statistiche di ogni trasformazione:
+
+```
+  [3/4] Pulizia strutturale...
+  ✅ Immagini rimosse:      0
+     Accenti corretti:      3701
+     Dot-leader rimossi:    53
+     Header concat fixati:  0
+     TOC rimosso:           sì
+     ALL-CAPS → ##:         14
+     Sezioni → ###:         279
+     Paragrafi uniti:       12998
+     Riduzione testo:       3%
+```
+
+Se un documento è già stato convertito, la pipeline lo salta automaticamente.
+Usa `--force` per rieseguire.

From 5b6940e47937b309517ca30a917d40f41b6d609d Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 16 Apr 2026 15:53:46 +0200
Subject: [PATCH 04/22] feat(pdf-to-md): sostituisci report.md con report.json
 + validate.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pipeline.py produce conversione/<stem>/report.json invece di
structure_profile.json + report.md. Il JSON contiene tutto:
trasformazioni, struttura, distribuzione lunghezze sezioni,
anomalie (bare_headers, short/long sections) e residui con esempi.

Fix: bare_headers segnala solo gli header solo-numero (`### N.`) il cui
corpo è vuoto o più corto di 30 caratteri; gli header numerati con corpo
lungo (es. aforismi) non sono anomalie.

Nuovo validate.py legge tutti i report.json e stampa tabella
di stato per ogni stem (✅ / ⚠️ / ❌) con soglie configurabili.

README aggiornato con sezione validazione batch e struttura report.json.
---
 conversione/README.md   |  87 +++++++++++++++++++++----
 conversione/pipeline.py | 132 ++++++++++++++++++++++++++++++++++++--
 conversione/validate.py | 136 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 338 insertions(+), 17 deletions(-)
 create mode 100644 conversione/validate.py

diff --git a/conversione/README.md b/conversione/README.md
index bb8e983..a5ef3fb 100644
--- a/conversione/README.md
+++ b/conversione/README.md
@@ -55,26 +55,49 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
 |------|-------------|
 | `raw.md` | Markdown grezzo estratto dal PDF — **non modificare** |
 | `clean.md` | Markdown pulito e strutturato — input per il chunker |
-| `structure_profile.json` | Profilo strutturale del documento |
+| `report.json` | Metriche complete di qualità della conversione |
 
-### structure_profile.json
+### report.json
+
+Contiene tutto ciò che serve per valutare la conversione: statistiche
+trasformazioni, struttura rilevata, distribuzione lunghezze sezioni,
+anomalie e problemi residui con esempi.
 
 ```json
 {
-  "livello_struttura": 3,
-  "n_h1": 1,
-  "n_h2": 6,
-  "n_h3": 163,
-  "n_paragrafi": 213,
-  "boundary_primario": "h3",
-  "lingua_rilevata": "it",
-  "lunghezza_media_sezione": 520,
-  "strategia_chunking": "h3_aware",
-  "avvertenze": []
+  "stem": "dirittoprivato",
+  "timestamp": "2026-04-16 15:41",
+  "transforms": {
+    "n_accenti_corretti": 0,
+    "n_dotleader_rimossi": 0,
+    "toc_rimosso": false,
+    "n_sezioni_numerate": 63,
+    "riduzione_pct": 1
+  },
+  "structure": {
+    "livello_struttura": 3,
+    "n_h1": 0, "n_h2": 6, "n_h3": 163,
+    "lingua_rilevata": "it",
+    "strategia_chunking": "h3_aware",
+    "avvertenze": []
+  },
+  "distribution": { "min": 12, "p25": 312, "mediana": 681, "p75": 1197, "max": 6120 },
+  "anomalie": {
+    "bare_headers": 0,
+    "short_sections": 1,
+    "long_sections": 39,
+    "bare_headers_list": [],
+    "short_sections_list": [...],
+    "long_sections_list": [...]
+  },
+  "residui": {
+    "backtick": 0, "dotleader": 0, "url": 0, "immagini": 0,
+    "backtick_esempi": []
+  }
 }
 ```
 
-**`strategia_chunking`** indica come il chunker dovrebbe suddividere il documento:
+**`strategia_chunking`** indica come suddividere il documento in chunk:
 
 | Valore | Significato |
 |--------|-------------|
@@ -85,6 +108,44 @@ Per ogni stem vengono prodotti tre file in `conversione/<stem>/`:
 
 ---
 
+## Validazione batch
+
+Dopo aver convertito uno o più documenti, esegui `validate.py` per ottenere
+una tabella di stato su tutti gli stem:
+
+```bash
+python conversione/validate.py
+```
+
+Output di esempio:
+
+```
+stem              h2   h3  strategia            bare corte lunghe backtick dotlead url  status
+──────────────────────────────────────────────────────────────────────────────────────────────
+analisi1          13  279  h3_aware                0    36    151       10       0   0  ⚠️
+dirittoprivato     6  163  h3_aware                0     1     39        0       0   0  ✅
+nietzsche          4  303  h3_aware                6   104    100        0       0   0  ⚠️
+──────────────────────────────────────────────────────────────────────────────────────────────
+Totale: 3  ✅ 1  ⚠️  2  ❌ 0
+```
+
+**Legenda colonne:**
+
+| Colonna | Significato | Soglia warning |
+|---------|-------------|----------------|
+| `bare` | Header solo-numero senza corpo (`### 1.` vuoto) | ≥ 1 |
+| `corte` | Sezioni con corpo < 150 chars | informativo |
+| `lunghe` | Sezioni con corpo > 1500 chars | ≥ 80 |
+| `backtick` | Backtick `` ` `` residui nel testo | ≥ 1 |
+| `dotlead` | Dot-leader residui (`. . . .`) | ≥ 1 |
+
+**Stato:**
+- ✅ nessuna anomalia critica
+- ⚠️ anomalie presenti, documento processabile ma da verificare
+- ❌ struttura non rilevata (`livello_struttura = 0`) o ≥ 50 backtick residui
+
+---
+
 ## Cosa fa la pipeline
 
 La pipeline esegue quattro fasi in sequenza.
diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 8b185e7..77d31b7 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -30,6 +30,7 @@ import re
 import subprocess
 import sys
 import tempfile
+from datetime import datetime
 from pathlib import Path
 
 
@@ -563,14 +564,136 @@ def analyze(md_path: Path) -> dict:
     }
 
 
+# ─── Report di conversione ───────────────────────────────────────────────────
+
+def build_report(
+    stem: str,
+    out_dir: Path,
+    clean_text: str,
+    t_stats: dict,
+    profile: dict,
+    reduction: float,
+) -> Path:
+    """
+    Genera conversione/<stem>/report.json con tutte le metriche di qualità:
+    statistiche trasformazioni, struttura, distribuzione lunghezze, anomalie
+    e problemi residui. Leggibile da validate.py per la validazione batch.
+    """
+    text_lines = clean_text.split("\n")
+
+    # ── Raccolta sezioni ### con corpo ────────────────────────────────────
+    sections: list[tuple[str, str]] = []
+    cur_hdr: str | None = None
+    cur_body: list[str] = []
+    for line in text_lines:
+        if re.match(r"^### ", line):
+            if cur_hdr is not None:
+                sections.append((cur_hdr, "\n".join(cur_body).strip()))
+            cur_hdr = line
+            cur_body = []
+        elif cur_hdr is not None:
+            cur_body.append(line)
+    if cur_hdr is not None:
+        sections.append((cur_hdr, "\n".join(cur_body).strip()))
+
+    lengths = [len(body) for _, body in sections]
+
+    # ── Distribuzione lunghezze ───────────────────────────────────────────
+    def _pct(data: list[int], p: float) -> int:
+        if not data:
+            return 0
+        s = sorted(data)
+        return s[max(0, min(len(s) - 1, int(len(s) * p)))]
+
+    distribution = {
+        "min":     min(lengths) if lengths else 0,
+        "p25":     _pct(lengths, 0.25),
+        "mediana": _pct(lengths, 0.50),
+        "p75":     _pct(lengths, 0.75),
+        "max":     max(lengths) if lengths else 0,
+    }
+
+    # ── Anomalie ──────────────────────────────────────────────────────────
+    # Header solo-numero senza corpo sostanziale: anomalia solo se il corpo
+    # è vuoto o < 30 chars. Un body lungo è una sezione numerata legittima
+    # (es. aforismi numerati dove il numero è l'identificatore della sezione).
+    bare_hdrs = [
+        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
+        for hdr, body in sections
+        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
+    ]
+
+    short_secs = [
+        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
+        for (hdr, body), length in zip(sections, lengths)
+        if 0 < length < 150
+    ]
+
+    long_secs = [
+        {"header": hdr, "chars": length}
+        for (hdr, _), length in zip(sections, lengths)
+        if length > 1500
+    ]
+
+    # ── Problemi residui (max 10 esempi ciascuno) ─────────────────────────
+    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
+        hits = []
+        for i, line in enumerate(text_lines):
+            if re.search(pattern, line) and not re.match(r"^#+ ", line):
+                hits.append({"riga": i + 1, "testo": line.strip()[:120]})
+                if len(hits) >= max_n:
+                    break
+        return hits
+
+    residui = {
+        "backtick":  _scan(r"`"),
+        "dotleader": _scan(r"(?:\. ){3,}"),
+        "url":       _scan(r"^(https?://|www\.)\S+"),
+        "immagini":  _scan(r"!\[[^\]]*\]\([^)]*\)"),
+    }
+
+    # ── Composizione report ───────────────────────────────────────────────
+    report = {
+        "stem": stem,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
+        "transforms": {
+            **t_stats,
+            "riduzione_pct": round(reduction),
+        },
+        "structure": profile,
+        "distribution": distribution,
+        "anomalie": {
+            "bare_headers":   len(bare_hdrs),
+            "short_sections": len(short_secs),
+            "long_sections":  len(long_secs),
+            "bare_headers_list":   bare_hdrs,
+            "short_sections_list": short_secs,
+            "long_sections_list":  long_secs,
+        },
+        "residui": {
+            "backtick":  len(residui["backtick"]),
+            "dotleader": len(residui["dotleader"]),
+            "url":       len(residui["url"]),
+            "immagini":  len(residui["immagini"]),
+            "backtick_esempi":  residui["backtick"],
+            "dotleader_esempi": residui["dotleader"],
+            "url_esempi":       residui["url"],
+            "immagini_esempi":  residui["immagini"],
+        },
+    }
+
+    report_path = out_dir / "report.json"
+    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    return report_path
+
+
 # ─── Pipeline principale ──────────────────────────────────────────────────────
 
 def run(stem: str, project_root: Path, force: bool) -> bool:
     pdf_path = project_root / "sources" / f"{stem}.pdf"
-    out_dir = project_root / "conversion" / stem
+    out_dir = project_root / "conversione" / stem
     raw_out = out_dir / "raw.md"
     clean_out = out_dir / "clean.md"
-    profile_out = out_dir / "structure_profile.json"
 
     print(f"\n{'─' * 52}")
     print(f"  {stem}")
@@ -623,7 +746,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     raw_out.write_text(raw_text, encoding="utf-8")
     clean_out.write_text(clean_text, encoding="utf-8")
     profile = analyze(clean_out)
-    profile_out.write_text(json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8")
 
     _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
     print(f"  ✅ Struttura: livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")
@@ -634,10 +756,12 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     for w in profile["avvertenze"]:
         print(f"     ⚠️  {w}")
 
+    build_report(stem, out_dir, clean_text, t_stats, profile, reduction)
+
     print(f"\n  Output:")
     print(f"    conversione/{stem}/raw.md               (immutabile)")
     print(f"    conversione/{stem}/clean.md")
-    print(f"    conversione/{stem}/structure_profile.json")
+    print(f"    conversione/{stem}/report.json")
     print(f"\n  clean.md pronto per la suddivisione in chunk.")
     return True
 
diff --git a/conversione/validate.py b/conversione/validate.py
new file mode 100644
index 0000000..e6c4023
--- /dev/null
+++ b/conversione/validate.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+conversione/validate.py — Validazione batch di tutti gli stem convertiti
+
+Legge i report.json prodotti da pipeline.py e stampa una tabella di stato
+per ogni documento, evidenziando anomalie e problemi residui.
+
+Stato per stem:
+  ✅  nessuna anomalia critica
+  ⚠️  anomalie presenti ma documento processabile
+  ❌  struttura non rilevata o problemi gravi
+
+Uso:
+    python conversione/validate.py              # tutti gli stem
+    python conversione/validate.py analisi1     # stem specifico
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+# ─── Soglie ──────────────────────────────────────────────────────────────────
+
+_CRITICO_STRUTTURA   = 0    # livello_struttura == 0 → testo piatto, nessun header
+_CRITICO_BACKTICK    = 50   # molti accenti non corretti → testo illeggibile
+_WARNING_BARE        = 1    # anche un solo header senza titolo è sospetto
+_WARNING_BACKTICK    = 1    # qualsiasi backtick residuo va verificato
+_WARNING_LONG_SECS   = 80   # troppe sezioni lunghe indica struttura insufficiente
+
+
+def _status(r: dict) -> str:
+    structure = r.get("structure", {})
+    anomalie  = r.get("anomalie", {})
+    residui   = r.get("residui", {})
+
+    livello  = structure.get("livello_struttura", -1)
+    backtick = residui.get("backtick", 0)
+
+    if livello <= _CRITICO_STRUTTURA or backtick >= _CRITICO_BACKTICK:
+        return "❌"
+    if (
+        anomalie.get("bare_headers", 0) >= _WARNING_BARE
+        or backtick >= _WARNING_BACKTICK
+        or anomalie.get("long_sections", 0) >= _WARNING_LONG_SECS
+    ):
+        return "⚠️ "
+    return "✅"
+
+
+def _fmt(value, width: int) -> str:
+    return str(value).ljust(width)
+
+
+def validate(stems: list[str], project_root: Path) -> None:
+    conv_dir = project_root / "conversione"
+
+    if stems:
+        paths = [conv_dir / s / "report.json" for s in stems]
+    else:
+        paths = sorted(conv_dir.glob("*/report.json"))
+
+    if not paths:
+        print("Nessun report.json trovato in conversione/*/")
+        sys.exit(0)
+
+    rows = []
+    for path in paths:
+        if not path.exists():
+            rows.append({"stem": path.parent.name, "_missing": True})
+            continue
+        r = json.loads(path.read_text(encoding="utf-8"))
+        rows.append(r)
+
+    # ── Intestazione ─────────────────────────────────────────────────────
+    col_stem    = max(len(r.get("stem", "stem")) for r in rows) + 2
+    header = (
+        f"{'stem':<{col_stem}}"
+        f"{'h2':>4}{'h3':>5}  "
+        f"{'strategia':<20}"
+        f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
+        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
+        f"  {'status'}"
+    )
+    sep = "─" * len(header)
+    print()
+    print(header)
+    print(sep)
+
+    # ── Righe ─────────────────────────────────────────────────────────────
+    for r in rows:
+        if r.get("_missing"):
+            print(f"{r['stem']:<{col_stem}}  (report.json non trovato)")
+            continue
+
+        stem      = r.get("stem", "?")
+        structure = r.get("structure", {})
+        anomalie  = r.get("anomalie", {})
+        residui   = r.get("residui", {})
+
+        h2       = structure.get("n_h2", 0)
+        h3       = structure.get("n_h3", 0)
+        strat    = structure.get("strategia_chunking", "?")
+        bare     = anomalie.get("bare_headers", 0)
+        corte    = anomalie.get("short_sections", 0)
+        lunghe   = anomalie.get("long_sections", 0)
+        backtick = residui.get("backtick", 0)
+        dotlead  = residui.get("dotleader", 0)
+        url      = residui.get("url", 0)
+        status   = _status(r)
+
+        print(
+            f"{stem:<{col_stem}}"
+            f"{h2:>4}{h3:>5}  "
+            f"{strat:<20}"
+            f"{bare:>5}{corte:>6}{lunghe:>7}"
+            f"{backtick:>9}{dotlead:>8}{url:>4}"
+            f"  {status}"
+        )
+
+    print(sep)
+    totali = len(rows)
+    ok  = sum(1 for r in rows if not r.get("_missing") and _status(r) == "✅")
+    warn = sum(1 for r in rows if not r.get("_missing") and _status(r).startswith("⚠"))
+    err = sum(1 for r in rows if not r.get("_missing") and _status(r) == "❌")
+    print(f"Totale: {totali}  ✅ {ok}  ⚠️  {warn}  ❌ {err}")
+    print()
+    print("Legenda colonne: bare=header senza titolo  corte=sezioni<150ch  "
+          "lunghe=sezioni>1500ch  backtick=accenti residui")
+    print()
+
+
+if __name__ == "__main__":
+    project_root = Path(__file__).parent.parent
+    stems = sys.argv[1:]
+    validate(stems, project_root)

From bcf2e688aaffa9486ffea575316cf54ff6f1c1de Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 16 Apr 2026 16:05:03 +0200
Subject: [PATCH 05/22] feat(validate): support single-file flags and explicit
 markdown score output

---
 conversione/validate.py | 199 ++++++++++++++++++++++++++++++++--------
 1 file changed, 160 insertions(+), 39 deletions(-)

diff --git a/conversione/validate.py b/conversione/validate.py
index e6c4023..6194367 100644
--- a/conversione/validate.py
+++ b/conversione/validate.py
@@ -2,56 +2,159 @@
 """
 conversione/validate.py — Validazione batch di tutti gli stem convertiti
 
-Legge i report.json prodotti da pipeline.py e stampa una tabella di stato
-per ogni documento, evidenziando anomalie e problemi residui.
+Legge i report.json prodotti da pipeline.py, stampa una tabella di stato
+e assegna un voto (0-100) a ogni documento per misurare la bontà del
+Markdown prodotto.
 
-Stato per stem:
-  ✅  nessuna anomalia critica
-  ⚠️  anomalie presenti ma documento processabile
-  ❌  struttura non rilevata o problemi gravi
+Voto:
+  90-100  A  — ottimo, pronto per il chunker
+  75-89   B  — buono, qualche sezione lunga ma accettabile
+  60-74   C  — accettabile, anomalie minori da verificare
+  40-59   D  — da rivedere, problemi strutturali o residui evidenti
+   0-39   F  — da riprocessare, struttura assente o testo corrotto
 
 Uso:
     python conversione/validate.py              # tutti gli stem
     python conversione/validate.py analisi1     # stem specifico
+    python conversione/validate.py --stem analisi1
+    python conversione/validate.py --analisi1   # compatibilità
 """
 
 import json
+import argparse
 import sys
 from pathlib import Path
 
 
-# ─── Soglie ──────────────────────────────────────────────────────────────────
+# ─── Punteggio ───────────────────────────────────────────────────────────────
 
-_CRITICO_STRUTTURA   = 0    # livello_struttura == 0 → testo piatto, nessun header
-_CRITICO_BACKTICK    = 50   # molti accenti non corretti → testo illeggibile
-_WARNING_BARE        = 1    # anche un solo header senza titolo è sospetto
-_WARNING_BACKTICK    = 1    # qualsiasi backtick residuo va verificato
-_WARNING_LONG_SECS   = 80   # troppe sezioni lunghe indica struttura insufficiente
+def _score(r: dict) -> int:
+    """
+    Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.
 
-
-def _status(r: dict) -> str:
+    Penalità:
+      - struttura assente o piatta        → -40 / -15
+      - backtick residui nel testo        → -2 per occorrenza (max -30)
+      - URL / watermark residui           → -5 per occorrenza (max -15)
+      - immagini residue                  → -5 per occorrenza (max -10)
+      - dot-leader residui                → -5 per occorrenza (max -10)
+      - header senza titolo (bare)        → -3 per occorrenza (max -15)
+      - troppe sezioni > 1500 chars       → -5 / -10 (in % sul totale h3)
+    """
+    score     = 100
     structure = r.get("structure", {})
     anomalie  = r.get("anomalie", {})
     residui   = r.get("residui", {})
 
-    livello  = structure.get("livello_struttura", -1)
-    backtick = residui.get("backtick", 0)
+    livello  = structure.get("livello_struttura", 0)
+    n_h3     = max(structure.get("n_h3", 0), 1)
 
-    if livello <= _CRITICO_STRUTTURA or backtick >= _CRITICO_BACKTICK:
-        return "❌"
-    if (
-        anomalie.get("bare_headers", 0) >= _WARNING_BARE
-        or backtick >= _WARNING_BACKTICK
-        or anomalie.get("long_sections", 0) >= _WARNING_LONG_SECS
-    ):
-        return "⚠️ "
-    return "✅"
+    # Struttura
+    if livello == 0:
+        score -= 40
+    elif livello == 1:
+        score -= 15
+
+    # Residui nel testo
+    score -= min(30, residui.get("backtick",  0) * 2)
+    score -= min(15, residui.get("url",       0) * 5)
+    score -= min(10, residui.get("immagini",  0) * 5)
+    score -= min(10, residui.get("dotleader", 0) * 5)
+
+    # Anomalie strutturali
+    score -= min(15, anomalie.get("bare_headers", 0) * 3)
+
+    # Sezioni troppo lunghe (in % sul totale delle sezioni ###)
+    long_ratio = anomalie.get("long_sections", 0) / n_h3
+    if long_ratio > 0.6:
+        score -= 10
+    elif long_ratio > 0.35:
+        score -= 5
+
+    return max(0, score)
 
 
-def _fmt(value, width: int) -> str:
-    return str(value).ljust(width)
+def _grade(score: int) -> str:
+    if score >= 90: return "A"
+    if score >= 75: return "B"
+    if score >= 60: return "C"
+    if score >= 40: return "D"
+    return "F"
 
 
+# ─── CLI ─────────────────────────────────────────────────────────────────────
+
+def _normalize_target(token: str) -> str:
+    """
+    Normalizza un target CLI in stem:
+      - analisi1
+      - --analisi1          (compatibilità)
+      - conversione/analisi1/report.json
+      - analisi1.pdf / analisi1.md / report.json
+    """
+    raw = token.strip()
+    if not raw:
+        return raw
+
+    # Compatibilità con invocazione tipo: --analisi1
+    if raw.startswith("--") and len(raw) > 2:
+        raw = raw[2:]
+
+    p = Path(raw)
+
+    # Path diretto al report
+    if p.name == "report.json" and p.parent.name:
+        return p.parent.name
+
+    name = p.name
+    if name.endswith((".pdf", ".md", ".json")):
+        name = Path(name).stem
+
+    return name
+
+
+def _parse_cli_args(argv: list[str]) -> list[str]:
+    parser = argparse.ArgumentParser(
+        description="Valida i report Markdown prodotti in conversione/<stem>/report.json"
+    )
+    parser.add_argument(
+        "targets",
+        nargs="*",
+        help="Stem, file o path da validare (es: analisi1 oppure conversione/analisi1/report.json)",
+    )
+    parser.add_argument(
+        "-s",
+        "--stem",
+        action="append",
+        default=[],
+        help="Stem specifico (ripetibile, es: --stem analisi1 --stem nietzsche)",
+    )
+
+    args, unknown = parser.parse_known_args(argv)
+
+    targets = [*args.targets, *args.stem]
+
+    # Compatibilità: `python validate.py --analisi1`
+    for tok in unknown:
+        if tok.startswith("--") and len(tok) > 2:
+            targets.append(tok[2:])
+        else:
+            parser.error(f"Argomento non riconosciuto: {tok}")
+
+    stems = []
+    seen = set()
+    for t in targets:
+        stem = _normalize_target(t)
+        if not stem or stem in seen:
+            continue
+        seen.add(stem)
+        stems.append(stem)
+
+    return stems
+
+
+# ─── Validazione ─────────────────────────────────────────────────────────────
+
 def validate(stems: list[str], project_root: Path) -> None:
     conv_dir = project_root / "conversione"
 
@@ -73,20 +176,23 @@ def validate(stems: list[str], project_root: Path) -> None:
         rows.append(r)
 
     # ── Intestazione ─────────────────────────────────────────────────────
-    col_stem    = max(len(r.get("stem", "stem")) for r in rows) + 2
+    col_stem = max(len(r.get("stem", "stem")) for r in rows) + 2
     header = (
         f"{'stem':<{col_stem}}"
         f"{'h2':>4}{'h3':>5}  "
         f"{'strategia':<20}"
         f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
         f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
-        f"  {'status'}"
+        f"  {'voto':>4}  {'grade'}"
     )
     sep = "─" * len(header)
     print()
     print(header)
     print(sep)
 
+    scores = []
+    scored_docs = []
+
     # ── Righe ─────────────────────────────────────────────────────────────
     for r in rows:
         if r.get("_missing"):
@@ -107,7 +213,11 @@ def validate(stems: list[str], project_root: Path) -> None:
         backtick = residui.get("backtick", 0)
         dotlead  = residui.get("dotleader", 0)
         url      = residui.get("url", 0)
-        status   = _status(r)
+
+        s = _score(r)
+        g = _grade(s)
+        scores.append(s)
+        scored_docs.append((stem, s, g))
 
         print(
             f"{stem:<{col_stem}}"
@@ -115,22 +225,33 @@ def validate(stems: list[str], project_root: Path) -> None:
             f"{strat:<20}"
             f"{bare:>5}{corte:>6}{lunghe:>7}"
             f"{backtick:>9}{dotlead:>8}{url:>4}"
-            f"  {status}"
+            f"  {s:>4}  {g}"
         )
 
+    # ── Riepilogo ─────────────────────────────────────────────────────────
     print(sep)
-    totali = len(rows)
-    ok  = sum(1 for r in rows if not r.get("_missing") and _status(r) == "✅")
-    warn = sum(1 for r in rows if not r.get("_missing") and _status(r).startswith("⚠"))
-    err = sum(1 for r in rows if not r.get("_missing") and _status(r) == "❌")
-    print(f"Totale: {totali}  ✅ {ok}  ⚠️  {warn}  ❌ {err}")
+    if scores:
+        media = sum(scores) / len(scores)
+        grade_media = _grade(int(media))
+        print(f"Documenti: {len(scores)}   "
+              f"Voto medio: {media:.0f}/100  {grade_media}   "
+              f"(A≥90  B≥75  C≥60  D≥40  F<40)")
+        if len(scored_docs) == 1:
+            stem, score, grade = scored_docs[0]
+            print(f"Voto finale Markdown ({stem}): {score}/100  {grade}")
+        else:
+            voti = ", ".join(
+                f"{stem}={score}/100 {grade}"
+                for stem, score, grade in scored_docs
+            )
+            print(f"Voti Markdown: {voti}")
     print()
-    print("Legenda colonne: bare=header senza titolo  corte=sezioni<150ch  "
-          "lunghe=sezioni>1500ch  backtick=accenti residui")
+    print("Penalità: struttura assente −40, backtick residui −2/cad, "
+          "bare headers −3/cad, sezioni >1500ch >35% −5")
     print()
 
 
 if __name__ == "__main__":
     project_root = Path(__file__).parent.parent
-    stems = sys.argv[1:]
+    stems = _parse_cli_args(sys.argv[1:])
     validate(stems, project_root)

From 265ac92b6c0fde915a723b13d9599e8d615ab8ec Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 07:47:56 +0200
Subject: [PATCH 06/22] =?UTF-8?q?feat(conversione):=207=20nuovi=20transfor?=
 =?UTF-8?q?m=20pipeline,=20refactor=20validate=20=E2=80=94=20media=2092?=
 =?UTF-8?q?=E2=86=9299/100?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- dot-leader continui, strip "- " in allcaps, backtick orfani LaTeX
- TOC list removal, extract_article_headers, extract_math_environments, merge_title_headers
- validate.py: interfaccia semplificata, rimosso codice morto
---
 conversione/pipeline.py | 206 ++++++++++++++++++++++++++++++++--
 conversione/validate.py | 238 +++++++++++++---------------------------
 2 files changed, 271 insertions(+), 173 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 77d31b7..0a6014b 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -169,7 +169,9 @@ def _is_allcaps_line(line: str) -> bool:
 
 
 def _allcaps_to_header(raw_line: str) -> str:
-    text = raw_line.strip().rstrip(".").rstrip("?").strip()
+    # Rimuovi eventuale prefisso di lista "- " o "* " prima di creare l'header
+    text = re.sub(r"^[-*+]\s+", "", raw_line.strip())
+    text = text.rstrip(".").rstrip("?").strip()
 
     _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys())
     m = re.match(rf"^CAPITOLO ({_ORD_IT_PAT})\. (.+)", text)
@@ -192,6 +194,152 @@ def _allcaps_to_header(raw_line: str) -> str:
     return f"## {_sentence_case(text)}"
 
 
+def _extract_math_environments(text: str) -> tuple[str, int]:
+    """
+    Converte paragrafi che iniziano con ambienti matematici in header ###.
+
+    'Teorema 1.6.3 (principio di induzione) Sia A ⊆ N...'
+    → '### Teorema 1.6.3 (principio di induzione)\n\nSia A ⊆ N...'
+
+    Riconosce: Definizione, Teorema, Lemma, Proposizione, Corollario,
+               Osservazione, Nota, Esempio (solo con numero di sezione).
+    Non tocca paragrafi che già iniziano con un header Markdown.
+    Deve girare PRIMA del merge paragrafi (step 5) per sfruttare i blocchi intatti.
+    """
+    _ENVS = (
+        r"Definizione|Teorema|Lemma|Proposizione|"
+        r"Corollario|Osservazione|Nota|Esempio"
+    )
+    count  = 0
+    blocks = text.split("\n\n")
+    result = []
+
+    for block in blocks:
+        stripped = block.strip()
+        if not stripped or stripped.startswith("#"):
+            result.append(block)
+            continue
+
+        m = re.match(
+            rf"^({_ENVS})\s+((?:\d+\.?){{1,4}})\s*(.*)",
+            stripped,
+            re.DOTALL,
+        )
+        if not m:
+            result.append(block)
+            continue
+
+        env  = m.group(1)
+        num  = m.group(2).rstrip(".")
+        rest = m.group(3).strip()
+
+        # Titolo opzionale tra parentesi: "(principio di induzione)"
+        title_m = re.match(r"^(\([^)]{2,60}\))\s+(.*)", rest, re.DOTALL)
+        if title_m:
+            header = f"### {env} {num} {title_m.group(1)}"
+            body   = title_m.group(2).strip()
+        else:
+            header = f"### {env} {num}."
+            body   = rest
+
+        result.append(f"{header}\n\n{body}" if body else header)
+        count += 1
+
+    return "\n\n".join(result), count
+
+
+def _merge_title_headers(text: str) -> tuple[str, int]:
+    """
+    Fonde header numerici isolati con il sottotitolo breve che li segue.
+
+    '### N.\n\nSottotitolo (riga singola ≤ 80 char, senza punto finale)'
+    → '### N. Sottotitolo'
+
+    Caso tipico: parti di un'opera (es. Nietzsche) dove il numero di sezione
+    e il titolo della sezione sono in blocchi Markdown separati.
+    Non tocca header con titolo già inline né header seguiti da testo lungo.
+    """
+    count  = 0
+    blocks = re.split(r"\n{2,}", text)
+    result = []
+    i = 0
+    while i < len(blocks):
+        block   = blocks[i]
+        stripped = block.strip()
+        if (
+            re.match(r"^#{2,3} \d+\.\s*$", stripped)
+            and i + 1 < len(blocks)
+        ):
+            nxt = blocks[i + 1].strip()
+            # Sottotitolo valido: riga singola, ≤ 80 char, non header, non numerazione pura
+            if (
+                nxt
+                and "\n" not in nxt
+                and len(nxt) <= 80
+                and not nxt.startswith("#")
+                and not re.match(r"^\d+[\.\)]\s", nxt)
+            ):
+                result.append(stripped.rstrip() + " " + nxt)
+                count += 1
+                i += 2
+                continue
+        result.append(block)
+        i += 1
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(result)), count
+
+
+def _extract_article_headers(text: str) -> tuple[str, int]:
+    """
+    Converte voci di articolo dal formato lista Markdown al formato header ###.
+
+    '- Art. N[suffix]. Titolo. Corpo testo...' → '### Art. N[suffix]. Titolo.\n\nCorpo testo...'
+    '- Art. N[suffix]. (…) (1)'               → '### Art. N[suffix].\n\n(…) (1)'
+
+    Gestisce suffissi come: Art. 4-bis., Art. 14-ter., Art. 1-quinquies.
+    Il titolo è la prima frase con iniziale maiuscola che termina con '.' prima di
+    ulteriore testo (es. "Leggi. La formazione..." → titolo "Leggi", corpo "La formazione...").
+    Se il testo non ha titolo separabile, tutto diventa il corpo.
+    """
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        num  = m.group(1)
+        rest = m.group(2).strip()
+
+        # Titolo: frase con iniziale maiuscola, max 75 char, termina con '.',
+        # seguita da almeno un'altra frase (minimo 5 char) che inizia con maiuscola
+        # o con '(' / cifra (note a piè o continuazione corpo).
+        title_m = re.match(
+            r"^([A-ZÀÈÉÌÍÒÓÙÚ].{1,74}?)\.\s+([A-ZÀÈÉÌÍÒÓÙÚ\(\d].{4,})",
+            rest,
+        )
+        if title_m:
+            count += 1
+            return (
+                f"### Art. {num}. {title_m.group(1)}.\n\n"
+                f"{title_m.group(2).strip()}"
+            )
+
+        # Nessun titolo separabile: tutto è corpo
+        if rest:
+            count += 1
+            return f"### Art. {num}.\n\n{rest}"
+
+        # Articolo senza testo inline (es. "- Art. 5. (…) (1)" già estratto sopra,
+        # oppure articolo vuoto nella lista)
+        count += 1
+        return f"### Art. {num}."
+
+    text = re.sub(
+        r"^-\s+Art\.\s+([\d]+[a-z\-]*)\.\s*(.*)",
+        _repl,
+        text,
+        flags=re.MULTILINE,
+    )
+    return text, count
+
+
 def apply_transforms(text: str) -> tuple[str, dict]:
     """
     Applica le trasformazioni strutturali al Markdown grezzo.
@@ -203,6 +351,9 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         "n_accenti_corretti": 0,
         "n_dotleader_rimossi": 0,
         "n_header_concat_fixati": 0,
+        "n_articoli_estratti": 0,
+        "n_ambienti_matematici": 0,
+        "n_titoli_uniti": 0,
         "n_header_allcaps": 0,
         "n_sezioni_numerate": 0,
         "n_paragrafi_uniti": 0,
@@ -224,13 +375,23 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
     stats["n_accenti_corretti"] = n_bt_before - text.count("`")
 
+    # Backtick orfani: artefatti LaTeX rimasti dopo la correzione vocale
+    # (es. "propriet`" da "proprietà", "continuit`" da "continuità").
+    # In testi PDF non esistono backtick legittimi → rimozione sicura.
+    n_bt_orfani = text.count("`")
+    if n_bt_orfani:
+        text = re.sub(r"`", "", text)
+        stats["n_accenti_corretti"] += n_bt_orfani
+
     # 0b_pre. Rimuovi righe con dot-leader (voci di indice/sommario)
     #     Esempi: "- 1.1 Alfabeto greco . . . . . . 1", "3.4 Continuità . . . . 205"
     #     Pattern: almeno 3 occorrenze di ". " consecutive nella riga
+    # Cattura sia ". . . ." (spazi) sia "......." (punti continui, tipici dei TOC PDF)
+    _DOTLEADER_RE = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$"
     stats["n_dotleader_rimossi"] = len(
-        re.findall(r"^[^\n]*(?:\. ){3,}[^\n]*$", text, re.MULTILINE)
+        re.findall(_DOTLEADER_RE, text, re.MULTILINE)
     )
-    text = re.sub(r"^[^\n]*(?:\. ){3,}[^\n]*$", "", text, flags=re.MULTILINE)
+    text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE)
 
     # 0b_pre2. Rimuovi righe che sono solo numerali romani (indicatori di pagina TOC)
     #     Esempi: "i", "ii", "iii", "iv", "v" su riga isolata (footer pagine indice LaTeX)
@@ -306,6 +467,12 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         flags=re.MULTILINE,
     )
 
+    # 0e. Converti voci articolo "- Art. N. Titolo. Corpo" → "### Art. N. Titolo.\n\nCorpo"
+    #     Eseguito dopo la promozione h4+ → h3 (0d) per non duplicare Art. già header.
+    #     Eseguito prima del merge paragrafi (5): il boundary ### previene la fusione.
+    text, n_art = _extract_article_headers(text)
+    stats["n_articoli_estratti"] = n_art
+
     # 1. Rimuovi **bold** negli header esistenti: ## **Titolo** → ## Titolo
     text = re.sub(
         r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
@@ -324,18 +491,26 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     text = re.sub(r"^(#{1,6}) (.+)$", _norm_allcaps_header, text, flags=re.MULTILINE)
 
     # 2. Rimuovi righe TOC: header "# Indice", "# Contents", ecc.
-    #     Rimuove la riga stessa; le voci subordinate (dot-leader) sono già rimosse da 0b_pre.
-    #     L'header rimasto senza corpo viene poi eliminato dal transform 9.
+    #     + le voci lista numeriche che seguono (TOC senza dot-leader, es. Nietzsche):
+    #       "- 1. Dei pregiudizi dei filosofi" → rimossa se viene subito dopo un header TOC.
+    #     Le voci con dot-leader sono già rimosse da 0b_pre.
+    #     Gli header rimasti senza corpo vengono poi eliminati dal transform 9.
     lines = text.split("\n")
     new_lines = []
+    _in_toc = False
     for line in lines:
-        # Stripping del prefisso markdown (##, #, ecc.) prima del confronto keyword
-        bare = re.sub(r"^#+\s*", "", line.strip())
+        bare       = re.sub(r"^#+\s*", "", line.strip())
         first_word = bare.split(".")[0].strip().lower()
         if first_word in _TOC_KEYWORDS:
             stats["toc_rimosso"] = True
-        else:
-            new_lines.append(line)
+            _in_toc = True
+            continue
+        if _in_toc:
+            # Salta righe vuote e voci lista numeriche (- N. Titolo / - N Titolo)
+            if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
+                continue
+            _in_toc = False
+        new_lines.append(line)
     text = "\n".join(new_lines)
 
     # 3. Converti righe ALL-CAPS standalone → ## header
@@ -419,6 +594,11 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         flags=re.MULTILINE,
     )
 
+    # 4d. Converti ambienti matematici (Teorema/Definizione/...) → ### header
+    #     Eseguito prima del merge paragrafi (5) per sfruttare i blocchi intatti.
+    text, n_math = _extract_math_environments(text)
+    stats["n_ambienti_matematici"] = n_math
+
     # 5. Unisci paragrafi spezzati da salti pagina PDF
     _SENTENCE_END = set(".?!»)\"'")
     blocks = text.split("\n\n")
@@ -470,6 +650,11 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         cleaned.append(block)
     text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
 
+    # 9b. Fondi header numerici isolati con il sottotitolo breve successivo
+    #     "### N.\n\nSottotitolo" → "### N. Sottotitolo"  (es. parti Nietzsche)
+    text, n_titoli = _merge_title_headers(text)
+    stats["n_titoli_uniti"] = n_titoli
+
     return text, stats
 
 
@@ -734,6 +919,9 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
     print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
     print(f"     Header concat fixati:  {t_stats['n_header_concat_fixati']}")
+    print(f"     Articoli → ###:        {t_stats['n_articoli_estratti']}")
+    print(f"     Ambienti matematici:   {t_stats['n_ambienti_matematici']}")
+    print(f"     Titoli header uniti:   {t_stats['n_titoli_uniti']}")
     print(f"     TOC rimosso:           {'sì' if t_stats['toc_rimosso'] else 'no'}")
     print(f"     ALL-CAPS → ##:         {t_stats['n_header_allcaps']}")
     print(f"     Sezioni → ###:         {t_stats['n_sezioni_numerate']}")
diff --git a/conversione/validate.py b/conversione/validate.py
index 6194367..51702d1 100644
--- a/conversione/validate.py
+++ b/conversione/validate.py
@@ -1,12 +1,10 @@
 #!/usr/bin/env python3
 """
-conversione/validate.py — Validazione batch di tutti gli stem convertiti
+conversione/validate.py — Validazione qualità Markdown
 
 Legge i report.json prodotti da pipeline.py, stampa una tabella di stato
-e assegna un voto (0-100) a ogni documento per misurare la bontà del
-Markdown prodotto.
+e assegna un voto (0-100) a ogni documento.
 
-Voto:
   90-100  A  — ottimo, pronto per il chunker
   75-89   B  — buono, qualche sezione lunga ma accettabile
   60-74   C  — accettabile, anomalie minori da verificare
@@ -16,57 +14,54 @@ Voto:
 Uso:
     python conversione/validate.py              # tutti gli stem
     python conversione/validate.py analisi1     # stem specifico
-    python conversione/validate.py --stem analisi1
-    python conversione/validate.py --analisi1   # compatibilità
+    python conversione/validate.py a b c        # stem multipli
 """
 
-import json
 import argparse
+import json
 import sys
 from pathlib import Path
 
 
 # ─── Punteggio ───────────────────────────────────────────────────────────────
 
+_GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]
+
+
 def _score(r: dict) -> int:
     """
     Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.
 
     Penalità:
-      - struttura assente o piatta        → -40 / -15
-      - backtick residui nel testo        → -2 per occorrenza (max -30)
-      - URL / watermark residui           → -5 per occorrenza (max -15)
-      - immagini residue                  → -5 per occorrenza (max -10)
-      - dot-leader residui                → -5 per occorrenza (max -10)
-      - header senza titolo (bare)        → -3 per occorrenza (max -15)
-      - troppe sezioni > 1500 chars       → -5 / -10 (in % sul totale h3)
+      struttura assente / piatta  → −40 / −15
+      backtick residui            → −2/cad (max −30)
+      URL / watermark             → −5/cad (max −15)
+      immagini residue            → −5/cad (max −10)
+      dot-leader residui          → −5/cad (max −10)
+      bare headers                → −3/cad (max −15)
+      sezioni >1500ch >35/60%     → −5 / −10
     """
-    score     = 100
+    score    = 100
     structure = r.get("structure", {})
-    anomalie  = r.get("anomalie", {})
-    residui   = r.get("residui", {})
+    anomalie  = r.get("anomalie",  {})
+    residui   = r.get("residui",   {})
 
-    livello  = structure.get("livello_struttura", 0)
-    n_h3     = max(structure.get("n_h3", 0), 1)
+    livello = structure.get("livello_struttura", 0)
+    n_h3    = max(structure.get("n_h3", 0), 1)
 
-    # Struttura
     if livello == 0:
         score -= 40
     elif livello == 1:
         score -= 15
 
-    # Residui nel testo
     score -= min(30, residui.get("backtick",  0) * 2)
     score -= min(15, residui.get("url",       0) * 5)
     score -= min(10, residui.get("immagini",  0) * 5)
     score -= min(10, residui.get("dotleader", 0) * 5)
-
-    # Anomalie strutturali
     score -= min(15, anomalie.get("bare_headers", 0) * 3)
 
-    # Sezioni troppo lunghe (in % sul totale delle sezioni ###)
     long_ratio = anomalie.get("long_sections", 0) / n_h3
-    if long_ratio > 0.6:
+    if long_ratio > 0.60:
         score -= 10
     elif long_ratio > 0.35:
         score -= 5
@@ -75,82 +70,7 @@ def _score(r: dict) -> int:
 
 
 def _grade(score: int) -> str:
-    if score >= 90: return "A"
-    if score >= 75: return "B"
-    if score >= 60: return "C"
-    if score >= 40: return "D"
-    return "F"
-
-
-# ─── CLI ─────────────────────────────────────────────────────────────────────
-
-def _normalize_target(token: str) -> str:
-    """
-    Normalizza un target CLI in stem:
-      - analisi1
-      - --analisi1          (compatibilità)
-      - conversione/analisi1/report.json
-      - analisi1.pdf / analisi1.md / report.json
-    """
-    raw = token.strip()
-    if not raw:
-        return raw
-
-    # Compatibilità con invocazione tipo: --analisi1
-    if raw.startswith("--") and len(raw) > 2:
-        raw = raw[2:]
-
-    p = Path(raw)
-
-    # Path diretto al report
-    if p.name == "report.json" and p.parent.name:
-        return p.parent.name
-
-    name = p.name
-    if name.endswith((".pdf", ".md", ".json")):
-        name = Path(name).stem
-
-    return name
-
-
-def _parse_cli_args(argv: list[str]) -> list[str]:
-    parser = argparse.ArgumentParser(
-        description="Valida i report Markdown prodotti in conversione/<stem>/report.json"
-    )
-    parser.add_argument(
-        "targets",
-        nargs="*",
-        help="Stem, file o path da validare (es: analisi1 oppure conversione/analisi1/report.json)",
-    )
-    parser.add_argument(
-        "-s",
-        "--stem",
-        action="append",
-        default=[],
-        help="Stem specifico (ripetibile, es: --stem analisi1 --stem nietzsche)",
-    )
-
-    args, unknown = parser.parse_known_args(argv)
-
-    targets = [*args.targets, *args.stem]
-
-    # Compatibilità: `python validate.py --analisi1`
-    for tok in unknown:
-        if tok.startswith("--") and len(tok) > 2:
-            targets.append(tok[2:])
-        else:
-            parser.error(f"Argomento non riconosciuto: {tok}")
-
-    stems = []
-    seen = set()
-    for t in targets:
-        stem = _normalize_target(t)
-        if not stem or stem in seen:
-            continue
-        seen.add(stem)
-        stems.append(stem)
-
-    return stems
+    return next(g for threshold, g in _GRADES if score >= threshold)
 
 
 # ─── Validazione ─────────────────────────────────────────────────────────────
@@ -158,100 +78,90 @@ def _parse_cli_args(argv: list[str]) -> list[str]:
 def validate(stems: list[str], project_root: Path) -> None:
     conv_dir = project_root / "conversione"
 
-    if stems:
-        paths = [conv_dir / s / "report.json" for s in stems]
-    else:
-        paths = sorted(conv_dir.glob("*/report.json"))
+    paths = (
+        [conv_dir / s / "report.json" for s in stems]
+        if stems
+        else sorted(conv_dir.glob("*/report.json"))
+    )
 
     if not paths:
         print("Nessun report.json trovato in conversione/*/")
         sys.exit(0)
 
-    rows = []
-    for path in paths:
-        if not path.exists():
-            rows.append({"stem": path.parent.name, "_missing": True})
-            continue
-        r = json.loads(path.read_text(encoding="utf-8"))
-        rows.append(r)
+    rows = [
+        json.loads(p.read_text(encoding="utf-8")) if p.exists()
+        else {"stem": p.parent.name, "_missing": True}
+        for p in paths
+    ]
 
     # ── Intestazione ─────────────────────────────────────────────────────
-    col_stem = max(len(r.get("stem", "stem")) for r in rows) + 2
+    col = max(len(r.get("stem", "stem")) for r in rows) + 2
     header = (
-        f"{'stem':<{col_stem}}"
+        f"{'stem':<{col}}"
         f"{'h2':>4}{'h3':>5}  "
         f"{'strategia':<20}"
         f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
         f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
-        f"  {'voto':>4}  {'grade'}"
+        f"  {'voto':>4}  grade"
     )
     sep = "─" * len(header)
-    print()
-    print(header)
-    print(sep)
+    print(f"\n{header}\n{sep}")
 
     scores = []
-    scored_docs = []
 
     # ── Righe ─────────────────────────────────────────────────────────────
     for r in rows:
         if r.get("_missing"):
-            print(f"{r['stem']:<{col_stem}}  (report.json non trovato)")
+            print(f"{r['stem']:<{col}}  (report.json non trovato)")
             continue
 
-        stem      = r.get("stem", "?")
-        structure = r.get("structure", {})
-        anomalie  = r.get("anomalie", {})
-        residui   = r.get("residui", {})
-
-        h2       = structure.get("n_h2", 0)
-        h3       = structure.get("n_h3", 0)
-        strat    = structure.get("strategia_chunking", "?")
-        bare     = anomalie.get("bare_headers", 0)
-        corte    = anomalie.get("short_sections", 0)
-        lunghe   = anomalie.get("long_sections", 0)
-        backtick = residui.get("backtick", 0)
-        dotlead  = residui.get("dotleader", 0)
-        url      = residui.get("url", 0)
-
-        s = _score(r)
-        g = _grade(s)
+        st  = r.get("structure", {})
+        an  = r.get("anomalie",  {})
+        res = r.get("residui",   {})
+        s   = _score(r)
         scores.append(s)
-        scored_docs.append((stem, s, g))
 
         print(
-            f"{stem:<{col_stem}}"
-            f"{h2:>4}{h3:>5}  "
-            f"{strat:<20}"
-            f"{bare:>5}{corte:>6}{lunghe:>7}"
-            f"{backtick:>9}{dotlead:>8}{url:>4}"
-            f"  {s:>4}  {g}"
+            f"{r['stem']:<{col}}"
+            f"{st.get('n_h2',              0):>4}"
+            f"{st.get('n_h3',              0):>5}  "
+            f"{st.get('strategia_chunking','?'):<20}"
+            f"{an.get('bare_headers',      0):>5}"
+            f"{an.get('short_sections',    0):>6}"
+            f"{an.get('long_sections',     0):>7}"
+            f"{res.get('backtick',         0):>9}"
+            f"{res.get('dotleader',        0):>8}"
+            f"{res.get('url',             0):>4}"
+            f"  {s:>4}  {_grade(s)}"
         )
 
     # ── Riepilogo ─────────────────────────────────────────────────────────
     print(sep)
     if scores:
         media = sum(scores) / len(scores)
-        grade_media = _grade(int(media))
-        print(f"Documenti: {len(scores)}   "
-              f"Voto medio: {media:.0f}/100  {grade_media}   "
-              f"(A≥90  B≥75  C≥60  D≥40  F<40)")
-        if len(scored_docs) == 1:
-            stem, score, grade = scored_docs[0]
-            print(f"Voto finale Markdown ({stem}): {score}/100  {grade}")
-        else:
-            voti = ", ".join(
-                f"{stem}={score}/100 {grade}"
-                for stem, score, grade in scored_docs
-            )
-            print(f"Voti Markdown: {voti}")
-    print()
-    print("Penalità: struttura assente −40, backtick residui −2/cad, "
-          "bare headers −3/cad, sezioni >1500ch >35% −5")
-    print()
+        print(
+            f"Documenti: {len(scores)}   "
+            f"Media: {media:.0f}/100 {_grade(int(media))}   "
+            f"(A≥90  B≥75  C≥60  D≥40  F<40)"
+        )
+    print(
+        "\nPenalità: struttura assente −40, backtick −2/cad, "
+        "bare headers −3/cad, sezioni >1500ch >35% −5\n"
+    )
 
 
+# ─── Entry point ─────────────────────────────────────────────────────────────
+
 if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-    stems = _parse_cli_args(sys.argv[1:])
-    validate(stems, project_root)
+    parser = argparse.ArgumentParser(
+        description="Valida i report Markdown prodotti da pipeline.py",
+        epilog="Senza argomenti valida tutti gli stem in conversione/*/",
+    )
+    parser.add_argument(
+        "stems",
+        nargs="*",
+        metavar="STEM",
+        help="stem da validare (es: analisi1). Ometti per tutti.",
+    )
+    args = parser.parse_args()
+    validate(args.stems, Path(__file__).parent.parent)

From 9910a70823f8cacca044f4459473ff46f04c5784 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 09:19:17 +0200
Subject: [PATCH 07/22] feat(conversione): aggiungi clear.sh per pulizia batch
 cartelle stem

Script bash con conferma interattiva e flag -f per eliminare in blocco
le cartelle stem gitignorate in conversione/*/
---
 conversione/clear.sh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 conversione/clear.sh

diff --git a/conversione/clear.sh b/conversione/clear.sh
new file mode 100755
index 0000000..3774610
--- /dev/null
+++ b/conversione/clear.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+mapfile -t dirs < <(find . -maxdepth 1 -mindepth 1 -type d | sort)
+
+if [[ ${#dirs[@]} -eq 0 ]]; then
+    echo "Nessuna cartella da cancellare."
+    exit 0
+fi
+
+echo "Cartelle che verranno cancellate:"
+for d in "${dirs[@]}"; do
+    echo "  $d"
+done
+
+if [[ "${1:-}" != "-f" ]]; then
+    read -r -p "Confermi? [s/N] " answer
+    [[ "$answer" =~ ^[sS]$ ]] || { echo "Annullato."; exit 0; }
+fi
+
+for d in "${dirs[@]}"; do
+    rm -rf "$d"
+    echo "Rimossa: $d"
+done
+
+echo "Pulizia completata."

From ea721774da2a033d7bb4ee35609db671095dd0de Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 09:19:44 +0200
Subject: [PATCH 08/22] feat(pipeline): 10 nuovi transform e metriche residui
 estese
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 0_br: rimozione tag <br> residui da tabelle PDF
- 0_tabsep: rimozione separatori | | e |---| (doppio pass pre/post merge)
- 0a2: correzione encoding " → × (moltiplicazione, solo digit-before)
- 0a3: correzione encoding ! → µ prima di unità SI
- 0a4: rimozione label formule inline [N.M]
- 9c: filtro garbage headers — simboli puri, abbreviazioni brevi, prefisso ...
- 9d: rimozione sezioni frontmatter (URL, email, copyright, affiliazione)
- build_report: tracking esteso br_inline, simboli_encoding, formule_inline
---
 conversione/pipeline.py | 135 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 123 insertions(+), 12 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 0a6014b..03cde62 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -349,6 +349,12 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         "toc_rimosso": False,
         "n_immagini_rimosse": 0,
         "n_accenti_corretti": 0,
+        "n_moltiplicazioni_corrette": 0,
+        "n_micro_corretti": 0,
+        "n_br_rimossi": 0,
+        "n_formule_rimossi": 0,
+        "n_garbage_headers_rimossi": 0,
+        "n_frontmatter_rimossi": 0,
         "n_dotleader_rimossi": 0,
         "n_header_concat_fixati": 0,
         "n_articoli_estratti": 0,
@@ -357,12 +363,26 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         "n_header_allcaps": 0,
         "n_sezioni_numerate": 0,
         "n_paragrafi_uniti": 0,
+        "n_tabsep_rimossi": 0,
     }
 
     # 0. Rimuovi riferimenti immagini (artefatti opendataloader-pdf)
     stats["n_immagini_rimosse"] = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
     text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
 
+    # 0_br. Rimuovi tag <br> residui da tabelle e blocchi formula PDF
+    #     Nelle celle di tabella produce spazio; nel testo inline elimina rumore.
+    stats["n_br_rimossi"] = len(re.findall(r"<br>", text, re.IGNORECASE))
+    text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
+
+    # 0_tabsep. Rimuovi separatori tabella PDF: "| |" (riga vuota) e "|---|" (separatore).
+    #     Nascono da tabelle non strutturate nel PDF. Rimossi PRIMA del merge paragrafi
+    #     (step 5) altrimenti "|---|" viene fuso con il paragrafo successivo producendo
+    #     righe tipo "|---| Una caratterizzazione analoga...".
+    _pat_tabsep = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
+    stats["n_tabsep_rimossi"] = len(_pat_tabsep.findall(text))
+    text = _pat_tabsep.sub("", text)
+
     # 0a. Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc.
     #     I PDF prodotti da LaTeX estraggono gli accenti gravi come backtick separati
     #     dalla vocale accentata. Esempi: "`e" → "è", "puo`" → "può", "sar`a" → "sarà"
@@ -383,6 +403,30 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         text = re.sub(r"`", "", text)
         stats["n_accenti_corretti"] += n_bt_orfani
 
+    # 0a2. Fix segno di moltiplicazione "→× (encoding font PDF non-standard)
+    #     Esempi: 2"107 → 2×107,  2"(10-2 m)3 → 2×(10-2 m)3
+    #     Lookbehind SOLO su cifra: evita falsi positivi tipo t1"t0 (→ limite)
+    #     o h"hf (→ differenza) dove la lettera prima della " non indica prodotto.
+    _n_cross = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
+    text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text)
+    stats["n_moltiplicazioni_corrette"] = _n_cross
+
+    # 0a3. Fix prefisso micro !→µ prima di unità SI note
+    #     "1 !m" → "1 µm",  "1 !A" → "1 µA",  "3 !s-1" → "3 µs-1"
+    #     Pattern stretto: cifra + spazio opzionale + ! + lettera unità SI a scelta ristretta.
+    #     Non tocca "4! steradianti" (spazio dopo !) né "mol!K" (non preceduto da cifra).
+    _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]'
+    _n_micro = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
+    text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text)
+    stats["n_micro_corretti"] = _n_micro
+
+    # 0a4. Rimuovi label formule inline [N.M] — es. [3.4], [10.7], [5.25]
+    #     Non aggiungono valore semantico per il RAG; restano come rumore numerico.
+    #     Preserva [N] senza punto (riferimenti bibliografici/note legittime).
+    n_form_before = len(re.findall(r"\[\d+\.\d+\]", text))
+    text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text)
+    stats["n_formule_rimossi"] = n_form_before
+
     # 0b_pre. Rimuovi righe con dot-leader (voci di indice/sommario)
     #     Esempi: "- 1.1 Alfabeto greco . . . . . . 1", "3.4 Continuità . . . . 205"
     #     Pattern: almeno 3 occorrenze di ". " consecutive nella riga
@@ -624,6 +668,9 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         i += 1
     text = "\n\n".join(merged)
 
+    # Secondo pass: rimuovi prefisso |---| eventualmente rimasto dopo il merge paragrafi
+    text = re.sub(r"(?m)^\|---\|\s*", "", text)
+
     # 6. Normalizza whitespace multiplo interno alle righe
     lines = text.split("\n")
     text = "\n".join(
@@ -655,6 +702,61 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     text, n_titoli = _merge_title_headers(text)
     stats["n_titoli_uniti"] = n_titoli
 
+    # 9c. Rimuovi garbage headers: header ### senza parole reali o con solo
+    #     abbreviazioni matematiche.  Esempi: "### ( vm)", "### #", "### ! =",
+    #     "### (am)", "### 2. Il valore di hf si deter- mina risolvendo mg(h!hf)"
+    #     Questi nascono da espressioni matematiche scambiate per titoli di sezione.
+    #     Il corpo rimane nel testo e viene accorpato alla sezione precedente.
+    def _is_garbage_header(content: str) -> bool:
+        # Header con prefisso "..." — frammento di formula (es. "...Di", "...vi")
+        if content.lstrip().startswith("..."):
+            return True
+        # Nessuna sequenza alfabetica ≥ 2 char
+        if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content):
+            return True
+        # Abbreviazione corta in parentesi opzionali: "(vm)", "( am)", "(am)"
+        if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
+            return True
+        # Header molto lungo (>60ch) con artefatti formula inline
+        if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
+            return True
+        return False
+
+    lines = text.split("\n")
+    new_lines = []
+    for line in lines:
+        m = re.match(r"^#{1,6} (.+)$", line)
+        if m and _is_garbage_header(m.group(1)):
+            stats["n_garbage_headers_rimossi"] += 1
+            continue
+        new_lines.append(line)
+    text = "\n".join(new_lines)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    # 9d. Rimuovi sezioni frontmatter: header senza numero + corpo corto con
+    #     URL, email, affiliazione, copyright, edizione — metadati non-contenuto.
+    _FM_RE = re.compile(
+        r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|"
+        r"\bCopyright\b|\bLicenza\b|\bEdizione\b|"
+        r"protetto da|tutti i diritti",
+        re.IGNORECASE,
+    )
+    blocks = re.split(r"\n{2,}", text)
+    cleaned = []
+    for i, block in enumerate(blocks):
+        stripped = block.strip()
+        if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
+            cleaned.append(block)
+            continue
+        body = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
+        is_fm_body  = len(body) < 250 and _FM_RE.search(body)
+        is_fm_hdr   = _FM_RE.search(stripped)
+        if is_fm_body or is_fm_hdr:
+            stats["n_frontmatter_rimossi"] += 1
+            continue
+        cleaned.append(block)
+    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+
     return text, stats
 
 
@@ -831,10 +933,13 @@ def build_report(
         return hits
 
     residui = {
-        "backtick":  _scan(r"`"),
-        "dotleader": _scan(r"(?:\. ){3,}"),
-        "url":       _scan(r"^(https?://|www\.)\S+"),
-        "immagini":  _scan(r"!\[[^\]]*\]\([^)]*\)"),
+        "backtick":        _scan(r"`"),
+        "dotleader":       _scan(r"(?:\. ){3,}"),
+        "url":             _scan(r"^(https?://|www\.)\S+"),
+        "immagini":        _scan(r"!\[[^\]]*\]\([^)]*\)"),
+        "br_inline":       _scan(r"<br>"),
+        "simboli_encoding":_scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
+        "formule_inline":  _scan(r"\[\d+\.\d+\]"),
     }
 
     # ── Composizione report ───────────────────────────────────────────────
@@ -856,14 +961,20 @@ def build_report(
             "long_sections_list":  long_secs,
         },
         "residui": {
-            "backtick":  len(residui["backtick"]),
-            "dotleader": len(residui["dotleader"]),
-            "url":       len(residui["url"]),
-            "immagini":  len(residui["immagini"]),
-            "backtick_esempi":  residui["backtick"],
-            "dotleader_esempi": residui["dotleader"],
-            "url_esempi":       residui["url"],
-            "immagini_esempi":  residui["immagini"],
+            "backtick":         len(residui["backtick"]),
+            "dotleader":        len(residui["dotleader"]),
+            "url":              len(residui["url"]),
+            "immagini":         len(residui["immagini"]),
+            "br_inline":        len(residui["br_inline"]),
+            "simboli_encoding": len(residui["simboli_encoding"]),
+            "formule_inline":   len(residui["formule_inline"]),
+            "backtick_esempi":         residui["backtick"],
+            "dotleader_esempi":        residui["dotleader"],
+            "url_esempi":              residui["url"],
+            "immagini_esempi":         residui["immagini"],
+            "br_inline_esempi":        residui["br_inline"],
+            "simboli_encoding_esempi": residui["simboli_encoding"],
+            "formule_inline_esempi":   residui["formule_inline"],
         },
     }
 

From 875a342efa4c6b20c83133e383211ff9f8ff2991 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 09:20:15 +0200
Subject: [PATCH 09/22] feat(validate): scoring orientato a
 chunking/vettorizzazione, flag --detail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _score() ritorna (int, list[str]) con dettaglio penalità applicate
- Rimossa la penalità sulle sezioni lunghe (>1500ch oltre il 35%/60% delle h3);
  sezioni corte, sezioni lunghe, mediana e p25 non incidono più sul voto —
  il chunker le normalizza già in fase di suddivisione
- Aggiunte penalità per residui che impattano i vettori: br_inline,
  simboli_encoding, formule_inline; cap della penalità backtick ridotto
  da −30 a −20
- Flag --detail / -d per mostrare breakdown penalità per documento
- Colonne tabella aggiornate: btk, br, enc, url, med
---
 conversione/validate.py | 115 +++++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 37 deletions(-)

diff --git a/conversione/validate.py b/conversione/validate.py
index 51702d1..b9d71be 100644
--- a/conversione/validate.py
+++ b/conversione/validate.py
@@ -15,6 +15,7 @@ Uso:
     python conversione/validate.py              # tutti gli stem
     python conversione/validate.py analisi1     # stem specifico
     python conversione/validate.py a b c        # stem multipli
+    python conversione/validate.py --detail analisi1  # mostra dettaglio penalità
 """
 
 import argparse
@@ -28,45 +29,72 @@ from pathlib import Path
 _GRADES = [(90, "A"), (75, "B"), (60, "C"), (40, "D"), (0, "F")]
 
 
-def _score(r: dict) -> int:
+def _score(r: dict) -> tuple[int, list[str]]:
     """
-    Calcola un punteggio 0-100 sulla qualità del Markdown prodotto.
+    Calcola un punteggio 0-100 sulla qualità del clean.md ai fini della
+    suddivisione in chunk e vettorizzazione.
+    Restituisce (score, lista_penalità_applicate).
 
-    Penalità:
-      struttura assente / piatta  → −40 / −15
-      backtick residui            → −2/cad (max −30)
-      URL / watermark             → −5/cad (max −15)
-      immagini residue            → −5/cad (max −10)
-      dot-leader residui          → −5/cad (max −10)
-      bare headers                → −3/cad (max −15)
-      sezioni >1500ch >35/60%     → −5 / −10
+    Penalità struttura (il chunker non può operare senza header):
+      struttura assente (livello 0)    → −40
+      struttura piatta (livello 1)     → −15
+
+    Penalità residui (finiscono nei vettori e degradano il retrieval):
+      backtick                         → −2/cad  (max −20)
+      dot-leader                       → −5/cad  (max −10)
+      URL / watermark                  → −5/cad  (max −15)
+      immagini residue                 → −5/cad  (max −10)
+      <br> inline (artefatti tabelle)  → −2/cad  (max −15)
+      simboli encoding (!/" residui)   → −1/cad  (max −10)
+      formule inline [N.M]             → −1/cad  (max −8)
+
+    Penalità anomalie:
+      bare headers                     → −3/cad  (max −15)
+
+    Non penalizzate (il chunker le normalizza):
+      sezioni corte, sezioni lunghe, mediana, p25
     """
-    score    = 100
+    score  = 100
+    detail = []
     structure = r.get("structure", {})
     anomalie  = r.get("anomalie",  {})
     residui   = r.get("residui",   {})
 
     livello = structure.get("livello_struttura", 0)
-    n_h3    = max(structure.get("n_h3", 0), 1)
 
+    # ── Struttura ─────────────────────────────────────────────────────────
     if livello == 0:
         score -= 40
+        detail.append("struttura assente −40")
     elif livello == 1:
         score -= 15
+        detail.append("struttura piatta −15")
 
-    score -= min(30, residui.get("backtick",  0) * 2)
-    score -= min(15, residui.get("url",       0) * 5)
-    score -= min(10, residui.get("immagini",  0) * 5)
-    score -= min(10, residui.get("dotleader", 0) * 5)
-    score -= min(15, anomalie.get("bare_headers", 0) * 3)
+    # ── Residui ───────────────────────────────────────────────────────────
+    def _pen(key: str, per_item: int, cap: int, label: str) -> None:
+        n = residui.get(key, 0)
+        if n:
+            p = min(cap, n * per_item)
+            nonlocal score
+            score -= p
+            detail.append(f"{label} ×{n} −{p}")
 
-    long_ratio = anomalie.get("long_sections", 0) / n_h3
-    if long_ratio > 0.60:
-        score -= 10
-    elif long_ratio > 0.35:
-        score -= 5
+    _pen("backtick",         2, 20, "backtick")
+    _pen("dotleader",        5, 10, "dot-leader")
+    _pen("url",              5, 15, "url")
+    _pen("immagini",         5, 10, "immagini")
+    _pen("br_inline",        2, 15, "<br> inline")
+    _pen("simboli_encoding", 1, 10, "simboli encoding")
+    _pen("formule_inline",   1,  8, "formule inline")
 
-    return max(0, score)
+    # ── Anomalie ──────────────────────────────────────────────────────────
+    n_bare = anomalie.get("bare_headers", 0)
+    if n_bare:
+        p = min(15, n_bare * 3)
+        score -= p
+        detail.append(f"bare headers ×{n_bare} −{p}")
+
+    return max(0, score), detail
 
 
 def _grade(score: int) -> str:
@@ -75,7 +103,7 @@ def _grade(score: int) -> str:
 
 # ─── Validazione ─────────────────────────────────────────────────────────────
 
-def validate(stems: list[str], project_root: Path) -> None:
+def validate(stems: list[str], project_root: Path, detail: bool = False) -> None:
     conv_dir = project_root / "conversione"
 
     paths = (
@@ -99,9 +127,10 @@ def validate(stems: list[str], project_root: Path) -> None:
     header = (
         f"{'stem':<{col}}"
         f"{'h2':>4}{'h3':>5}  "
-        f"{'strategia':<20}"
+        f"{'strategia':<18}"
         f"{'bare':>5}{'corte':>6}{'lunghe':>7}"
-        f"{'backtick':>9}{'dotlead':>8}{'url':>4}"
+        f"{'btk':>5}{'br':>4}{'enc':>4}{'url':>4}"
+        f"{'med':>6}"
         f"  {'voto':>4}  grade"
     )
     sep = "─" * len(header)
@@ -115,26 +144,33 @@ def validate(stems: list[str], project_root: Path) -> None:
             print(f"{r['stem']:<{col}}  (report.json non trovato)")
             continue
 
-        st  = r.get("structure", {})
-        an  = r.get("anomalie",  {})
-        res = r.get("residui",   {})
-        s   = _score(r)
+        st   = r.get("structure",    {})
+        an   = r.get("anomalie",     {})
+        res  = r.get("residui",      {})
+        dist = r.get("distribution", {})
+        s, pen = _score(r)
         scores.append(s)
 
         print(
             f"{r['stem']:<{col}}"
             f"{st.get('n_h2',              0):>4}"
             f"{st.get('n_h3',              0):>5}  "
-            f"{st.get('strategia_chunking','?'):<20}"
+            f"{st.get('strategia_chunking','?'):<18}"
             f"{an.get('bare_headers',      0):>5}"
             f"{an.get('short_sections',    0):>6}"
             f"{an.get('long_sections',     0):>7}"
-            f"{res.get('backtick',         0):>9}"
-            f"{res.get('dotleader',        0):>8}"
-            f"{res.get('url',             0):>4}"
+            f"{res.get('backtick',         0):>5}"
+            f"{res.get('br_inline',        0):>4}"
+            f"{res.get('simboli_encoding', 0):>4}"
+            f"{res.get('url',              0):>4}"
+            f"{dist.get('mediana',         0):>6}"
             f"  {s:>4}  {_grade(s)}"
         )
 
+        if detail and pen:
+            for p in pen:
+                print(f"  {'':>{col}}  ↳ {p}")
+
     # ── Riepilogo ─────────────────────────────────────────────────────────
     print(sep)
     if scores:
@@ -145,8 +181,8 @@ def validate(stems: list[str], project_root: Path) -> None:
             f"(A≥90  B≥75  C≥60  D≥40  F<40)"
         )
     print(
-        "\nPenalità: struttura assente −40, backtick −2/cad, "
-        "bare headers −3/cad, sezioni >1500ch >35% −5\n"
+        "\nColonne: bare=header vuoti  corte=sez<150ch  lunghe=sez>1500ch  "
+        "btk=backtick  br=<br>inline  enc=simboli encoding  med=mediana chars\n"
     )
 
 
@@ -163,5 +199,10 @@ if __name__ == "__main__":
         metavar="STEM",
         help="stem da validare (es: analisi1). Ometti per tutti.",
     )
+    parser.add_argument(
+        "--detail", "-d",
+        action="store_true",
+        help="mostra dettaglio penalità per ogni documento",
+    )
     args = parser.parse_args()
-    validate(args.stems, Path(__file__).parent.parent)
+    validate(args.stems, Path(__file__).parent.parent, detail=args.detail)

From 757df26bc2347109fbb7851648acb847fe738b06 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 09:46:50 +0200
Subject: [PATCH 10/22] refactor(pipeline): modularizza apply_transforms in 26
 funzioni _t_xxx

Estrae ogni trasformazione strutturale in una funzione dedicata
_t_xxx(text) -> tuple[str, int], sostituendo la mega-function da
418 righe con un loop su lista di coppie (stat_key, fn). Aggiunge
_parse_sections_with_body() condivisa tra analyze() e build_report().
Output identico verificato su tutti e 5 gli stem esistenti.
---
 conversione/pipeline.py | 424 +++++++++++++++++++++-------------------
 1 file changed, 223 insertions(+), 201 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 03cde62..783f3d7 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -31,6 +31,7 @@ import subprocess
 import sys
 import tempfile
 from datetime import datetime
+from functools import partial
 from pathlib import Path
 
 
@@ -340,52 +341,29 @@ def _extract_article_headers(text: str) -> tuple[str, int]:
     return text, count
 
 
-def apply_transforms(text: str) -> tuple[str, dict]:
-    """
-    Applica le trasformazioni strutturali al Markdown grezzo.
-    Restituisce (testo_modificato, statistiche).
-    """
-    stats = {
-        "toc_rimosso": False,
-        "n_immagini_rimosse": 0,
-        "n_accenti_corretti": 0,
-        "n_moltiplicazioni_corrette": 0,
-        "n_micro_corretti": 0,
-        "n_br_rimossi": 0,
-        "n_formule_rimossi": 0,
-        "n_garbage_headers_rimossi": 0,
-        "n_frontmatter_rimossi": 0,
-        "n_dotleader_rimossi": 0,
-        "n_header_concat_fixati": 0,
-        "n_articoli_estratti": 0,
-        "n_ambienti_matematici": 0,
-        "n_titoli_uniti": 0,
-        "n_header_allcaps": 0,
-        "n_sezioni_numerate": 0,
-        "n_paragrafi_uniti": 0,
-        "n_tabsep_rimossi": 0,
-    }
+# ─── [3a] Funzioni di trasformazione ─────────────────────────────────────────
 
-    # 0. Rimuovi riferimenti immagini (artefatti opendataloader-pdf)
-    stats["n_immagini_rimosse"] = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
+def _t_remove_images(text: str) -> tuple[str, int]:
+    n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
     text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
+    return text, n
 
-    # 0_br. Rimuovi tag <br> residui da tabelle e blocchi formula PDF
-    #     Nelle celle di tabella produce spazio; nel testo inline elimina rumore.
-    stats["n_br_rimossi"] = len(re.findall(r"<br>", text, re.IGNORECASE))
+
+def _t_fix_br(text: str) -> tuple[str, int]:
+    n = len(re.findall(r"<br>", text, re.IGNORECASE))
     text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
+    return text, n
 
-    # 0_tabsep. Rimuovi separatori tabella PDF: "| |" (riga vuota) e "|---|" (separatore).
-    #     Nascono da tabelle non strutturate nel PDF. Rimossi PRIMA del merge paragrafi
-    #     (step 5) altrimenti "|---|" viene fuso con il paragrafo successivo producendo
-    #     righe tipo "|---| Una caratterizzazione analoga...".
-    _pat_tabsep = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
-    stats["n_tabsep_rimossi"] = len(_pat_tabsep.findall(text))
-    text = _pat_tabsep.sub("", text)
 
-    # 0a. Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc.
-    #     I PDF prodotti da LaTeX estraggono gli accenti gravi come backtick separati
-    #     dalla vocale accentata. Esempi: "`e" → "è", "puo`" → "può", "sar`a" → "sarà"
+def _t_fix_tabsep(text: str) -> tuple[str, int]:
+    _pat = re.compile(r"(?m)^\|\s*\|\s*$|^\|---\|?\s*$")
+    n = len(_pat.findall(text))
+    text = _pat.sub("", text)
+    return text, n
+
+
+def _t_fix_accents(text: str) -> tuple[str, int]:
+    """Fix artefatti backtick da PDF LaTeX: `e→è, e`→è, sar`a→sarà, ecc."""
     _ACCENT_MAP = {
         "e": "è", "E": "È", "a": "à", "A": "À",
         "u": "ù", "U": "Ù", "i": "ì", "I": "Ì", "o": "ò", "O": "Ò",
@@ -393,73 +371,61 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     n_bt_before = text.count("`")
     text = re.sub(r"`([eEaAuUiIoO])", lambda m: _ACCENT_MAP[m.group(1)], text)
     text = re.sub(r"([eEaAuUiIoO])`", lambda m: _ACCENT_MAP[m.group(1)], text)
-    stats["n_accenti_corretti"] = n_bt_before - text.count("`")
-
+    n_accenti = n_bt_before - text.count("`")
     # Backtick orfani: artefatti LaTeX rimasti dopo la correzione vocale
-    # (es. "propriet`" da "proprietà", "continuit`" da "continuità").
-    # In testi PDF non esistono backtick legittimi → rimozione sicura.
     n_bt_orfani = text.count("`")
     if n_bt_orfani:
         text = re.sub(r"`", "", text)
-        stats["n_accenti_corretti"] += n_bt_orfani
+        n_accenti += n_bt_orfani
+    return text, n_accenti
 
-    # 0a2. Fix segno di moltiplicazione "→× (encoding font PDF non-standard)
-    #     Esempi: 2"107 → 2×107,  2"(10-2 m)3 → 2×(10-2 m)3
-    #     Lookbehind SOLO su cifra: evita falsi positivi tipo t1"t0 (→ limite)
-    #     o h"hf (→ differenza) dove la lettera prima della " non indica prodotto.
-    _n_cross = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
+
+def _t_fix_multiplication(text: str) -> tuple[str, int]:
+    """Fix segno di moltiplicazione "→× (encoding font PDF non-standard)."""
+    n = len(re.findall(r'(?<=[0-9])"(?=[0-9(])', text))
     text = re.sub(r'(?<=[0-9])"(?=[0-9(])', '×', text)
-    stats["n_moltiplicazioni_corrette"] = _n_cross
+    return text, n
 
-    # 0a3. Fix prefisso micro !→µ prima di unità SI note
-    #     "1 !m" → "1 µm",  "1 !A" → "1 µA",  "3 !s-1" → "3 µs-1"
-    #     Pattern stretto: cifra + spazio opzionale + ! + lettera unità SI a scelta ristretta.
-    #     Non tocca "4! steradianti" (spazio dopo !) né "mol!K" (non preceduto da cifra).
+
+def _t_fix_micro(text: str) -> tuple[str, int]:
+    """Fix prefisso micro !→µ prima di unità SI note."""
     _SI_UNITS_RE = r'[mAsgVWFHTKNJClΩ]'
-    _n_micro = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
+    n = len(re.findall(rf'\d\s*!(?={_SI_UNITS_RE})', text))
     text = re.sub(rf'(\d)\s*!({_SI_UNITS_RE})', r'\1 µ\2', text)
-    stats["n_micro_corretti"] = _n_micro
+    return text, n
 
-    # 0a4. Rimuovi label formule inline [N.M] — es. [3.4], [10.7], [5.25]
-    #     Non aggiungono valore semantico per il RAG; restano come rumore numerico.
-    #     Preserva [N] senza punto (riferimenti bibliografici/note legittime).
-    n_form_before = len(re.findall(r"\[\d+\.\d+\]", text))
+
+def _t_remove_formula_labels(text: str) -> tuple[str, int]:
+    """Rimuovi label formule inline [N.M] — es. [3.4], [10.7]."""
+    n = len(re.findall(r"\[\d+\.\d+\]", text))
     text = re.sub(r"\s*\[\d+\.\d+\]\s*", " ", text)
-    stats["n_formule_rimossi"] = n_form_before
+    return text, n
 
-    # 0b_pre. Rimuovi righe con dot-leader (voci di indice/sommario)
-    #     Esempi: "- 1.1 Alfabeto greco . . . . . . 1", "3.4 Continuità . . . . 205"
-    #     Pattern: almeno 3 occorrenze di ". " consecutive nella riga
-    # Cattura sia ". . . ." (spazi) sia "......." (punti continui, tipici dei TOC PDF)
+
+def _t_remove_dotleaders(text: str) -> tuple[str, int]:
+    """Rimuovi righe con dot-leader e numerali romani isolati (footer TOC)."""
     _DOTLEADER_RE = r"^[^\n]*(?:(?:\. ){3,}|\.{4,})[^\n]*$"
-    stats["n_dotleader_rimossi"] = len(
-        re.findall(_DOTLEADER_RE, text, re.MULTILINE)
-    )
+    n = len(re.findall(_DOTLEADER_RE, text, re.MULTILINE))
     text = re.sub(_DOTLEADER_RE, "", text, flags=re.MULTILINE)
-
-    # 0b_pre2. Rimuovi righe che sono solo numerali romani (indicatori di pagina TOC)
-    #     Esempi: "i", "ii", "iii", "iv", "v" su riga isolata (footer pagine indice LaTeX)
-    #     Questi impedirebbero al transform 9 di rimuovere le entry TOC rimaste senza corpo.
     text = re.sub(
         r"(?m)^(i{1,3}|iv|vi{0,3}|ix|xi{0,2}|x)$",
         "",
         text,
         flags=re.IGNORECASE,
     )
+    return text, n
 
-    # Flag documento: rilevamento sezioni esercizi (es. libri di testo accademici)
-    # Usato per disabilitare transform 4b che convertirebbe i numeri degli esercizi in header.
-    _has_exercise_sections = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
 
-    # 0b. Fix header + body concatenati senza separatore
-    #     "##### 11 TitoloCorpodel testo..." → "##### 11 Titolo\n\nCorpo del testo..."
-    def _fix_header_concat(m: re.Match) -> str:
+def _t_fix_header_concat(text: str) -> tuple[str, int]:
+    """Fix header + body concatenati senza separatore."""
+    count = 0
+
+    def _fix(m: re.Match) -> str:
+        nonlocal count
         hashes = m.group(1)
         full = m.group(2).strip()
         if len(full) < 60:
             return m.group(0)
-        # Cerca split: lettera minuscola (incluse accentate) seguita da maiuscola
-        # Salta i primi ~10 char per non spezzare il numero della sezione
         skip = min(10, len(full) // 3)
         split = re.search(r"(?<=[a-zàèéìíòóùúä])(?=[A-ZÀÈÉÌÍÒÓÙÚ])", full[skip:])
         if split:
@@ -467,16 +433,17 @@ def apply_transforms(text: str) -> tuple[str, dict]:
             title = full[:pos].strip()
             body = full[pos:].strip()
             if len(title) >= 5 and len(body) >= 15:
-                stats["n_header_concat_fixati"] += 1
+                count += 1
                 return f"{hashes} {title}\n\n{body}"
         return m.group(0)
 
-    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix_header_concat, text, flags=re.MULTILINE)
+    text = re.sub(r"^(#{2,6})\s+(.{40,})$", _fix, text, flags=re.MULTILINE)
+    return text, count
 
-    # 0c. Estrai "Capitolo N: TITOLO" inline nel corpo del testo → ## header separato
-    #     "Capitolo 3: IL TITOLO DEL CAPITOLO - 16 Primo..."  → "## Capitolo 3: ..."
-    #     "Capitolo 1 : TITOLO CAPITOLO"                      → "## Capitolo 1: ..."
-    def _extract_capitolo(m: re.Match) -> str:
+
+def _t_extract_capitolo(text: str) -> tuple[str, int]:
+    """Estrai 'Capitolo N: TITOLO' inline nel corpo del testo → ## header."""
+    def _repl(m: re.Match) -> str:
         num = m.group(1)
         titolo = _sentence_case(m.group(2).strip().rstrip("- ").strip())
         return f"\n\n## Capitolo {num}: {titolo}\n\n"
@@ -484,126 +451,124 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     text = re.sub(
         r"\bCapitolo\s+(\d+)\s*[:\s]\s*([A-ZÀÈÉÌÍÒÓÙÚ\'L][A-ZÀÈÉÌÍÒÓÙÚ\s\'\.,\(\)]{5,80}?)"
         r"(?=\s*[-–]\s*\d|\s*\n|\s*$)",
-        _extract_capitolo,
+        _repl,
         text,
     )
+    return text, 0
 
-    # 0d. Normalizza header di sezione a livello uniforme ###
-    #     "#### N Titolo"  → "### N. Titolo"  (numerati: aggiunge punto)
-    #     "#### B) Titolo" → "### B) Titolo"  (lettera: solo cambio livello)
-    #     "#### "          → rimosso           (vuoti)
-    text = re.sub(
-        r"^#{3,6}\s*$",
-        "",
-        text,
-        flags=re.MULTILINE,
-    )
+
+def _t_normalize_header_levels(text: str) -> tuple[str, int]:
+    """Normalizza h4+ → h3; rimuove header vuoti."""
+    text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
     text = re.sub(
         r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
         lambda m: f"### {m.group(2)}. {m.group(3)}",
         text,
         flags=re.MULTILINE,
     )
-    text = re.sub(
-        r"^#{4,6}\s+(.+)$",
-        r"### \1",
-        text,
-        flags=re.MULTILINE,
-    )
+    text = re.sub(r"^#{4,6}\s+(.+)$", r"### \1", text, flags=re.MULTILINE)
+    return text, 0
 
-    # 0e. Converti voci articolo "- Art. N. Titolo. Corpo" → "### Art. N. Titolo.\n\nCorpo"
-    #     Eseguito dopo la promozione h4+ → h3 (0d) per non duplicare Art. già header.
-    #     Eseguito prima del merge paragrafi (5): il boundary ### previene la fusione.
-    text, n_art = _extract_article_headers(text)
-    stats["n_articoli_estratti"] = n_art
 
-    # 1. Rimuovi **bold** negli header esistenti: ## **Titolo** → ## Titolo
+def _t_extract_articles(text: str) -> tuple[str, int]:
+    """Converti voci articolo '- Art. N.' → '### Art. N.'"""
+    return _extract_article_headers(text)
+
+
+def _t_remove_header_bold(text: str) -> tuple[str, int]:
+    """Rimuovi **bold** negli header esistenti."""
     text = re.sub(
         r"^(#{1,6})\s+\*\*(.+?)\*\*\s*$",
         r"\1 \2",
         text, flags=re.MULTILINE,
     )
+    return text, 0
 
-    # 1b. Normalizza header ALL-CAPS → sentence-case
-    def _norm_allcaps_header(m: re.Match) -> str:
+
+def _t_normalize_allcaps_headers(text: str) -> tuple[str, int]:
+    """Normalizza header ALL-CAPS → sentence-case."""
+    def _norm(m: re.Match) -> str:
         hashes, content = m.group(1), m.group(2).strip()
         letters = [c for c in content if c.isalpha()]
         if letters and all(c.isupper() for c in letters):
             return f"{hashes} {_sentence_case(content)}"
         return m.group(0)
 
-    text = re.sub(r"^(#{1,6}) (.+)$", _norm_allcaps_header, text, flags=re.MULTILINE)
+    text = re.sub(r"^(#{1,6}) (.+)$", _norm, text, flags=re.MULTILINE)
+    return text, 0
 
-    # 2. Rimuovi righe TOC: header "# Indice", "# Contents", ecc.
-    #     + le voci lista numeriche che seguono (TOC senza dot-leader, es. Nietzsche):
-    #       "- 1. Dei pregiudizi dei filosofi" → rimossa se viene subito dopo un header TOC.
-    #     Le voci con dot-leader sono già rimosse da 0b_pre.
-    #     Gli header rimasti senza corpo vengono poi eliminati dal transform 9.
+
+def _t_remove_toc(text: str) -> tuple[str, int]:
+    """Rimuovi header TOC e voci lista numerate che seguono."""
     lines = text.split("\n")
     new_lines = []
     _in_toc = False
+    removed = False
     for line in lines:
-        bare       = re.sub(r"^#+\s*", "", line.strip())
+        bare = re.sub(r"^#+\s*", "", line.strip())
         first_word = bare.split(".")[0].strip().lower()
         if first_word in _TOC_KEYWORDS:
-            stats["toc_rimosso"] = True
+            removed = True
             _in_toc = True
             continue
         if _in_toc:
-            # Salta righe vuote e voci lista numeriche (- N. Titolo / - N Titolo)
             if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                 continue
             _in_toc = False
         new_lines.append(line)
-    text = "\n".join(new_lines)
+    return "\n".join(new_lines), 1 if removed else 0
 
-    # 3. Converti righe ALL-CAPS standalone → ## header
+
+def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
+    """Converti righe ALL-CAPS standalone → ## header."""
+    count = 0
     blocks = text.split("\n\n")
     new_blocks = []
     for block in blocks:
         stripped = block.strip()
         if "\n" not in stripped and _is_allcaps_line(stripped):
             new_blocks.append(_allcaps_to_header(stripped))
-            stats["n_header_allcaps"] += 1
+            count += 1
         else:
             sub_lines = block.split("\n")
             converted = []
             for ln in sub_lines:
                 if _is_allcaps_line(ln) and len(ln.strip()) > 3:
                     converted.append(_allcaps_to_header(ln))
-                    stats["n_header_allcaps"] += 1
+                    count += 1
                 else:
                     converted.append(ln)
             new_blocks.append("\n".join(converted))
-    text = "\n\n".join(new_blocks)
+    return "\n\n".join(new_blocks), count
+
+
+def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
+    """Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
+    count = 0
 
-    # 4. Converti sezioni numerate "N. testo" → "### N.\n\ntesto"
-    #     Guarda che il testo non sia una frase completa (es. esercizi numerati):
-    #     se termina con "." ed è più lungo di 40 caratteri, è probabilmente una frase,
-    #     non un titolo di sezione → lascia invariato.
     def _num_repl(m: re.Match) -> str:
+        nonlocal count
         content = m.group(2).strip()
         if content.endswith(".") and len(content) > 40:
             return m.group(0)
-        stats["n_sezioni_numerate"] += 1
+        count += 1
         return f"### {m.group(1)}.\n\n{content}"
 
     text = re.sub(r"^(\d+)\.\s+(.+)$", _num_repl, text, flags=re.MULTILINE)
 
     def _num_letter_repl(m: re.Match) -> str:
-        stats["n_sezioni_numerate"] += 1
+        nonlocal count
+        count += 1
         return f"### {m.group(1)}{m.group(2)}.\n\n{m.group(3).strip()}"
 
     text = re.sub(r"^(\d+)\s*([a-z])\.\s+(.+)$", _num_letter_repl, text, flags=re.MULTILINE)
 
-    # 4b. Converti "- N. testo" sezioni con punto → "### N.\n\ntesto"
-    #     "- 1. Testo del primo punto..."  → "### 1.\n\nTesto del primo punto..."
-    #     Deve precedere 4c: "- N." ha il punto, "- N testo" no.
-    #     Disabilitato se il documento contiene sezioni "Esercizi": in quel caso i
-    #     "- N. testo" sono numerazioni di esercizi, non header di sezione.
-    if not _has_exercise_sections:
+    # Disabilitato se il documento contiene sezioni "Esercizi": in quel caso i
+    # "- N. testo" sono numerazioni di esercizi, non header di sezione.
+    if not has_exercises:
         def _aphorism_repl(m: re.Match) -> str:
-            stats["n_sezioni_numerate"] += 1
+            nonlocal count
+            count += 1
             return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}"
 
         text = re.sub(
@@ -613,22 +578,17 @@ def apply_transforms(text: str) -> tuple[str, dict]:
             flags=re.MULTILINE,
         )
 
-    # 4c. Converti "- N testo" list item numerati → "### N.\n\ntesto"
-    #     "- 12 Titolo sezione Corpo della sezione..." → "### 12. Titolo sezione\n\nCorpo..."
-    #     Non tocca "- a) testo", "- 1) testo" (già gestiti come liste)
     def _list_section_repl(m: re.Match) -> str:
+        nonlocal count
         num = m.group(1)
         content = m.group(2).strip()
-        stats["n_sezioni_numerate"] += 1
-        # Separa titolo da corpo: il titolo finisce dove una lettera minuscola
-        # è seguita da spazio e maiuscola (confine fine-titolo / inizio-corpo)
+        count += 1
         split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content)
         if split and split.start() >= 3:
             title = content[: split.start()].strip()
-            body = content[split.end() :].strip()
+            body = content[split.end():].strip()
             if len(body) >= 20:
                 return f"\n\n### {num}. {title}\n\n{body}"
-        # Nessun body inline: il content è solo il titolo
         return f"\n\n### {num}. {content}"
 
     text = re.sub(
@@ -637,16 +597,20 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         text,
         flags=re.MULTILINE,
     )
+    return text, count
 
-    # 4d. Converti ambienti matematici (Teorema/Definizione/...) → ### header
-    #     Eseguito prima del merge paragrafi (5) per sfruttare i blocchi intatti.
-    text, n_math = _extract_math_environments(text)
-    stats["n_ambienti_matematici"] = n_math
 
-    # 5. Unisci paragrafi spezzati da salti pagina PDF
+def _t_extract_math(text: str) -> tuple[str, int]:
+    """Converti ambienti matematici (Teorema/Definizione/...) → ### header."""
+    return _extract_math_environments(text)
+
+
+def _t_merge_paragraphs(text: str) -> tuple[str, int]:
+    """Unisci paragrafi spezzati da salti pagina PDF."""
     _SENTENCE_END = set(".?!»)\"'")
     blocks = text.split("\n\n")
     merged = []
+    count = 0
     i = 0
     while i < len(blocks):
         b = blocks[i]
@@ -662,30 +626,38 @@ def apply_transforms(text: str) -> tuple[str, dict]:
                 break
             b = stripped + " " + nxt
             stripped = b.strip()
-            stats["n_paragrafi_uniti"] += 1
+            count += 1
             i += 1
         merged.append(b)
         i += 1
     text = "\n\n".join(merged)
-
-    # Secondo pass: rimuovi prefisso |---| eventualmente rimasto dopo il merge paragrafi
+    # Secondo pass: rimuovi prefisso |---| eventualmente rimasto dopo il merge
     text = re.sub(r"(?m)^\|---\|\s*", "", text)
+    return text, count
 
-    # 6. Normalizza whitespace multiplo interno alle righe
+
+def _t_normalize_whitespace(text: str) -> tuple[str, int]:
+    """Normalizza whitespace multiplo interno alle righe."""
     lines = text.split("\n")
     text = "\n".join(
         re.sub(r"  +", " ", line) if line.strip() else line
         for line in lines
     )
+    return text, 0
 
-    # 7. Riduci righe vuote multiple a doppie
-    text = re.sub(r"\n{3,}", "\n\n", text)
 
-    # 8. Rimuovi righe che sono solo URL (watermark, footer di piattaforme)
-    text = re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text)
+def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
+    """Riduci righe vuote multiple a doppie."""
+    return re.sub(r"\n{3,}", "\n\n", text), 0
 
-    # 9. Rimuovi header senza corpo: header seguito solo da righe vuote e poi
-    #    da un altro header o dalla fine del testo (sezioni vuote / watermark)
+
+def _t_remove_urls(text: str) -> tuple[str, int]:
+    """Rimuovi righe che sono solo URL (watermark, footer di piattaforme)."""
+    return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0
+
+
+def _t_remove_empty_headers(text: str) -> tuple[str, int]:
+    """Rimuovi header senza corpo (sezioni vuote / watermark)."""
     blocks = re.split(r"\n{2,}", text)
     cleaned = []
     for i, block in enumerate(blocks):
@@ -693,48 +665,45 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped:
             next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
             if not next_stripped or re.match(r"^#{1,6} ", next_stripped):
-                continue  # header senza corpo → scarta
+                continue
         cleaned.append(block)
-    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0
 
-    # 9b. Fondi header numerici isolati con il sottotitolo breve successivo
-    #     "### N.\n\nSottotitolo" → "### N. Sottotitolo"  (es. parti Nietzsche)
-    text, n_titoli = _merge_title_headers(text)
-    stats["n_titoli_uniti"] = n_titoli
 
-    # 9c. Rimuovi garbage headers: header ### senza parole reali o con solo
-    #     abbreviazioni matematiche.  Esempi: "### ( vm)", "### #", "### ! =",
-    #     "### (am)", "### 2. Il valore di hf si deter- mina risolvendo mg(h!hf)"
-    #     Questi nascono da espressioni matematiche scambiate per titoli di sezione.
-    #     Il corpo rimane nel testo e viene accorpato alla sezione precedente.
+def _t_merge_title_headers(text: str) -> tuple[str, int]:
+    """Fondi header numerici isolati con il sottotitolo breve successivo."""
+    return _merge_title_headers(text)
+
+
+def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
+    """Rimuovi garbage headers: simboli, abbreviazioni matematiche, frammenti formula."""
     def _is_garbage_header(content: str) -> bool:
-        # Header con prefisso "..." — frammento di formula (es. "...Di", "...vi")
         if content.lstrip().startswith("..."):
             return True
-        # Nessuna sequenza alfabetica ≥ 2 char
         if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content):
             return True
-        # Abbreviazione corta in parentesi opzionali: "(vm)", "( am)", "(am)"
         if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
             return True
-        # Header molto lungo (>60ch) con artefatti formula inline
         if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
             return True
         return False
 
+    count = 0
     lines = text.split("\n")
     new_lines = []
     for line in lines:
         m = re.match(r"^#{1,6} (.+)$", line)
         if m and _is_garbage_header(m.group(1)):
-            stats["n_garbage_headers_rimossi"] += 1
+            count += 1
             continue
         new_lines.append(line)
     text = "\n".join(new_lines)
     text = re.sub(r"\n{3,}", "\n\n", text)
+    return text, count
 
-    # 9d. Rimuovi sezioni frontmatter: header senza numero + corpo corto con
-    #     URL, email, affiliazione, copyright, edizione — metadati non-contenuto.
+
+def _t_remove_frontmatter(text: str) -> tuple[str, int]:
+    """Rimuovi sezioni frontmatter: URL, email, affiliazione, copyright."""
     _FM_RE = re.compile(
         r"https?://|www\.|@[A-Za-z]|\bUniversit[àa]\b|\bDipartimento\b|"
         r"\bCopyright\b|\bLicenza\b|\bEdizione\b|"
@@ -743,20 +712,69 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     )
     blocks = re.split(r"\n{2,}", text)
     cleaned = []
+    count = 0
     for i, block in enumerate(blocks):
         stripped = block.strip()
         if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
             cleaned.append(block)
             continue
         body = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
-        is_fm_body  = len(body) < 250 and _FM_RE.search(body)
-        is_fm_hdr   = _FM_RE.search(stripped)
+        is_fm_body = len(body) < 250 and _FM_RE.search(body)
+        is_fm_hdr = _FM_RE.search(stripped)
         if is_fm_body or is_fm_hdr:
-            stats["n_frontmatter_rimossi"] += 1
+            count += 1
             continue
         cleaned.append(block)
-    text = re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned))
+    return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count
 
+
+# ─── [3b] Pipeline delle trasformazioni ──────────────────────────────────────
+
+def apply_transforms(text: str) -> tuple[str, dict]:
+    """
+    Applica le trasformazioni strutturali al Markdown grezzo.
+    Restituisce (testo_modificato, statistiche).
+    """
+    # Flag calcolato prima del loop: disabilita il transform 4b nei documenti
+    # con sezioni "Esercizi" (i "- N. testo" sarebbero numerazioni, non header).
+    _has_ex = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
+
+    _transforms: list[tuple[str | None, object]] = [
+        ("n_immagini_rimosse",          _t_remove_images),
+        ("n_br_rimossi",                _t_fix_br),
+        ("n_tabsep_rimossi",            _t_fix_tabsep),
+        ("n_accenti_corretti",          _t_fix_accents),
+        ("n_moltiplicazioni_corrette",  _t_fix_multiplication),
+        ("n_micro_corretti",            _t_fix_micro),
+        ("n_formule_rimossi",           _t_remove_formula_labels),
+        ("n_dotleader_rimossi",         _t_remove_dotleaders),
+        ("n_header_concat_fixati",      _t_fix_header_concat),
+        (None,                          _t_extract_capitolo),
+        (None,                          _t_normalize_header_levels),
+        ("n_articoli_estratti",         _t_extract_articles),
+        (None,                          _t_remove_header_bold),
+        (None,                          _t_normalize_allcaps_headers),
+        ("toc_rimosso",                 _t_remove_toc),
+        ("n_header_allcaps",            _t_allcaps_to_headers),
+        ("n_sezioni_numerate",          partial(_t_numbered_sections, has_exercises=_has_ex)),
+        ("n_ambienti_matematici",       _t_extract_math),
+        ("n_paragrafi_uniti",           _t_merge_paragraphs),
+        (None,                          _t_normalize_whitespace),
+        (None,                          _t_collapse_blank_lines),
+        (None,                          _t_remove_urls),
+        (None,                          _t_remove_empty_headers),
+        ("n_titoli_uniti",              _t_merge_title_headers),
+        ("n_garbage_headers_rimossi",   _t_remove_garbage_headers),
+        ("n_frontmatter_rimossi",       _t_remove_frontmatter),
+    ]
+
+    stats: dict = {}
+    for stat_key, fn in _transforms:
+        text, n = fn(text)
+        if stat_key:
+            stats[stat_key] = stats.get(stat_key, 0) + n
+
+    stats["toc_rimosso"] = bool(stats.get("toc_rimosso", 0))
     return text, stats
 
 
@@ -802,6 +820,26 @@ def _split_sections(text: str, level: int) -> list[str]:
     return [p for p in parts[1:] if p.strip()]
 
 
+def _parse_sections_with_body(text: str, level: int = 3) -> list[tuple[str, str]]:
+    """Split text into (header_line, body) pairs for headers of one level.
+
+    A header is any line starting with '#' * level followed by a space.
+    Lines before the first such header are discarded; each body is the text
+    up to the next header of that level, stripped of surrounding whitespace.
+    """
+    marker = "#" * level + " "
+    collected: list[tuple[str, list[str]]] = []
+    for row in text.split("\n"):
+        if row.startswith(marker):
+            # A new section opens; its body lines accumulate in place.
+            collected.append((row, []))
+        elif collected:
+            collected[-1][1].append(row)
+    return [
+        (header, "\n".join(body_rows).strip()) for header, body_rows in collected
+    ]
+
+
 def analyze(md_path: Path) -> dict:
     text = md_path.read_text(encoding="utf-8")
     n_h1 = _count_headers(text, 1)
@@ -869,20 +907,7 @@ def build_report(
     text_lines = clean_text.split("\n")
 
     # ── Raccolta sezioni ### con corpo ────────────────────────────────────
-    sections: list[tuple[str, str]] = []
-    cur_hdr: str | None = None
-    cur_body: list[str] = []
-    for line in text_lines:
-        if re.match(r"^### ", line):
-            if cur_hdr is not None:
-                sections.append((cur_hdr, "\n".join(cur_body).strip()))
-            cur_hdr = line
-            cur_body = []
-        elif cur_hdr is not None:
-            cur_body.append(line)
-    if cur_hdr is not None:
-        sections.append((cur_hdr, "\n".join(cur_body).strip()))
-
+    sections = _parse_sections_with_body(clean_text, 3)
     lengths = [len(body) for _, body in sections]
 
     # ── Distribuzione lunghezze ───────────────────────────────────────────
@@ -901,9 +926,6 @@ def build_report(
     }
 
     # ── Anomalie ──────────────────────────────────────────────────────────
-    # Header solo-numero senza corpo sostanziale: anomalia solo se il corpo
-    # è vuoto o < 30 chars. Un body lungo è una sezione numerata legittima
-    # (es. aforismi numerati dove il numero è l'identificatore della sezione).
     bare_hdrs = [
         {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
         for hdr, body in sections

From 0a8d98279c50cb8fc3c0d54ac35558350d7544f5 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 11:53:38 +0200
Subject: [PATCH 11/22] feat(conversione): robustezza e 7 nuovi transform
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- check_pdf: file < 1KB, campione esteso 15pp, MemoryError
- convert_pdf: validazione output ≥ 100 char
- analyze: rilevamento gerarchia invertita h3 > h2
- _detect_language: supporto FR/DE/ES
- 7 nuovi transform: fix_math_symbols, remove_recurring_lines,
  normalize_numbered_headings, remove_toc_page_list,
  restore_poetry_lines, demote_verse_headers, remove_watermarks
- bug fix: tabelle MD, garbage headers lowercase, empty headers
- run(): MemoryError / UnicodeDecodeError / PermissionError
---
 conversione/pipeline.py | 331 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 317 insertions(+), 14 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 783f3d7..099acec 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -69,8 +69,11 @@ def check_pdf(pdf_path: Path) -> tuple[bool, str]:
         return False, f"File non trovato: {pdf_path}"
     if pdf_path.suffix.lower() != ".pdf":
         return False, f"Non è un PDF: {pdf_path.name}"
-    if pdf_path.stat().st_size == 0:
+    size = pdf_path.stat().st_size
+    if size == 0:
         return False, "File vuoto"
+    if size < 1024:
+        return False, f"File troppo piccolo ({size} byte) — probabilmente corrotto"
 
     try:
         import pdfplumber
@@ -84,11 +87,26 @@ def check_pdf(pdf_path: Path) -> tuple[bool, str]:
                 if len((pdf.pages[i].extract_text() or "").strip()) > 50
             )
             if pages_with_text == 0:
+                # Estende il campione: copertine immagine o pagine bianche iniziali
+                extended = min(15, n_pages)
+                if extended > sample:
+                    ext_with_text = sum(
+                        1 for i in range(sample, extended)
+                        if len((pdf.pages[i].extract_text() or "").strip()) > 50
+                    )
+                    if ext_with_text > 0:
+                        return True, (
+                            f"{n_pages} pagine — prime {sample} vuote, "
+                            f"testo trovato in pagine successive "
+                            f"(possibile copertina immagine)"
+                        )
                 return False, (
-                    f"Nessun testo nelle prime {sample} pagine "
-                    f"— probabilmente scansionato (usa modalità hybrid)"
+                    f"Nessun testo nelle prime {extended} pagine "
+                    f"— probabilmente scansionato (OCR non supportato)"
                 )
         return True, f"{n_pages} pagine, testo digitale confermato"
+    except MemoryError:
+        return False, "Memoria esaurita durante l'apertura del PDF"
     except Exception as e:
         msg = str(e).lower()
         if "password" in msg or "encrypted" in msg:
@@ -131,6 +149,13 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
             raise RuntimeError(f"Nessun file .md prodotto in {out_dir}")
         md_file = candidates[0]
 
+    content = md_file.read_text(encoding="utf-8", errors="replace").strip()
+    if len(content) < 100:
+        raise RuntimeError(
+            f"opendataloader ha prodotto un file .md quasi vuoto ({len(content)} char) "
+            f"— il PDF potrebbe essere corrotto o non supportato"
+        )
+
     return md_file
 
 
@@ -139,6 +164,9 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
 _TOC_KEYWORDS = frozenset([
     "indice", "index", "contents", "table of contents",
     "sommario", "inhaltsverzeichnis", "inhalt",
+    "indice generale", "indice analitico", "indice dei contenuti",
+    "elenco dei capitoli", "argomenti", "table des matières",
+    "tabla de contenidos", "содержание",
 ])
 
 _ORDINALS_IT = {
@@ -166,6 +194,7 @@ def _is_allcaps_line(line: str) -> bool:
         len(letters) >= 3
         and all(c.isupper() for c in letters)
         and not stripped.startswith("#")
+        and not stripped.startswith("|")   # esclude righe tabella Markdown
     )
 
 
@@ -457,6 +486,48 @@ def _t_extract_capitolo(text: str) -> tuple[str, int]:
     return text, 0
 
 
+_NUMBERED_HDR_RE = re.compile(
+    r"^(#{1,6})[ \t]+(\d+(?:\.\d+)*)\.[ \t]+(.+)$",
+    re.MULTILINE,
+)
+
+
+def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
+    """Re-level decimal-numbered headers ('## 1. A', '#### 1.1. B') by depth.
+
+    Depth is the number of dot-separated components. The shallowest depth
+    keeps the smallest level used at that depth; each extra component adds
+    one '#', capped at 6. No-op unless at least two depths occur.
+    Returns (new_text, number_of_headers_rewritten).
+    """
+    # [ \t]+ (not \s+) keeps a match on one line: a bare '#### 1.1.' header
+    # must not swallow the paragraph that follows it as its title.
+    found = [
+        (m.group(2).count(".") + 1, len(m.group(1)))
+        for m in _NUMBERED_HDR_RE.finditer(text)
+    ]
+    if not found:
+        return text, 0
+
+    min_depth = min(depth for depth, _ in found)
+    if max(depth for depth, _ in found) == min_depth:
+        return text, 0
+
+    base_level = min(level for depth, level in found if depth == min_depth)
+    count = 0
+
+    def _repl(m: re.Match) -> str:
+        nonlocal count
+        hashes, num, title = m.group(1), m.group(2), m.group(3)
+        new_level = min(base_level + num.count(".") + 1 - min_depth, 6)
+        if new_level == len(hashes):
+            return m.group(0)
+        count += 1
+        return f"{'#' * new_level} {num}. {title}"
+
+    return _NUMBERED_HDR_RE.sub(_repl, text), count
+
+
 def _t_normalize_header_levels(text: str) -> tuple[str, int]:
     """Normalizza h4+ → h3; rimuove header vuoti."""
     text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
@@ -519,6 +590,30 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
     return "\n".join(new_lines), 1 if removed else 0
 
 
+def _t_remove_toc_page_list(text: str) -> tuple[str, int]:
+    """Drop list items that look like TOC entries ending in a page number.
+
+    Targets lines such as '- Canto I 1', and two entries fused on one line
+    ('- Canto XXIX 119 - Canto XXX 123'), which is counted as two removals.
+    Returns (text_without_those_lines, number_of_entries_removed).
+    """
+    fused_re = r"^\s*-\s+.{2,50}\s+\d{1,4}\s+-\s+.{2,50}\s+\d{1,4}\s*$"
+    single_re = r"^\s*-\s+\S.{1,60}\s+\d{1,4}\s*$"
+    kept: list[str] = []
+    removed = 0
+    for raw in text.split("\n"):
+        candidate = raw.strip()
+        if re.match(fused_re, candidate):
+            # '- X N - Y M': one physical line holding two TOC entries.
+            removed += 2
+        elif re.match(single_re, candidate):
+            # '- Title NN': a single entry with a trailing page number.
+            removed += 1
+        else:
+            kept.append(raw)
+    return "\n".join(kept), removed
+
+
 def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
     """Converti righe ALL-CAPS standalone → ## header."""
     count = 0
@@ -619,10 +714,11 @@ def _t_merge_paragraphs(text: str) -> tuple[str, int]:
             i + 1 < len(blocks)
             and stripped
             and not stripped.startswith("#")
+            and not stripped.startswith("|")   # non unire righe tabella in avanti
             and stripped[-1] not in _SENTENCE_END
         ):
             nxt = blocks[i + 1].strip()
-            if not nxt or nxt.startswith("#") or re.match(r"^\d+\.", nxt):
+            if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt):
                 break
             b = stripped + " " + nxt
             stripped = b.strip()
@@ -651,6 +747,97 @@ def _t_collapse_blank_lines(text: str) -> tuple[str, int]:
     return re.sub(r"\n{3,}", "\n\n", text), 0
 
 
+def _t_demote_verse_headers(text: str) -> tuple[str, int]:
+    """Turn headers that are really blocks of verse back into plain text.
+
+    A header line (1-6 '#', content of 20+ chars) is demoted when both hold:
+      - it ends with a bare 1-4 digit number (taken to be a verse number);
+      - the rest contains punctuation followed by whitespace and a letter
+        or opening quote, i.e. it looks like several verses on one line.
+    The '#' prefix and the trailing number are dropped.
+    Returns (new_text, number_of_headers_demoted).
+    """
+    count = 0
+    trailing_num = re.compile(r"\s\d{1,4}\s*$")
+
+    def _demote(m: re.Match) -> str:
+        nonlocal count
+        content = m.group(2).strip()
+        if not trailing_num.search(content):
+            return m.group(0)
+        # Strip the verse number once; reused for the check and the result.
+        verse = trailing_num.sub("", content)
+        if not re.search(r"[,;:.!?»\"\']\s+[A-Za-zÀ-ÿ«\"]", verse):
+            return m.group(0)
+        count += 1
+        return verse
+
+    # [ \t]+ (not \s+): the header marker and its content must share a
+    # line, so an empty '##' cannot capture the paragraph below it.
+    text = re.sub(
+        r"^(#{1,6})[ \t]+(.{20,})$",
+        _demote,
+        text,
+        flags=re.MULTILINE,
+    )
+    return text, count
+
+
+def _t_restore_poetry_lines(text: str) -> tuple[str, int]:
+    """Re-insert line breaks at inline verse numbers inside paragraphs.
+
+    Poetry flattened into one paragraph keeps its verse numbers inline, e.g.
+      'smarrita. 3 Ahi quanto a dir ... paura! 6 Tant'è ...'
+    A non-header block qualifies when it holds at least two such numbers,
+    their successive differences take at most two distinct values, and the
+    first difference ('step') is between 1 and 5. Each number is replaced by
+    a newline, or by a blank line when the number is a multiple of step * 3.
+    Returns (new_text, number_of_verse_numbers_replaced).
+    """
+    # A verse number: end-of-verse punctuation, whitespace, digits,
+    # whitespace, then a letter or opening quote (either case is accepted).
+    verse_num_re = re.compile(
+        r'([.!?»\'\"]\s+)(\d+)(\s+)(?=[A-ZÀ-Ùa-zà-ù«"‟])'
+    )
+
+    def _is_progression(values: list[int]) -> bool:
+        # Callers pass at least two values, so gaps is never empty.
+        gaps = [b - a for a, b in zip(values, values[1:])]
+        return len(set(gaps)) <= 2 and 1 <= gaps[0] <= 5
+
+    replaced = 0
+    out = []
+
+    for block in text.split("\n\n"):
+        body = block.strip()
+        if not body or body.startswith("#"):
+            hits = []
+        else:
+            hits = list(verse_num_re.finditer(body))
+        numbers = [int(h.group(2)) for h in hits]
+        if len(hits) < 2 or not _is_progression(numbers):
+            # Headers, empty blocks and ordinary prose pass through untouched.
+            out.append(block)
+            continue
+
+        # The first gap is the numbering step; blank lines fall every 3 steps.
+        period = (numbers[1] - numbers[0]) * 3
+
+        def _break_at(m: re.Match, period: int = period) -> str:
+            # NOTE(review): with numbers every 3 verses this yields a blank
+            # line every 9 verses, not per tercet; confirm this is intended.
+            gap = "\n\n" if int(m.group(2)) % period == 0 else "\n"
+            return m.group(1).rstrip() + gap
+
+        # The qualifying block is emitted stripped, as the rewrite works on it.
+        rewritten = verse_num_re.sub(_break_at, body)
+        if rewritten != body:
+            replaced += len(hits)
+        out.append(rewritten)
+
+    return "\n\n".join(out), replaced
+
+
 def _t_remove_urls(text: str) -> tuple[str, int]:
     """Rimuovi righe che sono solo URL (watermark, footer di piattaforme)."""
     return re.sub(r"(?m)^(https?://|www\.)\S+\s*$", "", text), 0
@@ -664,7 +851,14 @@ def _t_remove_empty_headers(text: str) -> tuple[str, int]:
         stripped = block.strip()
         if re.match(r"^#{1,6} ", stripped) and "\n" not in stripped:
             next_stripped = blocks[i + 1].strip() if i + 1 < len(blocks) else ""
-            if not next_stripped or re.match(r"^#{1,6} ", next_stripped):
+            # Non rimuovere un header breve se il successivo è un header molto lungo
+            # (> 80 char): quasi certamente è testo PDF mal classificato come heading.
+            next_is_long_header = (
+                re.match(r"^#{1,6} ", next_stripped) and len(next_stripped) > 80
+            )
+            if not next_stripped or (
+                re.match(r"^#{1,6} ", next_stripped) and not next_is_long_header
+            ):
                 continue
         cleaned.append(block)
     return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), 0
@@ -686,6 +880,11 @@ def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
             return True
         if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
             return True
+        # Frammento di frase: inizia con minuscola ed è abbastanza lungo
+        # (testo spezzato dalla tabella che opendataloader ha promosso a heading)
+        first_alpha = next((c for c in content if c.isalpha()), None)
+        if first_alpha and first_alpha.islower() and len(content) > 40:
+            return True
         return False
 
     count = 0
@@ -728,6 +927,58 @@ def _t_remove_frontmatter(text: str) -> tuple[str, int]:
     return re.sub(r"\n{3,}", "\n\n", "\n\n".join(cleaned)), count
 
 
+_WATERMARK_RE = re.compile(
+    r"^(BOZZA|DRAFT|CONFIDENTIAL|RISERVATO|PROVVISORIO|SAMPLE|SPECIMEN"
+    r"|DO NOT DISTRIBUTE|NON DISTRIBUIRE|COPY|COPIA)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+
+def _t_remove_watermarks(text: str) -> tuple[str, int]:
+    """Drop lines consisting solely of a known watermark word or phrase.
+
+    Matching is case-insensitive and tolerates trailing whitespace only.
+    Returns (text_without_those_lines, number_of_lines_removed).
+    """
+    source = text.split("\n")
+    # Lines are tested unstripped, so an indented watermark word is kept.
+    survivors = [ln for ln in source if not _WATERMARK_RE.match(ln)]
+    return "\n".join(survivors), len(source) - len(survivors)
+
+
+def _t_fix_math_symbols(text: str) -> tuple[str, int]:
+    """Drop non-blank lines made only of box/bullet glyphs and whitespace."""
+    glyph_only = re.compile(r"^[\s□■▪▫◆◇●○•\u25a0-\u25ff]+$")
+    kept: list[str] = []
+    for row in text.split("\n"):
+        # Blank rows are structural and must survive; only glyph rows go.
+        if not row.strip() or not glyph_only.match(row):
+            kept.append(row)
+    # Every row not kept was removed, so the count is the length difference.
+    return "\n".join(kept), text.count("\n") + 1 - len(kept)
+
+
+def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
+    """Drop short lines repeated 3+ times (likely running page headers/footers).
+
+    Candidates are stripped lines of 4-79 chars. Headers ('#') and Markdown
+    table rows ('|') are never candidates: separator rows such as '|---|---|'
+    legitimately repeat across tables and removing them breaks the tables.
+    Returns (text_without_those_lines, number_of_lines_removed).
+    """
+    from collections import Counter
+
+    def _candidate(stripped: str) -> bool:
+        return 3 < len(stripped) < 80 and not stripped.startswith(("#", "|"))
+
+    lines = text.split("\n")
+    freq = Counter(s for s in (ln.strip() for ln in lines) if _candidate(s))
+    recurring = {s for s, seen in freq.items() if seen >= 3}
+    # With no recurring line the filter keeps everything and removes 0.
+    kept = [ln for ln in lines if ln.strip() not in recurring]
+    return "\n".join(kept), len(lines) - len(kept)
+
+
 # ─── [3b] Pipeline delle trasformazioni ──────────────────────────────────────
 
 def apply_transforms(text: str) -> tuple[str, dict]:
@@ -746,26 +997,33 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         ("n_accenti_corretti",          _t_fix_accents),
         ("n_moltiplicazioni_corrette",  _t_fix_multiplication),
         ("n_micro_corretti",            _t_fix_micro),
+        ("n_simboli_math_rimossi",      _t_fix_math_symbols),
         ("n_formule_rimossi",           _t_remove_formula_labels),
         ("n_dotleader_rimossi",         _t_remove_dotleaders),
+        ("n_righe_ricorrenti_rimosse",  _t_remove_recurring_lines),
         ("n_header_concat_fixati",      _t_fix_header_concat),
         (None,                          _t_extract_capitolo),
+        ("n_header_numerati_normalizzati", _t_normalize_numbered_headings),
         (None,                          _t_normalize_header_levels),
         ("n_articoli_estratti",         _t_extract_articles),
         (None,                          _t_remove_header_bold),
         (None,                          _t_normalize_allcaps_headers),
         ("toc_rimosso",                 _t_remove_toc),
+        ("n_toc_page_list_rimossi",     _t_remove_toc_page_list),
         ("n_header_allcaps",            _t_allcaps_to_headers),
         ("n_sezioni_numerate",          partial(_t_numbered_sections, has_exercises=_has_ex)),
         ("n_ambienti_matematici",       _t_extract_math),
         ("n_paragrafi_uniti",           _t_merge_paragraphs),
         (None,                          _t_normalize_whitespace),
         (None,                          _t_collapse_blank_lines),
+        ("n_versi_ripristinati",        _t_restore_poetry_lines),
+        ("n_header_verso_demotati",     _t_demote_verse_headers),
         (None,                          _t_remove_urls),
         (None,                          _t_remove_empty_headers),
         ("n_titoli_uniti",              _t_merge_title_headers),
         ("n_garbage_headers_rimossi",   _t_remove_garbage_headers),
         ("n_frontmatter_rimossi",       _t_remove_frontmatter),
+        ("n_watermark_rimossi",         _t_remove_watermarks),
     ]
 
     stats: dict = {}
@@ -792,16 +1050,35 @@ _EN_WORDS = frozenset([
     "from", "or", "an", "but", "not", "by", "he", "she", "we", "you",
     "which", "their", "been", "has", "would", "there", "when", "will",
 ])
+_FR_WORDS = frozenset([
+    "le", "les", "de", "du", "des", "et", "un", "une", "est", "que",
+    "pour", "dans", "sur", "avec", "qui", "par", "pas", "plus", "au",
+    "ce", "se", "ou", "mais", "comme", "aussi",
+])
+_DE_WORDS = frozenset([
+    "der", "die", "das", "und", "in", "von", "zu", "den", "mit", "ist",
+    "auf", "eine", "als", "dem", "des", "sich", "nicht", "auch", "werden",
+    "bei", "nach", "oder", "wenn", "wird", "war",
+])
+_ES_WORDS = frozenset([  # ASCII only: the tokenizer matches [a-zA-Z]{2,}
+    "el", "los", "las", "de", "en", "un", "una", "es", "que", "por",
+    "con", "del", "para", "como", "pero", "sus", "son", "muy", "hay",
+    "todo", "esta", "este", "ser", "sobre", "ya",
+])
 
 
 def _detect_language(text: str) -> str:
     words = re.findall(r"\b[a-zA-Z]{2,}\b", text.lower())
     sample = words[:2000]
-    it = sum(1 for w in sample if w in _IT_WORDS)
-    en = sum(1 for w in sample if w in _EN_WORDS)
-    if it == 0 and en == 0:
-        return "unknown"
-    return "it" if it >= en else "en"
+    scores = {
+        "it": sum(1 for w in sample if w in _IT_WORDS),
+        "en": sum(1 for w in sample if w in _EN_WORDS),
+        "fr": sum(1 for w in sample if w in _FR_WORDS),
+        "de": sum(1 for w in sample if w in _DE_WORDS),
+        "es": sum(1 for w in sample if w in _ES_WORDS),
+    }
+    best = max(scores, key=scores.get)
+    return best if scores[best] > 0 else "unknown"
 
 
 def _count_headers(text: str, level: int) -> int:
@@ -850,6 +1127,17 @@ def analyze(md_path: Path) -> dict:
     if n_h3 >= 5:
         livello, boundary, strategia = 3, "h3", "h3_aware"
         section_bodies = _split_sections(text, 3)
+        # Gerarchia invertita: h3 sono capitoli enormi, h2 sono sottosezioni più brevi.
+        # Succede quando opendataloader classifica titoli capitolo come h6 (→ normalizzati
+        # a h3) e le sottosezioni ALL-CAPS diventano ## (h2). In questo caso h2 è
+        # il boundary corretto per il chunking.
+        if n_h2 >= 3:
+            h2_bodies = _split_sections(text, 2)
+            avg_h3 = sum(len(b) for b in section_bodies) / len(section_bodies) if section_bodies else 0
+            avg_h2 = sum(len(b) for b in h2_bodies) / len(h2_bodies) if h2_bodies else 0
+            if avg_h3 > 5000 and avg_h2 < avg_h3 * 0.7:
+                livello, boundary, strategia = 2, "h2", "h2_paragraph_split"
+                section_bodies = h2_bodies
     elif n_h2 >= 3:
         livello, boundary, strategia = 2, "h2", "h2_paragraph_split"
         section_bodies = _split_sections(text, 2)
@@ -1035,10 +1323,17 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     with tempfile.TemporaryDirectory() as tmp:
         try:
             md_file = convert_pdf(pdf_path, Path(tmp))
+        except MemoryError:
+            print("  ✗ Memoria esaurita durante la conversione")
+            return False
         except Exception as e:
             print(f"  ✗ Conversione fallita: {e}")
             return False
-        raw_text = md_file.read_text(encoding="utf-8")
+        try:
+            raw_text = md_file.read_text(encoding="utf-8")
+        except UnicodeDecodeError as e:
+            print(f"  ✗ Errore encoding nel file prodotto: {e}")
+            return False
 
     size_kb = len(raw_text.encode()) // 1024
     n_lines = raw_text.count("\n")
@@ -1052,10 +1347,14 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
     print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
     print(f"     Header concat fixati:  {t_stats['n_header_concat_fixati']}")
+    print(f"     Header num. normaliz.: {t_stats['n_header_numerati_normalizzati']}")
     print(f"     Articoli → ###:        {t_stats['n_articoli_estratti']}")
     print(f"     Ambienti matematici:   {t_stats['n_ambienti_matematici']}")
     print(f"     Titoli header uniti:   {t_stats['n_titoli_uniti']}")
     print(f"     TOC rimosso:           {'sì' if t_stats['toc_rimosso'] else 'no'}")
+    print(f"     TOC voci pagina rim.:  {t_stats['n_toc_page_list_rimossi']}")
+    print(f"     Versi poesia riprist.: {t_stats['n_versi_ripristinati']}")
+    print(f"     Header verso demotati: {t_stats['n_header_verso_demotati']}")
     print(f"     ALL-CAPS → ##:         {t_stats['n_header_allcaps']}")
     print(f"     Sezioni → ###:         {t_stats['n_sezioni_numerate']}")
     print(f"     Paragrafi uniti:       {t_stats['n_paragrafi_uniti']}")
@@ -1063,9 +1362,13 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
 
     # ── [4] Profilo strutturale ────────────────────────────────────────────
     print("  [4/4] Analisi struttura...")
-    out_dir.mkdir(parents=True, exist_ok=True)
-    raw_out.write_text(raw_text, encoding="utf-8")
-    clean_out.write_text(clean_text, encoding="utf-8")
+    try:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        raw_out.write_text(raw_text, encoding="utf-8")
+        clean_out.write_text(clean_text, encoding="utf-8")
+    except PermissionError as e:
+        print(f"  ✗ Permesso negato durante la scrittura: {e}")
+        return False
     profile = analyze(clean_out)
 
     _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}

From ef8f56fdba1c6b8f1fdb14eed8bcdf494469a830 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 12:06:19 +0200
Subject: [PATCH 12/22] fix(conversione): 5 fix robustezza e precisione
 transform
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _t_remove_footnotes: rimuove marcatori superscript inline e righe
  corpo-nota (¹ testo, [N] testo) — nuovo transform in posizione early
- _t_numbered_sections: esclude voci bibliografiche (anno, pp., vol.,
  DOI, ISBN) dalla promozione a ### header
- _t_remove_toc: intercetta voci con numero pagina finale nel contesto
  TOC — _t_remove_toc_page_list tolto dalla lista dei transform
- _t_remove_frontmatter: limitata ai primi blocchi del documento (~20%, min 5, max 15)
- _t_remove_recurring_lines: soglia 3->5, Counter spostato a top-level
---
 conversione/pipeline.py | 53 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index 099acec..e207b28 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -30,6 +30,7 @@ import re
 import subprocess
 import sys
 import tempfile
+from collections import Counter
 from datetime import datetime
 from functools import partial
 from pathlib import Path
@@ -378,6 +379,31 @@ def _t_remove_images(text: str) -> tuple[str, int]:
     return text, n
 
 
+# Superscript Unicode: ¹²³⁴⁵⁶⁷⁸⁹⁰
+_SUPERSCRIPT_RE = re.compile(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+')
+# Riga corpo-nota: inizia con superscript o [N]
+_FOOTNOTE_BODY_RE = re.compile(
+    r'^([\u00b9\u00b2\u00b3\u2070\u2074-\u2079]+\s+|\[\d{1,3}\]\s+)'
+)
+
+
+def _t_remove_footnotes(text: str) -> tuple[str, int]:
+    """Rimuovi marcatori footnote superscript inline e righe corpo-nota."""
+    lines = text.split("\n")
+    result, count = [], 0
+    for line in lines:
+        stripped = line.strip()
+        # Corpo nota: riga breve che inizia con ¹ o [N]
+        if stripped and _FOOTNOTE_BODY_RE.match(stripped) and len(stripped) < 300:
+            count += 1
+            continue
+        cleaned = _SUPERSCRIPT_RE.sub("", line)
+        if cleaned != line:
+            count += 1
+        result.append(cleaned)
+    return "\n".join(result), count
+
+
 def _t_fix_br(text: str) -> tuple[str, int]:
     n = len(re.findall(r"<br>", text, re.IGNORECASE))
     text = re.sub(r"<br>\s*", " ", text, flags=re.IGNORECASE)
@@ -585,6 +611,9 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
         if _in_toc:
             if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                 continue
+            # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
+            if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
+                continue
             _in_toc = False
         new_lines.append(line)
     return "\n".join(new_lines), 1 if removed else 0
@@ -637,6 +666,13 @@ def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
     return "\n\n".join(new_blocks), count
 
 
+_BIB_MARKERS_RE = re.compile(
+    r'\b(pp?\.|vol\.|n\.\s*\d|ed\.|edn\.|ISBN|DOI|arXiv)\b'
+    r'|\b(19|20)\d{2}\b',
+    re.IGNORECASE,
+)
+
+
 def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, int]:
     """Converti sezioni numerate 'N. testo' / '- N. testo' / '- N testo' → ### header."""
     count = 0
@@ -646,6 +682,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
         content = m.group(2).strip()
         if content.endswith(".") and len(content) > 40:
             return m.group(0)
+        if _BIB_MARKERS_RE.search(content):
+            return m.group(0)
         count += 1
         return f"### {m.group(1)}.\n\n{content}"
 
@@ -912,8 +950,14 @@ def _t_remove_frontmatter(text: str) -> tuple[str, int]:
     blocks = re.split(r"\n{2,}", text)
     cleaned = []
     count = 0
+    total = len(blocks)
+    cutoff = max(5, min(15, int(total * 0.20)))
     for i, block in enumerate(blocks):
         stripped = block.strip()
+        # Frontmatter compare solo nelle prime sezioni del documento
+        if i >= cutoff:
+            cleaned.append(block)
+            continue
         if not re.match(r"^### ", stripped) or re.match(r"^### \d", stripped):
             cleaned.append(block)
             continue
@@ -959,15 +1003,14 @@ def _t_fix_math_symbols(text: str) -> tuple[str, int]:
 
 
 def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
-    """Rimuovi righe corte che si ripetono ≥3 volte (header/footer di pagina)."""
-    from collections import Counter
+    """Rimuovi righe corte che si ripetono ≥5 volte (header/footer di pagina)."""
     lines = text.split("\n")
     short_lines = [
         ln.strip() for ln in lines
         if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
     ]
     freq = Counter(short_lines)
-    recurring = {ln for ln, c in freq.items() if c >= 3}
+    recurring = {ln for ln, c in freq.items() if c >= 5}
     if not recurring:
         return text, 0
     result, count = [], 0
@@ -994,6 +1037,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         ("n_immagini_rimosse",          _t_remove_images),
         ("n_br_rimossi",                _t_fix_br),
         ("n_tabsep_rimossi",            _t_fix_tabsep),
+        ("n_note_rimosse",              _t_remove_footnotes),
         ("n_accenti_corretti",          _t_fix_accents),
         ("n_moltiplicazioni_corrette",  _t_fix_multiplication),
         ("n_micro_corretti",            _t_fix_micro),
@@ -1009,7 +1053,6 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         (None,                          _t_remove_header_bold),
         (None,                          _t_normalize_allcaps_headers),
         ("toc_rimosso",                 _t_remove_toc),
-        ("n_toc_page_list_rimossi",     _t_remove_toc_page_list),
         ("n_header_allcaps",            _t_allcaps_to_headers),
         ("n_sezioni_numerate",          partial(_t_numbered_sections, has_exercises=_has_ex)),
         ("n_ambienti_matematici",       _t_extract_math),
@@ -1344,6 +1387,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     clean_text, t_stats = apply_transforms(raw_text)
     reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
     print(f"  ✅ Immagini rimosse:      {t_stats['n_immagini_rimosse']}")
+    print(f"     Note rimossa:          {t_stats['n_note_rimosse']}")
     print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
     print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
     print(f"     Header concat fixati:  {t_stats['n_header_concat_fixati']}")
@@ -1352,7 +1396,6 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     print(f"     Ambienti matematici:   {t_stats['n_ambienti_matematici']}")
     print(f"     Titoli header uniti:   {t_stats['n_titoli_uniti']}")
     print(f"     TOC rimosso:           {'sì' if t_stats['toc_rimosso'] else 'no'}")
-    print(f"     TOC voci pagina rim.:  {t_stats['n_toc_page_list_rimossi']}")
     print(f"     Versi poesia riprist.: {t_stats['n_versi_ripristinati']}")
     print(f"     Header verso demotati: {t_stats['n_header_verso_demotati']}")
     print(f"     ALL-CAPS → ##:         {t_stats['n_header_allcaps']}")

From cdb2d4cab9b2f5a451831283e6dc0b914606e66c Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 13:44:30 +0200
Subject: [PATCH 13/22] fix(conversione): PUA Symbol, garbage headers,
 merge+bib guard, math EN

---
 conversione/pipeline.py | 223 +++++++++++++++++++++++++++++++---------
 conversione/validate.py |   2 +
 2 files changed, 176 insertions(+), 49 deletions(-)

diff --git a/conversione/pipeline.py b/conversione/pipeline.py
index e207b28..eedf436 100644
--- a/conversione/pipeline.py
+++ b/conversione/pipeline.py
@@ -238,8 +238,9 @@ def _extract_math_environments(text: str) -> tuple[str, int]:
     Deve girare PRIMA del merge paragrafi (step 5) per sfruttare i blocchi intatti.
     """
     _ENVS = (
-        r"Definizione|Teorema|Lemma|Proposizione|"
-        r"Corollario|Osservazione|Nota|Esempio"
+        r"Definizione|Definition|Teorema|Theorem|Lemma|"
+        r"Proposizione|Proposition|Corollario|Corollary|"
+        r"Osservazione|Remark|Nota|Note|Esempio|Example"
     )
     count  = 0
     blocks = text.split("\n\n")
@@ -373,6 +374,127 @@ def _extract_article_headers(text: str) -> tuple[str, int]:
 
 # ─── [3a] Funzioni di trasformazione ─────────────────────────────────────────
 
+# Mapping PUA Unicode (U+F020-U+F0FF) → simboli corretti per font Symbol/Wingdings.
+# Il font Symbol di Windows codifica lettere greche e operatori matematici nel
+# range Private Use Area invece dei codepoint Unicode standard.
+_SYMBOL_PUA_MAP: dict[str, str] = {
+    "\uf020": " ",   # space
+    "\uf028": "(",
+    "\uf029": ")",
+    "\uf02b": "+",
+    "\uf02d": "\u2212",  # minus
+    "\uf02e": ".",
+    "\uf02f": "/",
+    "\uf030": "0", "\uf031": "1", "\uf032": "2", "\uf033": "3", "\uf034": "4",
+    "\uf035": "5", "\uf036": "6", "\uf037": "7", "\uf038": "8", "\uf039": "9",
+    "\uf03a": ":", "\uf03b": ";", "\uf03c": "<", "\uf03d": "=", "\uf03e": ">",
+    "\uf040": "\u2245",  # congruent
+    "\uf041": "\u0391",  # Alpha
+    "\uf042": "\u0392",  # Beta
+    "\uf043": "\u03a7",  # Chi
+    "\uf044": "\u0394",  # Delta
+    "\uf045": "\u0395",  # Epsilon
+    "\uf046": "\u03a6",  # Phi
+    "\uf047": "\u0393",  # Gamma
+    "\uf048": "\u0397",  # Eta
+    "\uf049": "\u0399",  # Iota
+    "\uf04a": "\u03d1",  # theta variant
+    "\uf04b": "\u039a",  # Kappa
+    "\uf04c": "\u039b",  # Lambda
+    "\uf04d": "\u039c",  # Mu
+    "\uf04e": "\u039d",  # Nu
+    "\uf04f": "\u039f",  # Omicron
+    "\uf050": "\u03a0",  # Pi
+    "\uf051": "\u0398",  # Theta
+    "\uf052": "\u03a1",  # Rho
+    "\uf053": "\u03a3",  # Sigma
+    "\uf054": "\u03a4",  # Tau
+    "\uf055": "\u03a5",  # Upsilon
+    "\uf056": "\u03c2",  # sigma final
+    "\uf057": "\u03a9",  # Omega
+    "\uf058": "\u039e",  # Xi
+    "\uf059": "\u03a8",  # Psi
+    "\uf05a": "\u0396",  # Zeta
+    "\uf05b": "[",
+    "\uf05c": "\u2234",  # therefore
+    "\uf05d": "]",
+    "\uf05e": "\u22a5",  # perpendicular
+    "\uf061": "\u03b1",  # alpha
+    "\uf062": "\u03b2",  # beta
+    "\uf063": "\u03c7",  # chi
+    "\uf064": "\u03b4",  # delta
+    "\uf065": "\u03b5",  # epsilon
+    "\uf066": "\u03c6",  # phi
+    "\uf067": "\u03b3",  # gamma
+    "\uf068": "\u03b7",  # eta
+    "\uf069": "\u03b9",  # iota
+    "\uf06a": "\u03d5",  # phi variant
+    "\uf06b": "\u03ba",  # kappa
+    "\uf06c": "\u03bb",  # lambda
+    "\uf06d": "\u03bc",  # mu
+    "\uf06e": "\u03bd",  # nu
+    "\uf06f": "\u03bf",  # omicron
+    "\uf070": "\u03c0",  # pi
+    "\uf071": "\u03b8",  # theta
+    "\uf072": "\u03c1",  # rho
+    "\uf073": "\u03c3",  # sigma
+    "\uf074": "\u03c4",  # tau
+    "\uf075": "\u03c5",  # upsilon
+    "\uf076": "\u03d6",  # pi symbol
+    "\uf077": "\u03c9",  # omega
+    "\uf078": "\u03be",  # xi
+    "\uf079": "\u03c8",  # psi
+    "\uf07a": "\u03b6",  # zeta
+    "\uf07b": "{",
+    "\uf07c": "|",
+    "\uf07d": "}",
+    "\uf07e": "~",
+    "\uf0b1": "\u00b1",  # plus-minus
+    "\uf0b7": "\u2022",  # bullet
+    "\uf0ba": "\u221a",  # square root
+    "\uf0bc": "\u2264",  # less or equal
+    "\uf0bd": "\u2265",  # greater or equal
+    "\uf0be": "\u221d",  # proportional
+    "\uf0d7": "\u00d7",  # multiplication
+    "\uf0f7": "\u00f7",  # division
+    "\uf0b4": "\u00d7",  # alternate multiply
+    "\uf0bb": "\u2260",  # not equal
+    "\uf0b9": "\u2260",  # not equal alternate
+    "\uf0b3": "\u2265",  # greater or equal alternate
+    "\uf0b2": "\u2032",  # prime
+    "\uf02a": "*",
+    "\uf02c": ",",
+    "\uf0a3": "\u2264",  # less or equal (Symbol 0xA3)
+    "\uf0a7": "\u2022",  # bullet (Wingdings 0xA7)
+    "\uf0a8": "\u2022",  # bullet variant
+    "\uf0ae": "\u2192",  # right arrow (Symbol 0xAE)
+    "\uf0b8": "\u00f7",  # division / range separator
+    "\uf0eb": "",        # Wingdings decorative icon (rimosso)
+    "\uf0f0": "\u2192",  # right arrow variant
+    "\uf0db": "",        # bracket extension piece (non ricostruibile)
+    "\uf0dc": "",        # bracket extension piece
+    "\uf0dd": "",        # bracket extension piece
+    "\uf0de": "",        # brace middle piece (non ricostruibile)
+    "\uf0df": "",        # brace extension piece
+}
+
+_SYMBOL_PUA_RE = re.compile(
+    "[" + "".join(re.escape(k) for k in _SYMBOL_PUA_MAP) + "]"
+)
+
+
+def _t_fix_symbol_font(text: str) -> tuple[str, int]:
+    """Rimappa caratteri PUA font Symbol (U+F020-U+F0FF) in simboli Unicode corretti."""
+    count = [0]
+
+    def _repl(m: re.Match) -> str:
+        count[0] += 1
+        return _SYMBOL_PUA_MAP[m.group(0)]
+
+    result = _SYMBOL_PUA_RE.sub(_repl, text)
+    return result, count[0]
+
+
 def _t_remove_images(text: str) -> tuple[str, int]:
     n = len(re.findall(r"!\[[^\]]*\]\([^)]*\)", text))
     text = re.sub(r"!\[[^\]]*\]\([^)]*\)\s*", "", text)
@@ -555,7 +677,7 @@ def _t_normalize_numbered_headings(text: str) -> tuple[str, int]:
 
 
 def _t_normalize_header_levels(text: str) -> tuple[str, int]:
-    """Normalizza h4+ → h3; rimuove header vuoti."""
+    """Normalizza h4+ → h3; rimuove header vuoti; rimuove numero pagina '| N' finale."""
     text = re.sub(r"^#{3,6}\s*$", "", text, flags=re.MULTILINE)
     text = re.sub(
         r"^(#{3,6})\s+(\d{1,3})\s+(.+)$",
@@ -611,37 +733,19 @@ def _t_remove_toc(text: str) -> tuple[str, int]:
         if _in_toc:
             if re.match(r"^\s*$", line) or re.match(r"^\s*[-*+]\s+\d", line):
                 continue
-            # Voce TOC con numero pagina finale (sicuro: siamo già in contesto TOC)
+            # Voce TOC con numero pagina finale (sicuro: siamo gia in contesto TOC)
             if re.match(r"^\s*[-*+]\s+.{2,70}\s+\d{1,3}\s*$", line):
                 continue
+            # Riga di testo lungo = probabilmente abstract o corpo, non voce di indice
+            if len(line.strip()) > 200:
+                _in_toc = False
+                new_lines.append(line)
+                continue
             _in_toc = False
         new_lines.append(line)
     return "\n".join(new_lines), 1 if removed else 0
 
 
-def _t_remove_toc_page_list(text: str) -> tuple[str, int]:
-    """Rimuovi voci lista TOC con numero di pagina finale.
-
-    Intercetta indici come '- Canto I 1', '- Canto XXIX 119' (eventualmente
-    fusi su una riga: '- Canto XXIX 119 - Canto XXX 123') che opendataloader
-    non separa dall'indice del PDF.
-    """
-    count = 0
-    lines = text.split("\n")
-    new_lines = []
-    for line in lines:
-        stripped = line.strip()
-        # Voce TOC fusa: "- X N - Y M" — le separiamo e le scartiamo entrambe
-        if re.match(r"^\s*-\s+.{2,50}\s+\d{1,4}\s+-\s+.{2,50}\s+\d{1,4}\s*$", stripped):
-            count += 2
-            continue
-        # Voce TOC semplice: "- Testo ... NN" dove NN è un numero pagina isolato
-        if re.match(r"^\s*-\s+\S.{1,60}\s+\d{1,4}\s*$", stripped):
-            count += 1
-            continue
-        new_lines.append(line)
-    return "\n".join(new_lines), count
-
 
 def _t_allcaps_to_headers(text: str) -> tuple[str, int]:
     """Converti righe ALL-CAPS standalone → ## header."""
@@ -701,8 +805,11 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
     if not has_exercises:
         def _aphorism_repl(m: re.Match) -> str:
             nonlocal count
+            content = m.group(2).strip()
+            if _BIB_MARKERS_RE.search(content):
+                return m.group(0)
             count += 1
-            return f"\n\n### {m.group(1)}.\n\n{m.group(2).strip()}"
+            return f"\n\n### {m.group(1)}.\n\n{content}"
 
         text = re.sub(
             r"^-\s+(\d{1,3})\.\s+(.{10,})$",
@@ -715,6 +822,8 @@ def _t_numbered_sections(text: str, has_exercises: bool = False) -> tuple[str, i
         nonlocal count
         num = m.group(1)
         content = m.group(2).strip()
+        if _BIB_MARKERS_RE.search(content):
+            return m.group(0)
         count += 1
         split = re.search(r"(?<=[a-zàèéìíòóùú])\s+(?=[A-ZÀÈÉÌÍÒÓÙÚ])", content)
         if split and split.start() >= 3:
@@ -756,7 +865,7 @@ def _t_merge_paragraphs(text: str) -> tuple[str, int]:
             and stripped[-1] not in _SENTENCE_END
         ):
             nxt = blocks[i + 1].strip()
-            if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt):
+            if not nxt or nxt.startswith("#") or nxt.startswith("|") or re.match(r"^\d+\.", nxt) or re.match(r"^[-*+]\s", nxt):
                 break
             b = stripped + " " + nxt
             stripped = b.strip()
@@ -912,17 +1021,22 @@ def _t_remove_garbage_headers(text: str) -> tuple[str, int]:
     def _is_garbage_header(content: str) -> bool:
         if content.lstrip().startswith("..."):
             return True
-        if not re.search(r"[A-Za-zÀ-ÿ]{2,}", content):
+        if not re.search(r"[A-Za-zÀ-ÿ\u0391-\u03c9]{2,}", content):
             return True
         if re.fullmatch(r"\(?\s*[A-Za-z]{1,4}\s*\)?", content.strip()):
             return True
         if len(content) > 60 and re.search(r"[!%#]\w|\w[!%#]|\b\w+-\s*\w", content):
             return True
-        # Frammento di frase: inizia con minuscola ed è abbastanza lungo
-        # (testo spezzato dalla tabella che opendataloader ha promosso a heading)
+        # Frammento di frase: inizia con minuscola ed e abbastanza lungo
         first_alpha = next((c for c in content if c.isalpha()), None)
         if first_alpha and first_alpha.islower() and len(content) > 40:
             return True
+        # Formula matematica: variabile singola (o breve) seguita da = o operatore
+        if re.match(r"^[A-Za-z\u0391-\u03c9_]{1,3}\s*[=<>≤≥]", content.strip()):
+            return True
+        # Didascalia figura/tabella: "Figura N..." o "Figure N..." o "Tabella N..."
+        if re.match(r"^(Figura|Figure|Fig\.|Tabella|Table|Tab\.)\s+\d", content.strip(), re.IGNORECASE):
+            return True
         return False
 
     count = 0
@@ -1007,7 +1121,9 @@ def _t_remove_recurring_lines(text: str) -> tuple[str, int]:
     lines = text.split("\n")
     short_lines = [
         ln.strip() for ln in lines
-        if 3 < len(ln.strip()) < 80 and not ln.strip().startswith("#")
+        if 3 < len(ln.strip()) < 80
+        and not ln.strip().startswith("#")
+        and not ln.strip().startswith("|")
     ]
     freq = Counter(short_lines)
     recurring = {ln for ln, c in freq.items() if c >= 5}
@@ -1031,9 +1147,10 @@ def apply_transforms(text: str) -> tuple[str, dict]:
     """
     # Flag calcolato prima del loop: disabilita il transform 4b nei documenti
     # con sezioni "Esercizi" (i "- N. testo" sarebbero numerazioni, non header).
-    _has_ex = bool(re.search(r"\bEsercizi\b", text, re.IGNORECASE))
+    _has_ex = bool(re.search(r"\b(Esercizi|Exercises|Problems|Homework)\b", text, re.IGNORECASE))
 
     _transforms: list[tuple[str | None, object]] = [
+        ("n_simboli_pua_corretti",      _t_fix_symbol_font),
         ("n_immagini_rimosse",          _t_remove_images),
         ("n_br_rimossi",                _t_fix_br),
         ("n_tabsep_rimossi",            _t_fix_tabsep),
@@ -1064,6 +1181,7 @@ def apply_transforms(text: str) -> tuple[str, dict]:
         (None,                          _t_remove_urls),
         (None,                          _t_remove_empty_headers),
         ("n_titoli_uniti",              _t_merge_title_headers),
+        (None,                          lambda t: (re.sub(r"(?m)^(#{1,6}.+?)\s*\|\s*\d{1,3}\s*$", r"\1", t), 0)),
         ("n_garbage_headers_rimossi",   _t_remove_garbage_headers),
         ("n_frontmatter_rimossi",       _t_remove_frontmatter),
         ("n_watermark_rimossi",         _t_remove_watermarks),
@@ -1286,13 +1404,15 @@ def build_report(
         return hits
 
     residui = {
-        "backtick":        _scan(r"`"),
-        "dotleader":       _scan(r"(?:\. ){3,}"),
-        "url":             _scan(r"^(https?://|www\.)\S+"),
-        "immagini":        _scan(r"!\[[^\]]*\]\([^)]*\)"),
-        "br_inline":       _scan(r"<br>"),
-        "simboli_encoding":_scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
-        "formule_inline":  _scan(r"\[\d+\.\d+\]"),
+        "backtick":         _scan(r"`"),
+        "dotleader":        _scan(r"(?:\. ){3,}"),
+        "url":              _scan(r"^(https?://|www\.)\S+"),
+        "immagini":         _scan(r"!\[[^\]]*\]\([^)]*\)"),
+        "br_inline":        _scan(r"<br>"),
+        "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
+        "formule_inline":   _scan(r"\[\d+\.\d+\]"),
+        "footnote_markers": _scan(r'[\u00b9\u00b2\u00b3\u2070\u2074-\u2079]'),
+        "pua_markers":      _scan(r'[\ue000-\uf8ff]'),
     }
 
     # ── Composizione report ───────────────────────────────────────────────
@@ -1321,13 +1441,17 @@ def build_report(
             "br_inline":        len(residui["br_inline"]),
             "simboli_encoding": len(residui["simboli_encoding"]),
             "formule_inline":   len(residui["formule_inline"]),
-            "backtick_esempi":         residui["backtick"],
-            "dotleader_esempi":        residui["dotleader"],
-            "url_esempi":              residui["url"],
-            "immagini_esempi":         residui["immagini"],
-            "br_inline_esempi":        residui["br_inline"],
-            "simboli_encoding_esempi": residui["simboli_encoding"],
-            "formule_inline_esempi":   residui["formule_inline"],
+            "footnote_markers": len(residui["footnote_markers"]),
+            "pua_markers":      len(residui["pua_markers"]),
+            "backtick_esempi":          residui["backtick"],
+            "dotleader_esempi":         residui["dotleader"],
+            "url_esempi":               residui["url"],
+            "immagini_esempi":          residui["immagini"],
+            "br_inline_esempi":         residui["br_inline"],
+            "simboli_encoding_esempi":  residui["simboli_encoding"],
+            "formule_inline_esempi":    residui["formule_inline"],
+            "footnote_markers_esempi":  residui["footnote_markers"],
+            "pua_markers_esempi":       residui["pua_markers"],
         },
     }
 
@@ -1386,7 +1510,8 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     print("  [3/4] Pulizia strutturale...")
     clean_text, t_stats = apply_transforms(raw_text)
     reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
-    print(f"  ✅ Immagini rimosse:      {t_stats['n_immagini_rimosse']}")
+    print(f"  ✅ Simboli PUA corretti:  {t_stats['n_simboli_pua_corretti']}")
+    print(f"     Immagini rimosse:      {t_stats['n_immagini_rimosse']}")
     print(f"     Note rimossa:          {t_stats['n_note_rimosse']}")
     print(f"     Accenti corretti:      {t_stats['n_accenti_corretti']}")
     print(f"     Dot-leader rimossi:    {t_stats['n_dotleader_rimossi']}")
diff --git a/conversione/validate.py b/conversione/validate.py
index b9d71be..f2c1ead 100644
--- a/conversione/validate.py
+++ b/conversione/validate.py
@@ -86,6 +86,8 @@ def _score(r: dict) -> tuple[int, list[str]]:
     _pen("br_inline",        2, 15, "<br> inline")
     _pen("simboli_encoding", 1, 10, "simboli encoding")
     _pen("formule_inline",   1,  8, "formule inline")
+    _pen("footnote_markers", 1,  8, "footnote residui")
+    _pen("pua_markers",      2, 20, "caratteri PUA font Symbol")
 
     # ── Anomalie ──────────────────────────────────────────────────────────
     n_bare = anomalie.get("bare_headers", 0)

From 368530bc2510ae9f5efc195d576c13ffa6c4c2da Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 13:44:41 +0200
Subject: [PATCH 14/22] refactor(docs): skill prepare-md sostituisce
 step4-review, CLAUDE.md senza step-X

---
 .claude/commands/prepare-md.md   | 199 +++++++++++++++++++++++++++++++
 .claude/commands/step4-review.md | 115 ------------------
 CLAUDE.md                        |  73 +++++-------
 3 files changed, 232 insertions(+), 155 deletions(-)
 create mode 100644 .claude/commands/prepare-md.md
 delete mode 100644 .claude/commands/step4-review.md

diff --git a/.claude/commands/prepare-md.md b/.claude/commands/prepare-md.md
new file mode 100644
index 0000000..0ed1f30
--- /dev/null
+++ b/.claude/commands/prepare-md.md
@@ -0,0 +1,199 @@
+---
+description: Legge un file Markdown, individua tutti i problemi che compromettono il chunking (artefatti, sillabazione, header malformati, paragrafi spezzati, gerarchia incoerente, sezioni vuote) e applica le correzioni direttamente sul file senza chiedere conferma per i casi chiari.
+allowed-tools: Read Bash Grep Edit
+argument-hint: <path/to/clean.md oppure stem>
+---
+
+Risolvi il percorso del file da preparare:
+
+!`python3 -c "
+import sys, json, re
+from pathlib import Path
+
+arg = '$ARGUMENTS'.strip()
+root = Path('.')
+
+candidates = [
+    Path(arg),
+    root / arg,
+    root / 'conversione' / arg / 'clean.md',
+    root / 'step-4' / arg / 'clean.md',
+]
+
+md_path = None
+for p in candidates:
+    if p.exists() and p.suffix == '.md':
+        md_path = p
+        break
+
+if not md_path:
+    print('ERRORE: file non trovato per:', arg)
+    sys.exit(1)
+
+print('MD_PATH=' + str(md_path))
+
+# Cerca profilo strutturale (report.json o structure_profile.json)
+stem = md_path.parent.name
+profile_candidates = [
+    md_path.parent / 'report.json',
+    md_path.parent / 'structure_profile.json',
+    root / 'step-4' / stem / 'structure_profile.json',
+    root / 'conversione' / stem / 'report.json',
+]
+for sp in profile_candidates:
+    if sp.exists():
+        try:
+            d = json.load(open(sp))
+            st = d.get('structure', d)
+            print(f'STRATEGIA={st.get(\"strategia_chunking\",\"?\")}')
+            print(f'LINGUA={st.get(\"lingua_rilevata\",\"?\")}')
+            print(f'H1={st.get(\"n_h1\",0)} H2={st.get(\"n_h2\",0)} H3={st.get(\"n_h3\",0)}')
+            for a in st.get('avvertenze', []):
+                print(f'AVVISO: {a}')
+        except Exception:
+            pass
+        break
+
+# Statistiche file
+text = md_path.read_text(encoding='utf-8')
+lines = text.split('\n')
+pua = len(re.findall(r'[\ue000-\uf8ff]', text))
+print(f'RIGHE={len(lines)} CHARS={len(text)}')
+if pua:
+    print(f'PUA_RESIDUI={pua}')
+" 2>/dev/null`
+
+Se l'output contiene `ERRORE`, comunica il percorso non trovato e fermati.
+
+---
+
+Leggi il file completo identificato da `MD_PATH` nell'output sopra. Poi esegui **tutti** i controlli e applica le correzioni nell'ordine indicato.
+
+I parametri di riferimento per il chunking sono: **MIN_CHARS=200, MAX_CHARS=800**.
+
+---
+
+## Controllo 1 — Sillabazione residua
+
+Cerca blocchi di testo (non header) dove una riga termina con `-` e la successiva inizia con lettera minuscola: è un'interruzione di parola non risolta da PDF.
+
+Esempio da correggere:
+```
+...il meccanismo di decen-
+tralizzazione permette...
+```
+→ `...il meccanismo di decentralizzazione permette...`
+
+**Applica** ogni fusione con Edit. Se la parola ricomposta sembra errata, segnala invece di correggere.
+
+---
+
+## Controllo 2 — Artefatti di pagina
+
+Righe standalone che sono esclusivamente:
+- Un numero intero isolato (numero di pagina)
+- Titolo del libro / nome autore che si ripete identico 3+ volte nel documento
+- Intestazioni di capitolo che si ripetono (es. `## 3. Termodinamica` appare sia come header legittimo che come riga di testo duplicata)
+
+**Applica** la rimozione con Edit per le ripetizioni chiaramente decorative. Segnala i casi ambigui.
+
+---
+
+## Controllo 3 — Numeri di pagina in header
+
+Header che terminano con ` | N` o ` N` dove N è un numero isolato (residuo di indice non rimosso):
+- `### 16. Link vari | 109` → `### 16. Link vari`
+- `## Capitolo 3 42` → `## Capitolo 3`
+
+**Applica** con Edit.
+
+---
+
+## Controllo 4 — Header malformati
+
+Per ogni header (`#`, `##`, `###`):
+
+**a) ALL-CAPS non convertito:**
+`## TERMODINAMICA DEI PROCESSI` → `## Termodinamica dei processi`
+Usa sentence case (prima lettera maiuscola, resto minuscolo salvo nomi propri evidenti).
+**Applica**.
+
+**b) Livello h4/h5/h6:**
+`#### Sottosezione` → `### Sottosezione`
+**Applica**.
+
+**c) Testo troppo lungo (> 120 char):**
+Probabilmente non è un header ma testo estratto erroneamente. Rimuovi i `#` iniziali lasciando il testo come paragrafo normale.
+**Applica** se chiaramente non è un titolo. Segnala se ambiguo.
+
+**d) Header duplicati:**
+Se lo stesso header appare due volte, rimuovi la seconda occorrenza (o la prima se è quella fuori contesto).
+**Applica**.
+
+---
+
+## Controllo 5 — Paragrafi spezzati
+
+Blocchi di testo (non header, non liste) che terminano senza punteggiatura finale (`.?!»)`).
+
+Se il blocco successivo non inizia con lettera maiuscola e non è un header/lista, i due blocchi sono parte della stessa frase spezzata da un salto pagina PDF.
+
+**Applica** la fusione solo quando sei certo che il secondo blocco prosegua la frase (ad esempio inizia con una congiunzione o continua il periodo in modo inequivocabile). Segnala i casi dubbi invece di correggere.
+
+---
+
+## Controllo 6 — Sezioni quasi-vuote o vuote
+
+Sezione (header + corpo) con corpo < 100 caratteri:
+- Se il contenuto è evidentemente una sottosezione o introduzione di ciò che segue (e non ha senso da solo), rimuovi l'header e unisci il testo alla sezione precedente o successiva.
+- Se è un header di capitolo che introduce legittime sottosezioni (`##` seguito da `###`), lascia invariato.
+
+**Applica** le fusioni sicure. Segnala quelle ambigue.
+
+---
+
+## Controllo 7 — Gerarchia heading
+
+Verifica che la gerarchia sia coerente. Problemi da correggere:
+
+- Più di un `# ` (h1) nel documento → il secondo e successivi diventano `## ` salvo che siano chiaramente titoli di parti distinte
+- `### ` prima del primo `## ` → abbassa il `###` a `## ` o aggiungi un `## ` genitore appropriato
+- `## ` prima del primo `# ` in documenti con h1 → lascia invariato (alcuni documenti non hanno h1)
+
+**Applica** solo le correzioni di livello sicure. Segnala le ristrutturazioni che richiedono giudizio.
+
+---
+
+## Controllo 8 — Sezioni troppo lunghe senza struttura
+
+Sezione (## o ###) con corpo > 3000 caratteri e nessun header figlio al suo interno: il chunker la spezzerà su frasi in modo meccanico, perdendo coerenza semantica.
+
+Se il testo contiene chiari cambi di argomento (paragrafi separati da riga vuota, con transizioni come "Inoltre...", "In secondo luogo...", "Un altro aspetto..."), considera di proporre un `### ` per suddividere semanticamente.
+
+**Non aggiungere header inventati.** Segnala le sezioni candidate e proponi i titoli: applica solo su risposta affermativa.
+
+---
+
+## Report finale
+
+Dopo aver applicato tutte le correzioni automatiche, mostra:
+
+```
+File: <path>
+Correzioni applicate: N totali
+
+  Sillabazione risolta:       N
+  Artefatti pagina rimossi:   N
+  Numeri pagina in header:    N
+  Header normalizzati:        N (ALL-CAPS, livello, lunghezza, duplicati)
+  Paragrafi fusi:             N
+  Sezioni quasi-vuote risolte:N
+  Gerarchia corretta:         N
+
+Problemi aperti (richiedono giudizio manuale):
+  [riga N] <descrizione precisa>
+  ...
+```
+
+Se non ci sono problemi aperti: **"Markdown pronto per il chunking."**
+Se ci sono problemi aperti: elencali e chiedi quali applicare.
diff --git a/.claude/commands/step4-review.md b/.claude/commands/step4-review.md
deleted file mode 100644
index 61c5566..0000000
--- a/.claude/commands/step4-review.md
+++ /dev/null
@@ -1,115 +0,0 @@
----
-description: Revisione qualitativa del clean.md dopo il pre-processing automatico (step 4). Trova artefatti residui, paragrafi spezzati e header errati, poi propone le correzioni.
-allowed-tools: Read Bash Grep Edit
-argument-hint: <stem>
----
-
-Esegui la revisione qualitativa di `step-4/$ARGUMENTS/clean.md`.
-
-**Cosa è già stato fatto automaticamente (revision_log):**
-!`grep -A 12 "^## $ARGUMENTS" step-4/revision_log.md 2>/dev/null || echo "(nessun log trovato per questo stem)"`
-
-**Profilo strutturale attuale:**
-!`python3 -c "
-import json, sys
-try:
-    d = json.load(open('step-4/$ARGUMENTS/structure_profile.json'))
-    print(f'Livello: {d[\"livello_struttura\"]}  Strategia: {d[\"strategia_chunking\"]}')
-    print(f'h1={d[\"n_h1\"]}  h2={d[\"n_h2\"]}  h3={d[\"n_h3\"]}  paragrafi={d[\"n_paragrafi\"]}')
-    print(f'Lunghezza media sezione: {d[\"lunghezza_media_sezione\"]} char')
-    for a in d.get('avvertenze', []): print(f'  ⚠️  {a}')
-except Exception as e: print(f'ERRORE: {e}')
-" 2>/dev/null`
-
----
-
-Analizza `step-4/$ARGUMENTS/clean.md` eseguendo i grep seguenti e ragionando sui risultati. Per ogni check: esegui il grep, conta i risultati, riporta i casi concreti (max 5 esempi con numero di riga).
-
-## Check 1 — Sillabazione residua
-
-Righe che terminano con trattino seguito da testo nella riga successiva (artefatto PDF non risolto):
-
-```bash
-grep -n "\-$" step-4/$ARGUMENTS/clean.md | head -20
-```
-
-Segnala se presenti: numero di riga, testo della riga e della riga successiva.
-
-## Check 2 — Righe orfane (artefatti PDF)
-
-Righe standalone (non header `#`, non vuote) di meno di 60 caratteri che sembrano artefatti:
-
-```bash
-grep -n "^[^#\-\*\|].\{1,59\}$" step-4/$ARGUMENTS/clean.md | grep -v "^\s*$" | head -30
-```
-
-Valuta ogni riga: è testo normale breve (legittimo) o artefatto (numero di pagina, nome autore isolato, riga di intestazione ripetuta)?
-
-## Check 3 — Paragrafi con frase spezzata
-
-Blocchi di testo che terminano senza punteggiatura di fine frase (`.?!»)`):
-
-```bash
-grep -n "[^.!?»)\]\'\"]$" step-4/$ARGUMENTS/clean.md | grep -v "^[0-9]*:#" | grep -v "^[0-9]*:\s*$" | grep -v "^\s*[-\*]" | head -20
-```
-
-Riporta i casi più sospetti (righe brevi che finiscono a metà concetto).
-
-## Check 4 — Header sospetti
-
-```bash
-grep -n "^##\? " step-4/$ARGUMENTS/clean.md | head -40
-```
-
-Verifica:
-- `##` o `###` con contenuto interamente MAIUSCOLO non convertito → segnala
-- Header duplicati (stesso testo che appare due volte) → segnala
-- `##` con testo > 80 caratteri (probabile testo che non è un header) → segnala
-- Salti di livello anomali (es. `###` senza un `##` padre) → segnala
-
-## Check 5 — Sezioni quasi vuote
-
-```bash
-python3 -c "
-import re, sys
-text = open('step-4/$ARGUMENTS/clean.md').read()
-sections = re.split(r'^(#{1,3} .+)$', text, flags=re.MULTILINE)
-for i in range(1, len(sections)-1, 2):
-    header = sections[i].strip()
-    body = sections[i+1].strip() if i+1 < len(sections) else ''
-    if len(body) < 80 and body:
-        print(f'{header!r} → {len(body)} char: {body[:60]!r}')
-    elif not body:
-        print(f'{header!r} → VUOTA')
-" 2>/dev/null | head -20
-```
-
-Sezioni con body < 80 char o vuote compromettono il chunking. Segnala quelle che non hanno senso come sezione autonoma.
-
-## Check 6 — Gerarchia strutturale
-
-```bash
-grep -n "^#\{1,3\} " step-4/$ARGUMENTS/clean.md | head -50
-```
-
-Verifica che la gerarchia sia coerente: `# → ## → ###`. Segnala se ci sono `###` prima del primo `##`, o `##` prima del primo `#`, o `#` multipli (più di un h1).
-
----
-
-## Report finale
-
-```
-🔴 BLOCCANTI (compromettono il chunking o il retrieval)
-  [riga N] descrizione precisa del problema
-  ...
-
-🟡 MINORI (artefatti visibili, non bloccanti)
-  [riga N] descrizione
-  ...
-
-🟢 OK — nessun problema rilevato in questa categoria
-```
-
-Poi chiedi: **"Applico le correzioni per i 🔴? E per i 🟡?"**
-
-Applica solo ciò che viene esplicitamente approvato. Usa Edit per ogni modifica — mai riscrivere l'intero file.
diff --git a/CLAUDE.md b/CLAUDE.md
index 4b25071..698d0e4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,83 +4,76 @@
 
 - **Lingua:** Rispondi sempre in italiano.
 - **Venv obbligatorio:** Usa `.venv/bin/python` o attiva con `source .venv/bin/activate`. Mai `pip`/`python` di sistema.
-- **Non modificare `raw.md`:** `step-2/<stem>/raw.md` è immutabile. La copia di lavoro è `step-4/<stem>/clean.md`.
+- **Non modificare `raw.md`:** Il file `raw.md` di ogni stem è immutabile. La copia di lavoro è sempre `clean.md`.
 
 ---
 
-## Pipeline (ordine obbligatorio)
+## Pipeline (operazioni in ordine)
 
 ```
-PDF (sources/) → step-0 → step-1 → step-2 → step-3
-              → step-4 (CRITICO: revisione manuale clean.md)
-              → step-5 → step-6 → step-7 (Ollama) → step-8 → step-9
+PDF (sources/)
+  → conversione    (PDF → clean.md + structure_profile.json)
+  → chunking       (clean.md → chunks.json)
+  → verifica       (chunks.json → report + fix automatici)
+  → vettorizzazione (chunks.json → ChromaDB)
+  → retrieval      (query → risposta via Ollama)
 ```
 
 Il parametro `--stem` identifica il documento (nome PDF senza `.pdf`). Lo stem è anche il nome della collection ChromaDB.
 
-Comandi tipici:
-```bash
-source .venv/bin/activate
-python step-4/revise.py --stem <stem>
-python step-5/chunker.py --stem <stem>
-python step-6/verify_chunks.py --stem <stem>
-python step-8/ingest.py --stem <stem>
-python step-9/rag.py --stem <stem>
-```
-
 ---
 
 ## File critici
 
 | File | Ruolo |
 |---|---|
-| `step-9/config.py` | Fonte di verità: `EMBED_MODEL`, `OLLAMA_MODEL`, `TOP_K`, `TEMPERATURE`, `SYSTEM_PROMPT` |
-| `step-5/chunker.py` | Chunking adattivo — `MIN_CHARS=200`, `MAX_CHARS=800`, `OVERLAP_S=2` |
-| `step-6/verify_chunks.py` | Verifica chunk — stesse soglie di `chunker.py` |
-| `step-6/fix_chunks.py` | Fix automatici su chunk anomali |
-| `step-4/revise.py` | Pre-processing MD automatico (8 trasformazioni euristiche) |
-| `step-8/ingest.py` | Vettorizzazione ChromaDB — legge `EMBED_MODEL` da `config.py` |
-| `step-9/rag.py` | Pipeline RAG interattiva |
+| `config.py` | Fonte di verità: `EMBED_MODEL`, `OLLAMA_MODEL`, `TOP_K`, `TEMPERATURE`, `SYSTEM_PROMPT` |
+| `chunker.py` | Chunking adattivo — `MIN_CHARS=200`, `MAX_CHARS=800`, `OVERLAP_S=2` |
+| `verify_chunks.py` | Verifica chunk — stesse soglie di `chunker.py` |
+| `fix_chunks.py` | Fix automatici su chunk anomali |
+| `ingest.py` | Vettorizzazione ChromaDB — legge `EMBED_MODEL` da `config.py` |
+| `rag.py` | Pipeline RAG interattiva |
+| `conversione/pipeline.py` | Conversione PDF → clean Markdown strutturato |
 
 ---
 
 ## Regole di assistenza
 
-**Modifica `EMBED_MODEL` in `step-9/config.py`:**
+**Modifica `EMBED_MODEL` in `config.py`:**
 Avvisa sempre che serve rieseguire la vettorizzazione:
 ```bash
-python step-8/ingest.py --stem <stem> --force
+python ingest.py --stem <stem> --force
 ```
 `ingest.py` importa `EMBED_MODEL` direttamente da `config.py` — la coerenza è critica: se violata non produce errori ma restituisce risultati insensati.
 
 **Modifica soglie chunking (`MIN_CHARS`, `MAX_CHARS`, `OVERLAP_S`):**
-I valori compaiono in tre file che vanno sincronizzati manualmente:
-1. `step-5/chunker.py`
-2. `step-6/verify_chunks.py`
-3. `step-6/fix_chunks.py`
+I valori compaiono in più file che vanno sincronizzati manualmente:
+- `chunker.py`
+- `verify_chunks.py`
+- `fix_chunks.py`
 
-**Step 4 — revisione clean.md:**
-`revise.py` applica trasformazioni automatiche, ma il risultato va sempre revisionato a mano. La qualità del RAG dipende da `clean.md` più di qualsiasi parametro tecnico. Suggerisci sempre `/step4-review <stem>` dopo `revise.py`.
+**Conversione PDF → Markdown:**
+`conversione/pipeline.py` produce `raw.md` e `clean.md`. Il `clean.md` va sempre revisionato dopo la conversione automatica — la qualità del RAG dipende da esso più di qualsiasi parametro tecnico. Suggerisci sempre `/prepare-md conversione/<stem>/clean.md` dopo la conversione.
 
-**Step 6 — verifica chunk:**
-Dopo `verify_chunks.py`, usa `/step6-fix <stem>` prima di passare a step-8.
+**Verifica chunk:**
+Dopo `verify_chunks.py`, usa `/step6-fix <stem>` prima di procedere con la vettorizzazione.
 
 ---
 
 ## Skills custom
 
-- `/step4-review <stem>` — Revisione qualitativa `clean.md`: artefatti, paragrafi spezzati, header errati.
+- `/prepare-md <path>` — Revisione e correzione automatica di qualsiasi `clean.md`: sillabazione, artefatti, header malformati, paragrafi spezzati, gerarchia, sezioni vuote. Accetta path completo (`conversione/bitcoin/clean.md`) o stem (`bitcoin`).
 - `/step6-fix <stem>` — Dry-run e applicazione fix chunk tramite `fix_chunks.py`.
 
 ---
 
-## Struttura directory per stem
+## Output per stem
 
 ```
-step-2/<stem>/raw.md                ← immutabile
-step-4/<stem>/clean.md              ← copia di lavoro
-step-4/<stem>/structure_profile.json
-step-5/<stem>/chunks.json
-step-6/<stem>/report.json
-chroma_db/<stem>/                   ← collection ChromaDB
+conversione/<stem>/raw.md              ← immutabile
+conversione/<stem>/clean.md            ← copia di lavoro
+conversione/<stem>/structure_profile.json
+<stem>/chunks.json
+<stem>/report.json
+chroma_db/<stem>/                      ← collection ChromaDB
 ```

From 82f205faa2cacefe34371cafeec35cf4a053c29f Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 16:04:59 +0200
Subject: [PATCH 15/22] chore: rimuovi cartelle step-0..step-4 ora obsolete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

La logica è consolidata in conversione/pipeline.py.
---
 step-0/check_pdf.py        | 229 --------------------
 step-1/inspect_pdf.py      | 199 -----------------
 step-2/convert_pdf.py      |  80 -------
 step-3/detect_structure.py | 223 -------------------
 step-4/revise.py           | 433 -------------------------------------
 5 files changed, 1164 deletions(-)
 delete mode 100644 step-0/check_pdf.py
 delete mode 100644 step-1/inspect_pdf.py
 delete mode 100644 step-2/convert_pdf.py
 delete mode 100644 step-3/detect_structure.py
 delete mode 100644 step-4/revise.py

diff --git a/step-0/check_pdf.py b/step-0/check_pdf.py
deleted file mode 100644
index 2bee9b8..0000000
--- a/step-0/check_pdf.py
+++ /dev/null
@@ -1,229 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 0 — Verifica idoneità PDF
-
-Legge tutti i PDF in sources/ e salva un report per ognuno in step-0/.
-
-Uso:
-    python step-0/check_pdf.py
-
-Output:
-    step-0/<nome_pdf>_step0_report.txt
-"""
-
-import sys
-import statistics
-from datetime import datetime
-from pathlib import Path
-
-
-def check_pdf(pdf_path: str, save: bool = True) -> None:
-    try:
-        import pdfplumber
-    except ImportError:
-        print("Errore: pdfplumber non è installato.")
-        print("       pip install pdfplumber")
-        sys.exit(1)
-
-    path = Path(pdf_path)
-    if not path.exists():
-        print(f"Errore: file non trovato — {pdf_path}")
-        sys.exit(1)
-    if path.suffix.lower() != ".pdf":
-        print(f"Errore: il file non è un PDF — {pdf_path}")
-        sys.exit(1)
-
-    lines = []  # righe del report
-    results = []  # (etichetta, stato, messaggio)
-
-    def out(text=""):
-        lines.append(text)
-        print(text)
-
-    out(f"Step 0 — Verifica idoneità PDF")
-    out(f"File:    {path.name}")
-    out(f"Data:    {datetime.now().strftime('%Y-%m-%d %H:%M')}")
-    out("=" * 50)
-
-    # ------------------------------------------------------------------ #
-    # Criterio 1 — Non protetto da password
-    # ------------------------------------------------------------------ #
-    try:
-        with pdfplumber.open(path) as pdf:
-            n_pages = len(pdf.pages)
-        results.append(("Non protetto da password", "PASS", f"{n_pages} pagine"))
-    except Exception as e:
-        msg = str(e).lower()
-        if "password" in msg or "encrypted" in msg or "decrypt" in msg:
-            results.append(("Non protetto da password", "FAIL",
-                             "Il PDF è cifrato — non può essere elaborato"))
-        else:
-            results.append(("Non protetto da password", "FAIL",
-                             f"Impossibile aprire il file: {e}"))
-        _render_results(results, out)
-        _maybe_save(lines, path, save)
-        return
-
-    # ------------------------------------------------------------------ #
-    # Lettura pagine — una sola passata
-    # ------------------------------------------------------------------ #
-    char_counts = []
-    line_lengths = []
-    all_text = ""
-    empty_pages = 0
-
-    with pdfplumber.open(path) as pdf:
-        for page in pdf.pages:
-            text = page.extract_text() or ""
-            all_text += text + "\n"
-            chars = len(text.strip())
-            char_counts.append(chars)
-            if chars == 0:
-                empty_pages += 1
-            for line in text.splitlines():
-                stripped = line.strip()
-                if stripped:
-                    line_lengths.append(len(stripped))
-
-    total_pages = len(char_counts)
-    pages_with_text = sum(1 for c in char_counts if c > 50)
-    text_coverage = pages_with_text / total_pages if total_pages > 0 else 0
-
-    # ------------------------------------------------------------------ #
-    # Criterio 2 — Testo estraibile
-    # ------------------------------------------------------------------ #
-    if text_coverage >= 0.7:
-        results.append(("Testo estraibile", "PASS",
-                         f"{pages_with_text}/{total_pages} pagine con testo ({text_coverage:.0%})"))
-    elif text_coverage >= 0.4:
-        results.append(("Testo estraibile", "WARN",
-                         f"Solo {pages_with_text}/{total_pages} pagine con testo — revisione estesa necessaria"))
-    else:
-        results.append(("Testo estraibile", "FAIL",
-                         f"Solo {pages_with_text}/{total_pages} pagine con testo — probabilmente scansionato"))
-
-    # ------------------------------------------------------------------ #
-    # Criterio 3 — Generato digitalmente (non scansionato)
-    # ------------------------------------------------------------------ #
-    pages_text_only = [c for c in char_counts if c > 0]
-    avg_chars = statistics.mean(pages_text_only) if pages_text_only else 0
-
-    if avg_chars >= 300:
-        results.append(("Generato digitalmente (non scansionato)", "PASS",
-                         f"Media {avg_chars:.0f} char/pagina"))
-    elif avg_chars >= 100:
-        results.append(("Generato digitalmente (non scansionato)", "WARN",
-                         f"Media bassa: {avg_chars:.0f} char/pagina — alcune pagine potrebbero essere immagini"))
-    else:
-        results.append(("Generato digitalmente (non scansionato)", "FAIL",
-                         f"Media molto bassa: {avg_chars:.0f} char/pagina — il PDF sembra scansionato"))
-
-    # ------------------------------------------------------------------ #
-    # Criterio 4 — Pagine vuote
-    # ------------------------------------------------------------------ #
-    if empty_pages == 0:
-        results.append(("Pagine vuote", "PASS", "Nessuna pagina vuota"))
-    elif empty_pages <= total_pages * 0.05:
-        results.append(("Pagine vuote", "WARN",
-                         f"{empty_pages} pagine vuote (≤ 5%) — probabilmente copertine o separatori"))
-    else:
-        results.append(("Pagine vuote", "WARN",
-                         f"{empty_pages} pagine vuote ({empty_pages/total_pages:.0%}) — controllare"))
-
-    # ------------------------------------------------------------------ #
-    # Criterio desiderabile — Layout a colonne singola
-    # ------------------------------------------------------------------ #
-    if line_lengths:
-        median_len = statistics.median(line_lengths)
-        short_lines = sum(1 for l in line_lengths if l < median_len * 0.4)
-        short_ratio = short_lines / len(line_lengths)
-        if short_ratio < 0.15:
-            results.append(("Layout a colonne singola (desiderabile)", "PASS",
-                             f"Righe corte: {short_ratio:.0%} — struttura lineare"))
-        elif short_ratio < 0.35:
-            results.append(("Layout a colonne singola (desiderabile)", "WARN",
-                             f"Righe corte: {short_ratio:.0%} — possibile layout a colonne parziale"))
-        else:
-            results.append(("Layout a colonne singola (desiderabile)", "WARN",
-                             f"Righe corte: {short_ratio:.0%} — probabile layout a colonne multiple"))
-    else:
-        results.append(("Layout a colonne singola (desiderabile)", "WARN",
-                         "Impossibile analizzare (nessuna riga estratta)"))
-
-    # ------------------------------------------------------------------ #
-    # Criterio desiderabile — Struttura logica (titoli)
-    # ------------------------------------------------------------------ #
-    candidate_headings = [
-        line.strip() for line in all_text.splitlines()
-        if 3 <= len(line.strip()) <= 80
-        and line.strip()[0].isupper()
-        and not line.strip().endswith(".")
-        and not line.strip().endswith(",")
-        and len(line.strip().split()) <= 10
-    ]
-    heading_density = len(candidate_headings) / total_pages if total_pages > 0 else 0
-
-    if heading_density >= 1.0:
-        results.append(("Struttura logica riconoscibile (desiderabile)", "PASS",
-                         f"~{len(candidate_headings)} possibili titoli rilevati ({heading_density:.1f}/pagina)"))
-    elif heading_density >= 0.3:
-        results.append(("Struttura logica riconoscibile (desiderabile)", "WARN",
-                         f"~{len(candidate_headings)} possibili titoli ({heading_density:.1f}/pagina) — struttura parziale"))
-    else:
-        results.append(("Struttura logica riconoscibile (desiderabile)", "WARN",
-                         "Pochi titoli rilevati — testo narrativo o struttura non standard"))
-
-    _render_results(results, out)
-    _maybe_save(lines, path, save)
-
-
-def _render_results(results: list, out) -> None:
-    icons = {"PASS": "✅", "WARN": "⚠️ ", "FAIL": "❌"}
-    out()
-    for label, status, message in results:
-        icon = icons.get(status, "  ")
-        out(f"  {icon} {label}")
-        out(f"       {message}")
-    out()
-
-    fails = [r for r in results if r[1] == "FAIL"]
-    warns = [r for r in results if r[1] == "WARN"]
-
-    if fails:
-        out("ESITO: ❌ PDF NON IDONEO")
-        out("       Criteri obbligatori non soddisfatti — scegli un PDF diverso.")
-    elif warns:
-        out("ESITO: ⚠️  PDF ACCETTABILE CON CAUTELA")
-        out("       Procedi, ma aspettati più lavoro nella revisione manuale (step 4).")
-    else:
-        out("ESITO: ✅ PDF IDONEO")
-        out("       Tutti i criteri soddisfatti — procedi con lo step 1.")
-    out()
-
-
-def _maybe_save(lines: list, pdf_path: Path, save: bool) -> None:
-    if not save:
-        return
-    script_dir = Path(__file__).parent
-    out_file = script_dir / f"{pdf_path.stem}_step0_report.txt"
-    out_file.write_text("\n".join(lines), encoding="utf-8")
-    print(f"Report salvato in: {out_file}")
-
-
-if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-    sources_dir = project_root / "sources"
-
-    if not sources_dir.exists():
-        print(f"Errore: cartella sources/ non trovata in {project_root}")
-        sys.exit(1)
-
-    pdfs = sorted(sources_dir.glob("*.pdf"))
-    if not pdfs:
-        print(f"Errore: nessun PDF trovato in {sources_dir}")
-        sys.exit(1)
-
-    for pdf in pdfs:
-        check_pdf(str(pdf), save=True)
-        if len(pdfs) > 1:
-            print("-" * 50)
diff --git a/step-1/inspect_pdf.py b/step-1/inspect_pdf.py
deleted file mode 100644
index 0c2bfdd..0000000
--- a/step-1/inspect_pdf.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 1 — Ispezione automatica PDF
-
-Analizza il PDF pagina per pagina e produce un report con score (0–100)
-e lista dei problemi per pagina. Serve per capire la qualità del documento
-e mappare i problemi prima della revisione manuale (step 4).
-
-Uso:
-    python step1/inspect.py
-
-Output:
-    step1/<nome_pdf>_step1_report.txt
-"""
-
-import re
-import sys
-import statistics
-from collections import Counter
-from datetime import datetime
-from pathlib import Path
-
-
-# ── Penalità per il calcolo dello score ───────────────────────────────────
-SYLLABIF_PENALTY  = 0.3   # per occorrenza di sillabazione
-COLUMN_PENALTY    = 3.0   # per pagina con layout a colonne
-UNICODE_PENALTY   = 1.5   # per pagina con caratteri anomali
-EMPTY_PENALTY     = 1.0   # per pagina vuota
-HEADER_FOOTER_PEN = 5.0   # fisso se intestazioni/piè ripetitivi rilevati
-
-
-def inspect_pdf(pdf_path: str, save: bool = True) -> None:
-    try:
-        import pdfplumber
-    except ImportError:
-        print("Errore: pdfplumber non è installato.")
-        print("       pip install pdfplumber")
-        sys.exit(1)
-
-    path = Path(pdf_path)
-    if not path.exists():
-        print(f"Errore: file non trovato — {pdf_path}")
-        sys.exit(1)
-
-    lines = []
-
-    def out(text=""):
-        lines.append(text)
-        print(text)
-
-    out("Step 1 — Ispezione automatica PDF")
-    out(f"File:    {path.name}")
-    out(f"Data:    {datetime.now().strftime('%Y-%m-%d %H:%M')}")
-    out("=" * 50)
-
-    # ── Lettura pagine ─────────────────────────────────────────────────────
-    with pdfplumber.open(path) as pdf:
-        n_pages = len(pdf.pages)
-        pages_text = [page.extract_text() or "" for page in pdf.pages]
-
-    # ── Analisi per pagina ─────────────────────────────────────────────────
-    issues = []       # (page_num, descrizione)  — page_num=0 → problema globale
-    deductions = 0.0
-
-    first_lines = []  # prima riga significativa di ogni pagina (per header)
-    last_lines  = []  # ultima riga significativa di ogni pagina (per footer)
-
-    for i, text in enumerate(pages_text):
-        page_num = i + 1
-        stripped = text.strip()
-
-        # 1. Pagina vuota
-        if len(stripped) < 50:
-            issues.append((page_num, "pagina vuota"))
-            deductions += EMPTY_PENALTY
-            continue
-
-        page_lines = text.splitlines()
-        nonempty   = [l.strip() for l in page_lines if l.strip()]
-
-        # Raccogli prima/ultima riga per il controllo header/footer
-        if nonempty:
-            first_lines.append(nonempty[0])
-            last_lines.append(nonempty[-1])
-
-        # 2. Sillabazione a fine riga  (es. "estra-" + a capo)
-        syllabif = sum(
-            1 for line in page_lines
-            if re.search(r'\b\w{2,}-$', line.rstrip())
-        )
-        if syllabif:
-            label = "occorrenza" if syllabif == 1 else "occorrenze"
-            issues.append((page_num, f"sillabazione rilevata ({syllabif} {label})"))
-            deductions += syllabif * SYLLABIF_PENALTY
-
-        # 3. Layout a colonne  (righe molto corte e numerose)
-        if len(nonempty) >= 10:
-            median_len  = statistics.median(len(l) for l in nonempty)
-            short_ratio = sum(1 for l in nonempty if len(l) < median_len * 0.4) / len(nonempty)
-            if short_ratio > 0.35:
-                issues.append((page_num, f"possibile layout a colonne ({short_ratio:.0%} righe corte)"))
-                deductions += COLUMN_PENALTY
-
-        # 4. Caratteri Unicode anomali
-        #    (control chars esclusi \n \t \r, replacement char, PUA block)
-        anomalies = re.findall(
-            r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f\ufffd\ue000-\uf8ff]', text
-        )
-        if anomalies:
-            issues.append((page_num, f"caratteri Unicode anomali ({len(anomalies)} trovati)"))
-            deductions += UNICODE_PENALTY
-
-    # ── Intestazioni e piè di pagina ripetitivi ────────────────────────────
-    def _check_repetition(line_list: list, label: str) -> None:
-        nonlocal deductions
-        if not line_list:
-            return
-        threshold = max(3, len(line_list) * 0.25)
-        repeated  = [
-            (txt, cnt) for txt, cnt in Counter(line_list).items()
-            if cnt >= threshold and len(txt) > 3
-        ]
-        if repeated:
-            deductions += HEADER_FOOTER_PEN
-            for txt, cnt in repeated[:3]:
-                issues.append((0, f"{label} ripetitivo: \"{txt[:45]}\" ({cnt} volte)"))
-
-    _check_repetition(first_lines, "intestazione")
-    _check_repetition(last_lines,  "piè di pagina")
-
-    # ── Score ──────────────────────────────────────────────────────────────
-    score = max(0, round(100 - deductions))
-
-    # ── Riepilogo ──────────────────────────────────────────────────────────
-    pages_with_issues = len({p for p, _ in issues if p > 0})
-    out()
-    out(f"Score: {score}/100")
-    out(f"Pagine totali:        {n_pages}")
-    out(f"Pagine con problemi:  {pages_with_issues}")
-    out()
-
-    if issues:
-        global_issues = [(p, d) for p, d in issues if p == 0]
-        page_issues   = sorted([(p, d) for p, d in issues if p > 0])
-        for _, desc in global_issues:
-            out(f"  ⚠️  {desc}")
-        for page_num, desc in page_issues:
-            out(f"  Pagina {page_num:>4}: {desc}")
-    else:
-        out("  Nessun problema rilevato.")
-
-    out()
-
-    # ── Prossimi passi ─────────────────────────────────────────────────────
-    out("PROSSIMI PASSI:")
-    if score >= 70:
-        out("  → conversione con marker funzionerà bene")
-    elif score >= 40:
-        out("  → conversione possibile, attendi più errori nella revisione")
-    else:
-        out("  → qualità bassa — valuta una fonte PDF migliore")
-
-    attention_pages = sorted({p for p, _ in issues if p > 0})
-    if attention_pages:
-        sample = ", ".join(str(p) for p in attention_pages[:10])
-        if len(attention_pages) > 10:
-            sample += f" … e altre {len(attention_pages) - 10}"
-        out(f"  → attenzione alle pagine {sample} nella revisione manuale")
-    out()
-
-    _maybe_save(lines, path, save)
-
-
-def _maybe_save(lines: list, pdf_path: Path, save: bool) -> None:
-    if not save:
-        return
-    script_dir = Path(__file__).parent
-    out_file   = script_dir / f"{pdf_path.stem}_step1_report.txt"
-    out_file.write_text("\n".join(lines), encoding="utf-8")
-    print(f"Report salvato in: {out_file}")
-
-
-if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-    sources_dir  = project_root / "sources"
-
-    if not sources_dir.exists():
-        print(f"Errore: cartella sources/ non trovata in {project_root}")
-        sys.exit(1)
-
-    pdfs = sorted(sources_dir.glob("*.pdf"))
-    if not pdfs:
-        print(f"Errore: nessun PDF trovato in {sources_dir}")
-        sys.exit(1)
-
-    for pdf in pdfs:
-        inspect_pdf(str(pdf), save=True)
-        if len(pdfs) > 1:
-            print("-" * 50)
diff --git a/step-2/convert_pdf.py b/step-2/convert_pdf.py
deleted file mode 100644
index efc6376..0000000
--- a/step-2/convert_pdf.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 2 — Conversione PDF → Markdown grezzo
-
-Usa pymupdf4llm (PyMuPDF puro C, zero modelli ML, ~30-50 MB RAM)
-per convertire ogni PDF in sources/ e organizza l'output in:
-  step-2/<stem>/raw.md    — MD grezzo, non modificare mai
-  step-2/<stem>/clean.md  — copia di lavoro per lo step 4
-
-Uso:
-    python step-2/convert_pdf.py                        # tutti i PDF in sources/
-    python step-2/convert_pdf.py --pdf sources/doc.pdf  # un solo PDF
-"""
-
-import argparse
-import shutil
-import sys
-from pathlib import Path
-
-import pymupdf4llm
-
-
-def convert_pdf(pdf_path: Path, project_root: Path) -> bool:
-    stem = pdf_path.stem
-    out_dir = project_root / "step-2" / stem
-    raw_md = out_dir / "raw.md"
-    clean_md = out_dir / "clean.md"
-
-    print(f"\nConversione: {pdf_path.name}")
-    print(f"  Output:    step-2/{stem}/")
-
-    if raw_md.exists():
-        print(f"  ⚠️  raw.md già presente — skip")
-        print(f"       (elimina {raw_md} per riconvertire)")
-        return True
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    print(f"  Conversione in corso...")
-    md_text = pymupdf4llm.to_markdown(str(pdf_path))
-
-    raw_md.write_text(md_text, encoding="utf-8")
-    shutil.copy2(raw_md, clean_md)
-
-    size_kb = raw_md.stat().st_size // 1024
-    print(f"  ✅ raw.md salvato ({size_kb} KB)")
-    print(f"  ✅ clean.md creato (copia di lavoro per step 4)")
-    return True
-
-
-if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-
-    parser = argparse.ArgumentParser(description="Step 2 — Conversione PDF → Markdown")
-    parser.add_argument("--pdf", help="Percorso di un singolo PDF da convertire")
-    args = parser.parse_args()
-
-    if args.pdf:
-        pdf_path = Path(args.pdf)
-        if not pdf_path.exists():
-            print(f"Errore: file non trovato — {args.pdf}")
-            sys.exit(1)
-        pdfs = [pdf_path]
-    else:
-        sources_dir = project_root / "sources"
-        if not sources_dir.exists():
-            print(f"Errore: cartella sources/ non trovata in {project_root}")
-            sys.exit(1)
-        pdfs = sorted(sources_dir.glob("*.pdf"))
-        if not pdfs:
-            print(f"Errore: nessun PDF trovato in {sources_dir}")
-            sys.exit(1)
-
-    results = [convert_pdf(p, project_root) for p in pdfs]
-
-    ok_count = sum(results)
-    total = len(results)
-    print(f"\n{'✅' if all(results) else '⚠️ '} {ok_count}/{total} PDF convertiti")
-
-    sys.exit(0 if all(results) else 1)
diff --git a/step-3/detect_structure.py b/step-3/detect_structure.py
deleted file mode 100644
index e3a426b..0000000
--- a/step-3/detect_structure.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 3 — Rilevamento struttura Markdown
-
-Analizza il Markdown grezzo prodotto dallo step 2 senza modificarlo.
-Copia i file da step-2/<stem>/ e produce structure_profile.json che
-guida la revisione manuale (step 4) e il chunker adattivo (step 5).
-
-Output in step-3/<stem>/:
-  raw.md                  — copia da step-2 (non modificare mai)
-  clean.md                — copia da step-2 (da revisionare nello step 4)
-  structure_profile.json  — profilo strutturale
-
-Uso:
-    python step-3/detect_structure.py                    # tutti i documenti in step-2/
-    python step-3/detect_structure.py --stem nietzsche   # un solo documento
-    python step-3/detect_structure.py --force            # riesegui anche se già presente
-"""
-
-import argparse
-import json
-import re
-import shutil
-import sys
-from pathlib import Path
-
-
-# ─── Language detection ───────────────────────────────────────────────────────
-
-_IT_WORDS = frozenset([
-    "il", "la", "di", "e", "che", "non", "per", "un", "una", "si",
-    "con", "da", "del", "della", "dei", "in", "ma", "se", "lo", "le",
-    "gli", "al", "alla", "ai", "alle", "sono", "ha", "hanno", "era",
-    "erano", "nel", "nella", "nei", "nelle", "questo", "questa", "così",
-])
-
-_EN_WORDS = frozenset([
-    "the", "of", "and", "to", "in", "is", "that", "it", "was", "for",
-    "on", "are", "as", "with", "his", "they", "at", "be", "this", "have",
-    "from", "or", "an", "but", "not", "by", "he", "she", "we", "you",
-    "which", "their", "been", "has", "would", "there", "when", "will",
-])
-
-
-def detect_language(text: str) -> str:
-    words = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())
-    sample = words[:2000]
-    it = sum(1 for w in sample if w in _IT_WORDS)
-    en = sum(1 for w in sample if w in _EN_WORDS)
-    if it == 0 and en == 0:
-        return "unknown"
-    return "it" if it >= en else "en"
-
-
-# ─── Markdown parsing ─────────────────────────────────────────────────────────
-
-def split_sections(text: str, header_level: int) -> list[str]:
-    """
-    Split text on headers of the given level (1=h1, 2=h2, 3=h3).
-    Returns list of body texts for each matching section.
-    """
-    prefix = "#" * header_level + " "
-    parts = re.split(rf'(?m)^{re.escape(prefix)}.+', text)
-    # parts[0] is preamble, rest are section bodies
-    return [p for p in parts[1:] if p.strip()]
-
-
-def count_headers(text: str, level: int) -> int:
-    prefix = "#" * level + " "
-    return len(re.findall(rf'(?m)^{re.escape(prefix)}', text))
-
-
-def count_paragraphs(text: str) -> int:
-    """Count non-empty, non-header paragraph blocks."""
-    blocks = re.split(r'\n{2,}', text)
-    return sum(1 for b in blocks if b.strip() and not re.match(r'^#+\s', b.strip()))
-
-
-# ─── Core analysis ────────────────────────────────────────────────────────────
-
-def analyze(raw_md_path: Path) -> dict:
-    text = raw_md_path.read_text(encoding="utf-8")
-
-    n_h1 = count_headers(text, 1)
-    n_h2 = count_headers(text, 2)
-    n_h3 = count_headers(text, 3)
-    n_paragrafi = count_paragraphs(text)
-
-    # Determine structural level and primary boundary
-    if n_h3 >= 5:
-        livello = 3
-        boundary = "h3"
-        strategia = "h3_aware"
-        section_bodies = split_sections(text, 3)
-    elif n_h2 >= 3:
-        livello = 2
-        boundary = "h2"
-        strategia = "h2_paragraph_split"
-        section_bodies = split_sections(text, 2)
-    elif n_h1 + n_h2 + n_h3 >= 1:
-        livello = 1
-        boundary = "paragrafo"
-        strategia = "paragraph"
-        section_bodies = [b for b in re.split(r'\n{2,}', text) if b.strip()]
-    else:
-        if n_paragrafi >= 3:
-            livello = 1
-            boundary = "paragrafo"
-            strategia = "paragraph"
-            section_bodies = [b for b in re.split(r'\n{2,}', text) if b.strip()]
-        else:
-            livello = 0
-            boundary = "nessuno"
-            strategia = "sliding_window"
-            section_bodies = [text] if text.strip() else []
-
-    lengths = [len(b) for b in section_bodies if b.strip()]
-    lunghezza_media = int(sum(lengths) / len(lengths)) if lengths else 0
-
-    lingua = detect_language(text)
-
-    avvertenze = []
-    short = sum(1 for l in lengths if l < 200)
-    long_ = sum(1 for l in lengths if l > 800)
-    if short:
-        avvertenze.append(f"{short} sezioni sotto i 200 caratteri — verranno accorpate")
-    if long_:
-        avvertenze.append(f"{long_} sezioni sopra i 800 caratteri — verranno divise")
-
-    return {
-        "livello_struttura": livello,
-        "n_h1": n_h1,
-        "n_h2": n_h2,
-        "n_h3": n_h3,
-        "n_paragrafi": n_paragrafi,
-        "boundary_primario": boundary,
-        "lingua_rilevata": lingua,
-        "lunghezza_media_sezione": lunghezza_media,
-        "strategia_chunking": strategia,
-        "avvertenze": avvertenze,
-    }
-
-
-# ─── Per-document processing ─────────────────────────────────────────────────
-
-def process_stem(stem: str, project_root: Path, force: bool) -> bool:
-    src_dir = project_root / "step-2" / stem
-    out_dir = project_root / "step-3" / stem
-    raw_src = src_dir / "raw.md"
-    clean_src = src_dir / "clean.md"
-    profile_out = out_dir / "structure_profile.json"
-
-    print(f"\nDocumento: {stem}")
-
-    if not raw_src.exists():
-        print(f"  ✗ raw.md non trovato in step-2/{stem}/ — skip")
-        return False
-
-    if profile_out.exists() and not force:
-        print(f"  ⚠️  structure_profile.json già presente — skip")
-        print(f"       (usa --force per rieseguire)")
-        return True
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    # Copy files from step-2
-    shutil.copy2(raw_src, out_dir / "raw.md")
-    if clean_src.exists():
-        shutil.copy2(clean_src, out_dir / "clean.md")
-    print(f"  Copiati raw.md e clean.md da step-2/{stem}/")
-
-    # Analyze
-    print(f"  Analisi struttura in corso...")
-    profile = analyze(out_dir / "raw.md")
-
-    profile_out.write_text(json.dumps(profile, ensure_ascii=False, indent=2), encoding="utf-8")
-
-    # Report
-    _LIVELLO_DESC = {
-        3: "struttura ricca (###)",
-        2: "struttura parziale (##)",
-        1: "solo paragrafi",
-        0: "testo piatto",
-    }
-    print(f"  ✅ Livello {profile['livello_struttura']} — {_LIVELLO_DESC[profile['livello_struttura']]}")
-    print(f"     h1={profile['n_h1']}  h2={profile['n_h2']}  h3={profile['n_h3']}  paragrafi={profile['n_paragrafi']}")
-    print(f"     Boundary: {profile['boundary_primario']}  |  Strategia: {profile['strategia_chunking']}")
-    print(f"     Lingua: {profile['lingua_rilevata']}  |  Lunghezza media sezione: {profile['lunghezza_media_sezione']} char")
-    for w in profile["avvertenze"]:
-        print(f"     ⚠️  {w}")
-    print(f"  ✅ structure_profile.json salvato")
-    return True
-
-
-# ─── Entry point ─────────────────────────────────────────────────────────────
-
-if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-
-    parser = argparse.ArgumentParser(description="Step 3 — Rilevamento struttura Markdown")
-    parser.add_argument("--stem", help="Nome del documento (sottocartella di step-2/)")
-    parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
-    args = parser.parse_args()
-
-    if args.stem:
-        stems = [args.stem]
-    else:
-        step2_dir = project_root / "step-2"
-        if not step2_dir.exists():
-            print(f"Errore: cartella step-2/ non trovata in {project_root}")
-            sys.exit(1)
-        stems = sorted(p.name for p in step2_dir.iterdir() if p.is_dir())
-        if not stems:
-            print(f"Errore: nessun documento trovato in step-2/")
-            sys.exit(1)
-
-    results = [process_stem(s, project_root, args.force) for s in stems]
-
-    ok = sum(results)
-    total = len(results)
-    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti analizzati")
-
-    sys.exit(0 if all(results) else 1)
diff --git a/step-4/revise.py b/step-4/revise.py
deleted file mode 100644
index cf703a2..0000000
--- a/step-4/revise.py
+++ /dev/null
@@ -1,433 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 4 — Revisione automatica del Markdown
-
-Trasforma clean.md da step-3 rivelando la struttura latente del documento.
-Le trasformazioni sono euristiche universali che funzionano su qualsiasi PDF:
-
-  - Normalizza whitespace multiplo (artefatto PDF)
-  - Riduce righe vuote multiple
-  - Rimuove marcatori **bold** nelle intestazioni esistenti
-  - Converte righe ALL-CAPS standalone → ## header (euristico, qualsiasi lingua)
-  - Converte sezioni numerate "N.  testo" → ### N. (qualsiasi numerazione)
-  - Rimuove blocchi TOC (righe che iniziano con parole-chiave indice)
-
-Per ogni documento viene ricalcolato il profilo strutturale: il livello può
-salire (es. livello 1 → 3) se le strutture latenti vengono rilevate.
-
-Output in step-4/<stem>/:
-  raw.md                  — copia da step-3 (non modificare mai)
-  clean.md                — MD revisionato
-  structure_profile.json  — profilo aggiornato dopo la revisione
-
-Uso:
-    python step-4/revise.py                    # tutti i documenti in step-3/
-    python step-4/revise.py --stem nietzsche   # un solo documento
-    python step-4/revise.py --force            # riesegui anche se già presente
-"""
-
-import argparse
-import json
-import re
-import shutil
-import sys
-from datetime import date
-from pathlib import Path
-
-# Riusa la funzione analyze() già scritta nello step 3
-sys.path.insert(0, str(Path(__file__).parent.parent / "step-3"))
-from detect_structure import analyze  # noqa: E402
-
-
-# ─── Costanti ─────────────────────────────────────────────────────────────────
-
-# Parole-chiave che identificano blocchi TOC (da rimuovere)
-_TOC_KEYWORDS = frozenset([
-    "indice", "index", "contents", "table of contents",
-    "sommario", "inhaltsverzeichnis", "inhalt",
-])
-
-# Preposizioni/articoli da non capitalizzare nel title-case
-_STOP_IT_EN = frozenset([
-    # italiano
-    "di", "del", "della", "dei", "delle", "da", "in", "e", "il", "la",
-    "lo", "le", "gli", "un", "una", "per", "a", "al", "alla", "ai",
-    "alle", "con", "su", "sul", "sulla", "che", "o",
-    # inglese
-    "of", "the", "a", "an", "and", "or", "but", "in", "on", "at",
-    "to", "for", "with", "by", "from", "as",
-])
-
-# Ordinali italiani → romani (per titoli come "CAPITOLO PRIMO")
-_ORDINALS_IT = {
-    "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",
-    "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",
-    "NONO": "IX", "DECIMO": "X",
-}
-
-# Ordinali inglesi → arabici (per "CHAPTER ONE")
-_ORDINALS_EN = {
-    "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",
-    "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",
-}
-
-
-# ─── Utilità ──────────────────────────────────────────────────────────────────
-
-def _sentence_case(s: str) -> str:
-    """
-    Sentence-case: prima lettera maiuscola, resto minuscolo.
-    Corretto per l'italiano e accettabile per l'inglese accademico.
-    """
-    if not s:
-        return s
-    lower = s.lower()
-    return lower[0].upper() + lower[1:]
-
-
-def _is_allcaps_line(line: str) -> bool:
-    """
-    True se la riga è una candidata per conversione a ## header.
-    Criterio: tutti i caratteri alfabetici sono maiuscoli, lunghezza >= 3.
-    """
-    stripped = line.strip()
-    letters = [c for c in stripped if c.isalpha()]
-    return (
-        len(letters) >= 3
-        and all(c.isupper() for c in letters)
-        and not stripped.startswith("#")
-    )
-
-
-def _allcaps_to_header(raw_line: str) -> str:
-    """
-    Converte una riga ALL-CAPS in un ## header title-case.
-    Riconosce pattern specifici (CAPITOLO ORDINE, CHAPTER N) come bonus,
-    ma funziona in modalità generica su qualsiasi testo.
-    """
-    text = raw_line.strip().rstrip('.').rstrip('?').strip()
-
-    # ── Pattern italiano: "CAPITOLO PRIMO. TITOLO DEL CAPITOLO"
-    _ORD_IT_PAT = "|".join(_ORDINALS_IT.keys())
-    m = re.match(rf'^CAPITOLO ({_ORD_IT_PAT})\. (.+)', text)
-    if m:
-        roman = _ORDINALS_IT[m.group(1)]
-        titolo = m.group(2).rstrip('.').rstrip('?').strip()
-        return f"## Capitolo {roman} — {_sentence_case(titolo)}"
-
-    # ── Pattern inglese: "CHAPTER ONE. TITLE" o "CHAPTER 1. TITLE"
-    _ORD_EN_PAT = "|".join(_ORDINALS_EN.keys())
-    m = re.match(rf'^CHAPTER ({_ORD_EN_PAT}|\d+)\.? (.+)', text)
-    if m:
-        n = _ORDINALS_EN.get(m.group(1), m.group(1))
-        titolo = m.group(2).rstrip('.').rstrip('?').strip()
-        return f"## Chapter {n} — {_sentence_case(titolo)}"
-
-    # ── Pattern generico con numerazione romana o arabica nel prefisso
-    m = re.match(r'^([IVXLCDM]+|[0-9]+)\. (.+)', text)
-    if m:
-        n = m.group(1)
-        titolo = m.group(2).rstrip('.').strip()
-        return f"## {n}. {_sentence_case(titolo)}"
-
-    # ── Caso generico: tutto maiuscolo senza pattern riconoscibile
-    return f"## {_sentence_case(text)}"
-
-
-def _is_toc_line(line: str) -> bool:
-    """True se la riga è l'intestazione di un blocco indice/TOC."""
-    first_word = line.strip().split('.')[0].strip().lower()
-    return first_word in _TOC_KEYWORDS
-
-
-# ─── Trasformazioni ────────────────────────────────────────────────────────────
-
-def apply_transforms(text: str) -> tuple[str, dict]:
-    """
-    Applica tutte le trasformazioni strutturali al testo MD.
-    Restituisce (testo_modificato, statistiche).
-    """
-    stats = {
-        "toc_rimosso": False,
-        "n_header_allcaps": 0,
-        "n_sezioni_numerate": 0,
-        "n_paragrafi_uniti": 0,
-    }
-
-    # ── 1. Rimuovi marcatori **bold** nelle intestazioni esistenti
-    #       ## **Titolo** → ## Titolo
-    text = re.sub(
-        r'^(#{1,6})\s+\*\*(.+?)\*\*\s*$',
-        r'\1 \2',
-        text, flags=re.MULTILINE,
-    )
-
-    # ── 1b. Normalizza header esistenti con contenuto ALL-CAPS → sentence-case
-    #        ## AL DI LA' DEL BENE E DEL MALE → ## Al di la' del bene e del male
-    def _norm_allcaps_header(m: re.Match) -> str:
-        hashes = m.group(1)
-        content = m.group(2).strip()
-        letters = [c for c in content if c.isalpha()]
-        if letters and all(c.isupper() for c in letters):
-            return f"{hashes} {_sentence_case(content)}"
-        return m.group(0)
-
-    text = re.sub(
-        r'^(#{1,6}) (.+)$',
-        _norm_allcaps_header,
-        text, flags=re.MULTILINE,
-    )
-
-    # ── 2. Rimuovi blocco TOC (riga indice + contenuto inline sulla stessa riga)
-    #       "INDICE. Capitolo 1 Capitolo 2 ..."  → rimossa
-    lines = text.split('\n')
-    new_lines = []
-    for line in lines:
-        if _is_toc_line(line):
-            stats["toc_rimosso"] = True
-        else:
-            new_lines.append(line)
-    text = '\n'.join(new_lines)
-
-    # ── 3. Converti righe ALL-CAPS standalone → ## header
-    #       Una riga è "standalone" se è preceduta/seguita da riga vuota
-    #       oppure si trova all'inizio/fine del documento.
-    blocks = text.split('\n\n')
-    new_blocks = []
-    for block in blocks:
-        stripped = block.strip()
-        # Blocco standalone = un'unica riga (nessun \n interno rilevante)
-        if '\n' not in stripped and _is_allcaps_line(stripped):
-            new_blocks.append(_allcaps_to_header(stripped))
-            stats["n_header_allcaps"] += 1
-        else:
-            # Controlla riga per riga per righe ALL-CAPS seguite da altri contenuti
-            sub_lines = block.split('\n')
-            converted = []
-            for ln in sub_lines:
-                if _is_allcaps_line(ln) and len(ln.strip()) > 3:
-                    converted.append(_allcaps_to_header(ln))
-                    stats["n_header_allcaps"] += 1
-                else:
-                    converted.append(ln)
-            new_blocks.append('\n'.join(converted))
-    text = '\n\n'.join(new_blocks)
-
-    # ── 4. Converti sezioni numerate "N.  testo" → "### N.\n\ntesto"
-    #       Riconosce: "1.  Testo", "42.  Testo" (due o più spazi dopo il punto)
-    def _num_repl(m: re.Match) -> str:
-        num = m.group(1)
-        testo = m.group(2).strip()
-        stats["n_sezioni_numerate"] += 1
-        return f"### {num}.\n\n{testo}"
-
-    # Pattern standard: "1.  testo" o "1. testo"
-    text = re.sub(
-        r'^(\d+)\.\s+(.+)$',
-        _num_repl,
-        text, flags=re.MULTILINE,
-    )
-
-    # Pattern con lettera-suffisso: "65 a. testo" o "65a. testo"
-    def _num_letter_repl(m: re.Match) -> str:
-        num = m.group(1) + m.group(2)
-        testo = m.group(3).strip()
-        stats["n_sezioni_numerate"] += 1
-        return f"### {num}.\n\n{testo}"
-
-    text = re.sub(
-        r'^(\d+)\s*([a-z])\.\s+(.+)$',
-        _num_letter_repl,
-        text, flags=re.MULTILINE,
-    )
-
-    # ── 5. Unisci paragrafi spezzati da salti pagina PDF
-    #       Criterio: blocco A non finisce con punteggiatura di fine frase,
-    #       blocco B non inizia con maiuscola "di sezione" né è un header.
-    #       Unione sicura: mai attraverso confini ###/##.
-    _SENTENCE_END = set('.?!»)\'"')
-    blocks = text.split('\n\n')
-    merged = []
-    i = 0
-    while i < len(blocks):
-        b = blocks[i]
-        stripped = b.strip()
-        # Prova a unire con il successivo se la frase è spezzata
-        while (
-            i + 1 < len(blocks)
-            and stripped
-            and not stripped.startswith('#')
-            and stripped[-1] not in _SENTENCE_END
-        ):
-            nxt = blocks[i + 1].strip()
-            # Non unire se il successivo è un header o è vuoto
-            if not nxt or nxt.startswith('#'):
-                break
-            # Non unire se il successivo inizia con una cifra seguita da punto
-            # (sarebbe l'inizio di un nuovo aforisma non ancora convertito)
-            if re.match(r'^\d+\.', nxt):
-                break
-            b = stripped + ' ' + nxt
-            stripped = b.strip()
-            stats["n_paragrafi_uniti"] += 1
-            i += 1
-        merged.append(b)
-        i += 1
-    text = '\n\n'.join(merged)
-
-    # ── 6. Normalizza whitespace multiplo interno alle righe
-    #       "parola  parola" → "parola parola"  (inclusi gli header)
-    lines = text.split('\n')
-    normalized = []
-    for line in lines:
-        if not line.strip():
-            normalized.append(line)
-        else:
-            normalized.append(re.sub(r'  +', ' ', line))
-    text = '\n'.join(normalized)
-
-    # ── 7. Riduci righe vuote multiple a doppie
-    text = re.sub(r'\n{3,}', '\n\n', text)
-
-    return text, stats
-
-
-# ─── Aggiornamento revision log ────────────────────────────────────────────────
-
-def update_revision_log(
-    log_path: Path,
-    stem: str,
-    profile_before: dict,
-    profile_after: dict,
-    t_stats: dict,
-) -> None:
-    header_exists = log_path.exists() and log_path.stat().st_size > 0
-
-    avv = profile_after.get("avvertenze", [])
-    avv_str = "; ".join(avv) if avv else "nessuna"
-
-    entry = f"""
-## {stem} — {date.today().isoformat()}
-
-**Trasformazioni automatiche:**
-- Normalizzazione whitespace multiplo e righe vuote
-- Blocco TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}
-- Righe ALL-CAPS → ## header: {t_stats['n_header_allcaps']}
-- Sezioni numerate → ### header: {t_stats['n_sezioni_numerate']}
-- Paragrafi uniti (salti pagina PDF): {t_stats['n_paragrafi_uniti']}
-- Livello struttura: {profile_before.get('livello_struttura', '?')} → {profile_after.get('livello_struttura', '?')}
-
-**Avvertenze residue:** {avv_str}
-
-**Revisioni manuali pendenti:**
-- [ ] Verificare conversioni ALL-CAPS errate
-- [ ] Controllare sezioni troppo corte o troppo lunghe
-"""
-
-    if not header_exists:
-        log_path.write_text("# Revision log\n" + entry, encoding="utf-8")
-    else:
-        existing = log_path.read_text(encoding="utf-8")
-        log_path.write_text(existing + entry, encoding="utf-8")
-
-
-# ─── Per-document processing ─────────────────────────────────────────────────
-
-def process_stem(stem: str, project_root: Path, force: bool) -> bool:
-    src_dir = project_root / "step-3" / stem
-    out_dir = project_root / "step-4" / stem
-    raw_src = src_dir / "raw.md"
-    clean_src = src_dir / "clean.md"
-    profile_src = src_dir / "structure_profile.json"
-    clean_out = out_dir / "clean.md"
-    profile_out = out_dir / "structure_profile.json"
-
-    print(f"\nDocumento: {stem}")
-
-    if not clean_src.exists():
-        print(f"  ✗ clean.md non trovato in step-3/{stem}/ — skip")
-        return False
-
-    if clean_out.exists() and not force:
-        print(f"  ⚠️  clean.md già presente — skip")
-        print(f"       (usa --force per rieseguire)")
-        return True
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    # Copia raw.md immutabile (riferimento)
-    if raw_src.exists():
-        shutil.copy2(raw_src, out_dir / "raw.md")
-        print(f"  Copiato raw.md da step-3/{stem}/")
-
-    # Leggi profilo step-3 (per confronto nel report)
-    profile_before: dict = {}
-    if profile_src.exists():
-        profile_before = json.loads(profile_src.read_text(encoding="utf-8"))
-
-    # Applica trasformazioni
-    print(f"  Applicazione trasformazioni strutturali...")
-    text = clean_src.read_text(encoding="utf-8")
-    text_revised, t_stats = apply_transforms(text)
-
-    # Salva clean.md revisionato
-    clean_out.write_text(text_revised, encoding="utf-8")
-
-    # Ricalcola profilo sul nuovo clean.md
-    profile_after = analyze(clean_out)
-    profile_out.write_text(
-        json.dumps(profile_after, ensure_ascii=False, indent=2),
-        encoding="utf-8",
-    )
-
-    # Report
-    lv_b = profile_before.get("livello_struttura", "?")
-    lv_a = profile_after["livello_struttura"]
-    _STRAT = {3: "h3_aware", 2: "h2_paragraph_split", 1: "paragraph", 0: "sliding_window"}
-    print(f"  ✅ Livello struttura: {lv_b} → {lv_a}  ({_STRAT.get(lv_a, '?')})")
-    print(f"     h2: {profile_before.get('n_h2','?')} → {profile_after['n_h2']}")
-    print(f"     h3: {profile_before.get('n_h3','?')} → {profile_after['n_h3']}")
-    print(f"     TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}")
-    print(f"     Righe ALL-CAPS → ##: {t_stats['n_header_allcaps']}")
-    print(f"     Sezioni numerate → ###: {t_stats['n_sezioni_numerate']}")
-    print(f"     Paragrafi uniti (salti pagina): {t_stats['n_paragrafi_uniti']}")
-    for w in profile_after["avvertenze"]:
-        print(f"     ⚠️  {w}")
-
-    # Aggiorna revision log (direttamente in step-4/, non in sottocartella)
-    log_path = project_root / "step-4" / "revision_log.md"
-    update_revision_log(log_path, stem, profile_before, profile_after, t_stats)
-    print(f"  ✅ step-4/revision_log.md aggiornato")
-    print(f"  ✅ structure_profile.json salvato")
-    return True
-
-
-# ─── Entry point ─────────────────────────────────────────────────────────────
-
-if __name__ == "__main__":
-    project_root = Path(__file__).parent.parent
-
-    parser = argparse.ArgumentParser(description="Step 4 — Revisione automatica Markdown")
-    parser.add_argument("--stem", help="Nome del documento (sottocartella di step-3/)")
-    parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
-    args = parser.parse_args()
-
-    if args.stem:
-        stems = [args.stem]
-    else:
-        step3_dir = project_root / "step-3"
-        if not step3_dir.exists():
-            print(f"Errore: cartella step-3/ non trovata in {project_root}")
-            sys.exit(1)
-        stems = sorted(p.name for p in step3_dir.iterdir() if p.is_dir())
-        if not stems:
-            print(f"Errore: nessun documento trovato in step-3/")
-            sys.exit(1)
-
-    results = [process_stem(s, project_root, args.force) for s in stems]
-
-    ok = sum(results)
-    total = len(results)
-    print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti revisionati")
-
-    sys.exit(0 if all(results) else 1)

From fc457e8525bcf075bd81ac8442e7d410ffbd082b Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 18:16:32 +0200
Subject: [PATCH 16/22] =?UTF-8?q?feat(ollama):=20aggiungi=20step=207=20?=
 =?UTF-8?q?=E2=80=94=20verifica=20ambiente=20Ollama?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Script check_env.py e README per controllare prerequisiti prima
della vettorizzazione: ollama nel PATH, modelli embedding e LLM
disponibili, chromadb importabile
---
 ollama/README.md    | 113 ++++++++++++++++++++
 ollama/check_env.py | 252 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 365 insertions(+)
 create mode 100644 ollama/README.md
 create mode 100644 ollama/check_env.py

diff --git a/ollama/README.md b/ollama/README.md
new file mode 100644
index 0000000..053049a
--- /dev/null
+++ b/ollama/README.md
@@ -0,0 +1,113 @@
+# Ollama — Step 7 (Verifica Ambiente)
+
+Prima di procedere con la vettorizzazione (step 8) devi avere installato:
+
+- **Ollama** — server locale per LLM e embedding
+- un **modello di embedding** (es. `qwen3-embedding:0.6b`, `bge-m3`)
+- un **modello LLM** (es. `qwen3.5:4b`)
+- **chromadb** — libreria Python per il vector store
+
+---
+
+## 1. Installa Ollama
+
+```bash
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Verifica che il servizio sia attivo:
+
+```bash
+ollama list
+```
+
+### Disinstalla Ollama
+
+```bash
+# Ferma e rimuovi il servizio systemd
+sudo systemctl stop ollama
+sudo systemctl disable ollama
+sudo rm /etc/systemd/system/ollama.service
+sudo systemctl daemon-reload
+
+# Rimuovi il binario
+sudo rm /usr/local/bin/ollama
+
+# Rimuovi modelli e dati (opzionale)
+sudo rm -rf /usr/share/ollama
+
+# Rimuovi utente e gruppo di sistema (opzionale)
+sudo userdel ollama
+sudo groupdel ollama
+```
+
+---
+
+## 2. Scarica i modelli
+
+### Modello di embedding (consigliato)
+
+```bash
+ollama pull qwen3-embedding:0.6b
+```
+
+Alternative supportate:
+
+- `nomic-embed-text-v2-moe`
+- `bge-m3`
+- `nomic-embed-text`
+
+Se cambi embedding model rispetto a quello usato in step-8, riesegui ingest con `--force` e aggiorna `EMBED_MODEL` in `step-9/config.py`.
+
+### Modello LLM (consigliato per 8 GB RAM)
+
+```bash
+ollama pull qwen3.5:4b
+```
+
+Se usi un modello diverso, aggiorna `OLLAMA_MODEL` in `step-9/config.py`.
+
+### Disinstalla un modello
+
+```bash
+ollama rm qwen3.5:4b
+ollama rm qwen3-embedding:0.6b
+```
+
+---
+
+## 3. Installa le dipendenze Python
+
+```bash
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+---
+
+## 4. Verifica ambiente
+
+```bash
+source .venv/bin/activate
+python ollama/check_env.py
+```
+
+Output atteso (esempio):
+
+```text
+✅ ollama trovato nel PATH
+✅ ollama risponde correttamente
+✅ embedding disponibile: qwen3-embedding:0.6b
+✅ LLM disponibile: qwen3.5:4b
+✅ chromadb importabile
+✅ Ambiente pronto — procedi con la vettorizzazione:
+   python step-8/ingest.py --stem <nome>
+```
+
+---
+
+## Prossimo step
+
+```bash
+python step-8/ingest.py --stem <nome>
+```
diff --git a/ollama/check_env.py b/ollama/check_env.py
new file mode 100644
index 0000000..cd50ccc
--- /dev/null
+++ b/ollama/check_env.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""
+Verifica ambiente Ollama
+
+Controlla che tutti i prerequisiti per la vettorizzazione siano soddisfatti:
+  1. ollama è nel PATH e risponde
+  2. Il modello di embedding configurato in config.py è scaricato (senza config: almeno uno)
+  3. Il modello LLM configurato in config.py è scaricato (senza config: almeno uno)
+  4. chromadb è importabile
+
+Output: report a schermo con ✅ / ❌ per ogni componente.
+Nessun file scritto. Exit 0 se tutto OK, 1 altrimenti.
+
+Uso:
+    python ollama/check_env.py
+"""
+
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+
+# ─── Lista canonica di modelli embedding supportati ───────────────────────────
+# Ordine: prima scelta → ultima scelta (come da README step-7)
+EMBED_MODELS = [
+    "qwen3-embedding",
+    "nomic-embed-text-v2-moe",
+    "bge-m3",
+    "nomic-embed-text",
+    "mxbai-embed-large",
+    "paraphrase-multilingual",
+    "all-minilm",
+]
+EMBED_MODEL_PREFIXES = tuple(EMBED_MODELS)
+
+OLLAMA_SERVE_HINT = "   → Avvia il servizio con: ollama serve"
+RECOMMENDED_EMBED_MODEL = "qwen3-embedding:0.6b"
+RECOMMENDED_LLM_MODEL = "qwen3.5:4b"
+
+
+def _is_embed(model_name: str) -> bool:
+    """True se il modello è riconosciuto come embedding (lista canonica o keyword)."""
+    base = model_name.split(":")[0].lower()
+    return base.startswith(EMBED_MODEL_PREFIXES) or "embed" in base
+
+
+def _parse_ollama_models(raw_output: str) -> list[str]:
+    """Estrae i nomi modello dall'output di `ollama list`."""
+    models: list[str] = []
+    for idx, line in enumerate(raw_output.splitlines()):
+        line = line.strip()
+        if not line:
+            continue
+        # Prima riga: header tabellare ("NAME ...")
+        if idx == 0 and line.lower().startswith("name"):
+            continue
+        model_name = line.split(maxsplit=1)[0]
+        models.append(model_name)
+    return models
+
+
+# ─── Modelli configurati in config.py ─────────────────────────────────────────
+# Per spostare config.py alla root: cambia solo la riga qui sotto.
+sys.path.insert(0, str(Path(__file__).parent.parent / "step-9"))
+try:
+    from config import EMBED_MODEL as CONFIGURED_EMBED, OLLAMA_MODEL as CONFIGURED_LLM
+except Exception:
+    CONFIGURED_EMBED = None
+    CONFIGURED_LLM = None
+
+REQUIRED_LIBS = ["chromadb"]
+
+
+# ─── Checks ───────────────────────────────────────────────────────────────────
+
+def _print_model_list(title: str, models: list[str]) -> None:
+    """Stampa in modo uniforme una lista di modelli."""
+    if not models:
+        print(f"   {title}: nessuno")
+        return
+    print(f"   {title} ({len(models)}):")
+    for model in models:
+        print(f"   - {model}")
+
+def check_ollama_in_path() -> bool:
+    """Verifica che ollama sia nel PATH."""
+    found = shutil.which("ollama") is not None
+    if found:
+        print("✅ ollama trovato nel PATH")
+    else:
+        print("❌ ollama non trovato nel PATH")
+        print("   → Installa con: curl -fsSL https://ollama.com/install.sh | sh")
+    return found
+
+
+def check_ollama_running() -> list[str] | None:
+    """
+    Esegue 'ollama list' e ritorna la lista dei modelli disponibili.
+    Ritorna None se ollama non risponde.
+    """
+    try:
+        result = subprocess.run(
+            ["ollama", "list"],
+            capture_output=True, text=True, timeout=10
+        )
+        if result.returncode != 0:
+            print("❌ ollama non risponde (errore all'avvio)")
+            print(OLLAMA_SERVE_HINT)
+            return None
+        models = _parse_ollama_models(result.stdout)
+        print("✅ ollama risponde correttamente")
+        return models
+    except FileNotFoundError:
+        print("❌ ollama non trovato (FileNotFoundError)")
+        return None
+    except subprocess.TimeoutExpired:
+        print("❌ ollama non risponde (timeout)")
+        print(OLLAMA_SERVE_HINT)
+        return None
+
+
+def _match(model_name: str, available: list[str]) -> str | None:
+    """
+    Return the entry of 'available' that is 'model_name' itself or a tag of it
+    (e.g. "name:latest"), or None; "name-*" entries are different models.
+    """
+    for m in available:
+        if m == model_name or m.startswith(model_name + ":"):
+            return m
+    return None
+
+
+def _check_configured_model(
+    configured_name: str | None,
+    available: list[str],
+    label: str,
+) -> bool | None:
+    """
+    Se esiste un modello configurato, lo verifica e ritorna True/False.
+    Se non è configurato, ritorna None (il chiamante userà il fallback).
+    """
+    if not configured_name:
+        return None
+
+    print(f"   modello configurato (config.py): {configured_name}")
+    found = _match(configured_name, available)
+    if found:
+        print(f"✅ {label} disponibile: {found}")
+        return True
+
+    print(f"❌ {configured_name} non trovato in Ollama")
+    print(f"   → ollama pull {configured_name}")
+    return False
+
+
+def check_embed_model(available: list[str]) -> bool:
+    """Verifica che il modello di embedding configurato sia disponibile."""
+    configured_check = _check_configured_model(CONFIGURED_EMBED, available, "embedding")
+    if configured_check is not None:
+        return configured_check
+
+    # fallback: config.py non leggibile
+    found = next((m for m in available if _is_embed(m)), None)
+    if found:
+        print(f"✅ modello embedding trovato: {found}")
+        return True
+    print("❌ nessun modello di embedding trovato")
+    print(f"   → Prima scelta: ollama pull {RECOMMENDED_EMBED_MODEL}")
+    return False
+
+
+def check_llm_model(available: list[str]) -> bool:
+    """Verifica che il modello LLM configurato sia disponibile."""
+    configured_check = _check_configured_model(CONFIGURED_LLM, available, "LLM")
+    if configured_check is not None:
+        return configured_check
+
+    # fallback: config.py non leggibile
+    first_llm = next((m for m in available if not _is_embed(m)), None)
+    if first_llm:
+        print(f"✅ modello LLM trovato: {first_llm}")
+        return True
+    print("❌ nessun modello LLM trovato")
+    print(f"   → Consigliato per 8 GB RAM: ollama pull {RECOMMENDED_LLM_MODEL}")
+    return False
+
+
+def check_library(lib: str) -> bool:
+    """Verifica che una libreria Python sia importabile."""
+    try:
+        __import__(lib)
+        print(f"✅ {lib} importabile")
+        return True
+    except ImportError:
+        print(f"❌ {lib} non importabile")
+        print(f"   → Installa con: pip install {lib}")
+        return False
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+def main() -> int:
+    print("─── Verifica ambiente Ollama ─────────────────────────────────────────\n")
+
+    results: list[bool] = []
+
+    # 1. ollama nel PATH
+    in_path = check_ollama_in_path()
+    results.append(in_path)
+
+    # 2. ollama risponde + modelli
+    if in_path:
+        available = check_ollama_running()
+        if available is None:
+            results.extend([False, False, False])
+        else:
+            results.append(True)
+            _print_model_list(
+                "modelli embedding rilevati",
+                [m for m in available if _is_embed(m)],
+            )
+            _print_model_list(
+                "modelli LLM rilevati",
+                [m for m in available if not _is_embed(m)],
+            )
+            results.append(check_embed_model(available))
+            results.append(check_llm_model(available))
+    else:
+        results.extend([False, False, False])
+        print("⚠️  modelli non verificabili (ollama non trovato)")
+
+    # 3. Librerie Python
+    print()
+    for lib in REQUIRED_LIBS:
+        results.append(check_library(lib))
+
+    # ── Riepilogo ─────────────────────────────────────────────────────────────
+    print()
+    print("──────────────────────────────────────────────────────────────────────")
+    all_ok = all(results)
+    if all_ok:
+        print("✅ Ambiente pronto")
+    else:
+        n_fail = sum(1 for r in results if not r)
+        print(f"⚠️  {n_fail} problema/i rilevato/i — risolvi prima di procedere.")
+
+    return 0 if all_ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 12effa1a51a5a4a282832ee0204b4049355b01d0 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 18:50:31 +0200
Subject: [PATCH 17/22] refactor: elimina step-7 e step-9, consolida script
 alla root
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- step-9/: config.py, rag.py, retrieve.py → root;
  test_ollama.py → ollama/
- step-7/: eliminata, già coperta da ollama/
- sys.path aggiornati in rag.py, retrieve.py, ingest.py,
  check_env.py (step-7 e ollama)
- Riferimenti step-9/config.py → config.py in tutti i file
---
 config.py             |  54 +++++++++
 ollama/README.md      |   4 +-
 ollama/check_env.py   |   6 +-
 ollama/test_ollama.py |  66 +++++++++++
 rag.py                | 252 ++++++++++++++++++++++++++++++++++++++++++
 retrieve.py           | 217 ++++++++++++++++++++++++++++++++++++
 step-8/README.md      |  16 +--
 step-8/ingest.py      |  10 +-
 8 files changed, 605 insertions(+), 20 deletions(-)
 create mode 100644 config.py
 create mode 100644 ollama/test_ollama.py
 create mode 100644 rag.py
 create mode 100644 retrieve.py

diff --git a/config.py b/config.py
new file mode 100644
index 0000000..870c2d3
--- /dev/null
+++ b/config.py
@@ -0,0 +1,54 @@
+# ─── Configurazione RAG ───────────────────────────────────────────────────────
+#
+# Modifica questo file per cambiare i parametri della pipeline.
+#
+# Uso:
+#   python rag.py --stem nietzsche
+# ──────────────────────────────────────────────────────────────────────────────
+
+# ── Retrieval ─────────────────────────────────────────────────────────────────
+
+# Numero di chunk da recuperare per ogni domanda.
+# Valori più alti = più contesto, risposte potenzialmente più complete,
+# ma prompt più lunghi e generazione più lenta.
+TOP_K = 6
+
+# ── Generazione ───────────────────────────────────────────────────────────────
+
+# Temperatura del modello LLM.
+# 0.0 = completamente deterministico (stessa risposta ad ogni run)
+# 0.7 = più creativo e vario
+TEMPERATURE = 0.0
+
+# Disabilita il "thinking" (ragionamento interno) nei modelli Qwen3/Qwen3.5.
+# True  = risposta diretta, più veloce
+# False = ragionamento interno abilitato (più lento ma potenzialmente più accurato)
+NO_THINK = True
+
+# ── Embedding ─────────────────────────────────────────────────────────────────
+
+# Modello di embedding usato da Ollama.
+# Deve corrispondere al modello usato durante la vettorizzazione (step-8).
+# Se cambi questo, devi rieseguire step-8 con --force.
+EMBED_MODEL = "nomic-embed-text"
+
+# ── Ollama ────────────────────────────────────────────────────────────────────
+
+# URL del server Ollama (default: locale sulla porta 11434).
+OLLAMA_URL = "http://localhost:11434"
+
+# Modello LLM. Scegli in base alla RAM disponibile (vedi README).
+OLLAMA_MODEL = "qwen3.5:0.8b"
+
+# ── Prompt di sistema ─────────────────────────────────────────────────────────
+
+# Istruzioni di comportamento inviate al LLM prima del contesto e della domanda.
+# Modifica per cambiare il tono, la lingua, il grado di libertà interpretativa
+# o le condizioni di fallback ("non so rispondere").
+SYSTEM_PROMPT = (
+    "Sei un assistente che risponde usando il contesto fornito. "
+    "Sintetizza e interpreta liberamente i passaggi del contesto per rispondere alla domanda. "
+    "Se il contesto contiene informazioni pertinenti, anche indirette, usale per costruire una risposta. "
+    "Solo se il contesto è completamente irrilevante, rispondi: "
+    "\"Non trovo questa informazione nel documento.\""
+)
diff --git a/ollama/README.md b/ollama/README.md
index 053049a..79012ce 100644
--- a/ollama/README.md
+++ b/ollama/README.md
@@ -57,7 +57,7 @@ Alternative supportate:
 - `bge-m3`
 - `nomic-embed-text`
 
-Se cambi embedding model rispetto a quello usato in step-8, riesegui ingest con `--force` e aggiorna `EMBED_MODEL` in `step-9/config.py`.
+Se cambi embedding model rispetto a quello usato in step-8, riesegui ingest con `--force` e aggiorna `EMBED_MODEL` in `config.py`.
 
 ### Modello LLM (consigliato per 8 GB RAM)
 
@@ -65,7 +65,7 @@ Se cambi embedding model rispetto a quello usato in step-8, riesegui ingest con
 ollama pull qwen3.5:4b
 ```
 
-Se usi un modello diverso, aggiorna `OLLAMA_MODEL` in `step-9/config.py`.
+Se usi un modello diverso, aggiorna `OLLAMA_MODEL` in `config.py`.
 
 ### Disinstalla un modello
 
diff --git a/ollama/check_env.py b/ollama/check_env.py
index cd50ccc..359f0e9 100644
--- a/ollama/check_env.py
+++ b/ollama/check_env.py
@@ -22,7 +22,7 @@ from pathlib import Path
 
 
 # ─── Lista canonica di modelli embedding supportati ───────────────────────────
-# Ordine: prima scelta → ultima scelta (come da README step-7)
+# Ordine: prima scelta → ultima scelta (come da ollama/README.md)
 EMBED_MODELS = [
     "qwen3-embedding",
     "nomic-embed-text-v2-moe",
@@ -60,9 +60,7 @@ def _parse_ollama_models(raw_output: str) -> list[str]:
     return models
 
 
-# ─── Modelli configurati in config.py ─────────────────────────────────────────
-# Per spostare config.py alla root: cambia solo la riga qui sotto.
-sys.path.insert(0, str(Path(__file__).parent.parent / "step-9"))
+sys.path.insert(0, str(Path(__file__).parent.parent))
 try:
     from config import EMBED_MODEL as CONFIGURED_EMBED, OLLAMA_MODEL as CONFIGURED_LLM
 except Exception:
diff --git a/ollama/test_ollama.py b/ollama/test_ollama.py
new file mode 100644
index 0000000..3054d59
--- /dev/null
+++ b/ollama/test_ollama.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""
+Test chat locale Ollama — senza RAG, senza ChromaDB.
+Uso: python ollama/test_ollama.py
+"""
+
+import json
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+import config as _cfg
+
+OLLAMA_URL  = _cfg.OLLAMA_URL
+MODEL       = _cfg.OLLAMA_MODEL
+TEMPERATURE = _cfg.TEMPERATURE
+NO_THINK    = _cfg.NO_THINK
+
+
+def chat(prompt: str) -> str:
+    payload = json.dumps({
+        "model": MODEL,
+        "prompt": prompt,
+        "stream": False,
+        "think": not NO_THINK,
+        "options": {"temperature": TEMPERATURE},
+    }).encode()
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/api/generate",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        return json.loads(resp.read())["response"].strip()
+
+
+def main() -> int:
+    print(f"─── Chat Ollama ──────────────────────────────── (exit per uscire)")
+    print(f"  Modello   : {MODEL}")
+    print(f"  Thinking  : {'off' if NO_THINK else 'on'}")
+    print()
+
+    while True:
+        try:
+            user = input("Tu: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            print("\nUscita.")
+            break
+        if not user:
+            continue
+        if user.lower() == "exit":
+            break
+        try:
+            reply = chat(user)
+            print(f"\nAssistente: {reply}\n")
+        except (urllib.error.URLError, OSError) as e:
+            print(f"❌ Errore: {e}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/rag.py b/rag.py
new file mode 100644
index 0000000..f8f406e
--- /dev/null
+++ b/rag.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+"""
+Pipeline RAG interattiva
+
+Riceve una domanda, recupera i chunk più rilevanti da ChromaDB (retrieval)
+e genera una risposta tramite Ollama (generation).
+
+Input:  chroma_db/<stem> (collection ChromaDB)
+Output: risposta a schermo
+
+Uso:
+    python rag.py --stem <nome>
+
+Nel loop interattivo:
+    Domanda: <testo>       → risposta
+    Domanda: <testo> -v    → risposta + chunk recuperati
+    Domanda: exit          → uscita
+"""
+
+import argparse
+import json
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+import chromadb
+
+# ─── Configurazione ───────────────────────────────────────────────────────────
+
+sys.path.insert(0, str(Path(__file__).parent))
+import config as _cfg
+
+project_root = Path(__file__).parent
+CHROMA_DIR   = project_root / "chroma_db"
+
+OLLAMA_URL    = _cfg.OLLAMA_URL
+EMBED_MODEL   = _cfg.EMBED_MODEL
+LLM_MODEL     = _cfg.OLLAMA_MODEL
+TOP_K         = _cfg.TOP_K
+TEMPERATURE   = _cfg.TEMPERATURE
+NO_THINK      = _cfg.NO_THINK
+SYSTEM_PROMPT = _cfg.SYSTEM_PROMPT
+
+
+# ─── Embedding ────────────────────────────────────────────────────────────────
+
+def embed(text: str) -> list[float]:
+    """Genera il vettore della domanda tramite Ollama."""
+    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/api/embeddings",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read())["embedding"]
+
+
+# ─── Generazione ──────────────────────────────────────────────────────────────
+
+def call_ollama(prompt: str, system: str = "") -> str:
+    """Chiama Ollama /api/generate e ritorna la risposta."""
+    payload = json.dumps({
+        "model": LLM_MODEL,
+        "system": system,
+        "prompt": prompt,
+        "stream": False,
+        "think": not NO_THINK,
+        "options": {"temperature": TEMPERATURE},
+    }).encode()
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/api/generate",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        return json.loads(resp.read())["response"].strip()
+
+
+# ─── Retrieval ────────────────────────────────────────────────────────────────
+
+def retrieve(collection: chromadb.Collection, question: str) -> list[dict]:
+    """
+    Genera l'embedding della domanda e recupera i TOP_K chunk più simili.
+    Ritorna lista di dict con chiavi: text, sezione, titolo, distance.
+    """
+    vector = embed(question)
+    results = collection.query(
+        query_embeddings=[vector],
+        n_results=TOP_K,
+        include=["documents", "metadatas", "distances"],
+    )
+    chunks = []
+    for text, meta, dist in zip(
+        results["documents"][0],
+        results["metadatas"][0],
+        results["distances"][0],
+    ):
+        chunks.append({
+            "text":     text,
+            "sezione":  meta.get("sezione", ""),
+            "titolo":   meta.get("titolo", ""),
+            "distance": dist,
+        })
+    return chunks
+
+
+# ─── Prompt ───────────────────────────────────────────────────────────────────
+
+def build_prompt(question: str, chunks: list[dict]) -> tuple[str, str]:
+    """Return the (system, user_prompt) pair, kept separate for the Ollama API."""
+    context_parts = []
+    for i, c in enumerate(chunks, start=1):
+        header = f"[Contesto {i}"
+        if c["sezione"]:
+            header += f" — {c['sezione']}"
+            if c["titolo"]:
+                header += f" > {c['titolo']}"
+        header += "]"
+        context_parts.append(f"{header}\n{c['text']}")
+
+    context = "\n\n".join(context_parts)
+    user_prompt = f"{context}\n\nDomanda: {question}"
+    return SYSTEM_PROMPT, user_prompt
+
+
+# ─── Loop interattivo ─────────────────────────────────────────────────────────
+
+def answer(question: str, collection: chromadb.Collection, verbose: bool) -> None:
+    try:
+        chunks = retrieve(collection, question)
+    except (urllib.error.URLError, OSError) as e:
+        print(f"❌ Errore embedding: {e}")
+        return
+
+    if verbose:
+        print("\n── Chunk recuperati ──────────────────────────────────────────")
+        for i, c in enumerate(chunks, start=1):
+            loc = c["sezione"]
+            if c["titolo"]:
+                loc += f" > {c['titolo']}"
+            sim = 1 - c["distance"]
+            print(f"  [{i}] {loc}  (similarità: {sim:.3f})")
+            print(f"      {c['text'][:120].replace(chr(10), ' ')}...")
+        print("──────────────────────────────────────────────────────────────\n")
+
+    system, prompt = build_prompt(question, chunks)
+
+    try:
+        response = call_ollama(prompt, system=system)
+    except (urllib.error.URLError, OSError) as e:
+        print(f"❌ Errore generazione: {e}")
+        return
+
+    print(f"\n{response}\n")
+
+
+def run_loop(collection: chromadb.Collection) -> None:
+    print("── Loop RAG ─────────────────────────────────────── (exit per uscire)\n")
+    while True:
+        try:
+            raw = input("Domanda: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            print("\nUscita.")
+            break
+
+        if not raw:
+            continue
+        if raw.lower() == "exit":
+            break
+
+        verbose = raw.endswith(" -v")
+        question = raw[:-3].strip() if verbose else raw
+
+        answer(question, collection, verbose)
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+def _build_epilog() -> str:
+    lines = [
+        "Uso:",
+        "  python rag.py --stem <nome>",
+        "",
+        "Loop interattivo:",
+        "  <domanda>       risposta basata sul documento",
+        "  <domanda> -v    risposta + chunk recuperati con score di similarità",
+        "  exit            termina",
+    ]
+    if CHROMA_DIR.exists():
+        try:
+            client = chromadb.PersistentClient(path=str(CHROMA_DIR))
+            names = [c.name for c in client.list_collections()]
+            if names:
+                lines += ["", f"Collection disponibili: {', '.join(names)}"]
+            else:
+                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
+        except Exception:
+            pass
+    return "\n".join(lines)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Pipeline RAG interattiva\n\n"
+            "Risponde a domande in linguaggio naturale su un documento\n"
+            "indicizzato in ChromaDB da step-8/ingest.py."
+        ),
+        epilog=_build_epilog(),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--stem",
+        required=True,
+        help=(
+            "Nome della collection ChromaDB da interrogare. "
+            "Le collection vengono create da: python step-8/ingest.py --stem <nome>"
+        ),
+    )
+    args = parser.parse_args()
+
+    print("─── Pipeline RAG ────────────────────────────────────────────\n")
+    print(f"  Documento : {args.stem}")
+    print(f"  Modello   : {LLM_MODEL}")
+    print(f"  Top-K     : {TOP_K}")
+    print(f"  Thinking  : {'off' if NO_THINK else 'on'}")
+    print()
+
+    if not CHROMA_DIR.exists():
+        print("❌ chroma_db/ non trovata — esegui prima step-8")
+        return 1
+
+    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
+    collections = [c.name for c in client.list_collections()]
+    if args.stem not in collections:
+        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/")
+        print(f"   → python step-8/ingest.py --stem {args.stem}")
+        return 1
+
+    collection = client.get_collection(args.stem)
+    print(f"✅ Collection '{args.stem}' caricata ({collection.count()} chunk)\n")
+
+    run_loop(collection)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/retrieve.py b/retrieve.py
new file mode 100644
index 0000000..03b26a1
--- /dev/null
+++ b/retrieve.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+Retrieval puro (senza generazione LLM)
+
+Loop interattivo: inserisci una query, ottieni i chunk più simili dalla
+collection ChromaDB tramite embedding semantico — senza chiamare Ollama
+per la generation.
+
+Utile per:
+  - verificare la qualità del retrieval prima di diagnosticare risposte sbagliate
+  - controllare che i chunk giusti vengano recuperati per una query
+  - usare la pipeline come motore di ricerca semantica
+
+Input:  chroma_db/<stem> (collection ChromaDB)
+Output: lista chunk con score di similarità
+
+Uso:
+    python retrieve.py --stem <nome>
+
+Nel loop interattivo:
+    Query: <testo>      → chunk più simili con score
+    Query: <testo> -f   → testo completo dei chunk
+    Query: exit         → uscita
+"""
+
+import argparse
+import json
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+import chromadb
+
+# ─── Configurazione ───────────────────────────────────────────────────────────
+
+sys.path.insert(0, str(Path(__file__).parent))
+import config as _cfg
+
+project_root = Path(__file__).parent
+CHROMA_DIR   = project_root / "chroma_db"
+
+OLLAMA_URL  = _cfg.OLLAMA_URL
+EMBED_MODEL = _cfg.EMBED_MODEL
+TOP_K       = _cfg.TOP_K
+
+
+# ─── Embedding ────────────────────────────────────────────────────────────────
+
+def embed(text: str) -> list[float]:
+    """Genera il vettore della query tramite Ollama."""
+    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
+    req = urllib.request.Request(
+        f"{OLLAMA_URL}/api/embeddings",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read())["embedding"]
+
+
+# ─── Retrieval ────────────────────────────────────────────────────────────────
+
+def retrieve(collection: chromadb.Collection, query: str, top_k: int) -> list[dict]:
+    """
+    Genera l'embedding della query e recupera i top_k chunk più simili.
+    Ritorna lista di dict con chiavi: rank, similarity, sezione, titolo, text.
+    """
+    vector = embed(query)
+    results = collection.query(
+        query_embeddings=[vector],
+        n_results=top_k,
+        include=["documents", "metadatas", "distances"],
+    )
+    chunks = []
+    for rank, (text, meta, dist) in enumerate(
+        zip(
+            results["documents"][0],
+            results["metadatas"][0],
+            results["distances"][0],
+        ),
+        start=1,
+    ):
+        chunks.append({
+            "rank":       rank,
+            "similarity": round(1 - dist, 4),
+            "sezione":    meta.get("sezione", ""),
+            "titolo":     meta.get("titolo", ""),
+            "text":       text,
+        })
+    return chunks
+
+
+# ─── Output ───────────────────────────────────────────────────────────────────
+
+def print_results(chunks: list[dict], full: bool = False) -> None:
+    print(f"── {len(chunks)} chunk recuperati ─────────────────────────────────\n")
+    for c in chunks:
+        loc = c["sezione"]
+        if c["titolo"]:
+            loc += f" > {c['titolo']}"
+        print(f"  [{c['rank']}] similarità: {c['similarity']:.4f}  |  {loc}")
+        if full:
+            print()
+            print(c["text"])
+        else:
+            print(f"      {c['text'][:200].replace(chr(10), ' ')}")
+            if len(c["text"]) > 200:
+                print(f"      … ({len(c['text'])} caratteri totali)")
+        print()
+
+
+# ─── Loop interattivo ─────────────────────────────────────────────────────────
+
+def run_loop(collection: chromadb.Collection, top_k: int) -> None:
+    print("── Loop retrieval ──────────────────────── (exit per uscire, -f per testo completo)\n")
+    while True:
+        try:
+            raw = input("Query: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            print("\nUscita.")
+            break
+
+        if not raw:
+            continue
+        if raw.lower() == "exit":
+            break
+
+        full = raw.endswith(" -f")
+        query = raw[:-3].strip() if full else raw
+
+        try:
+            chunks = retrieve(collection, query, top_k)
+        except (urllib.error.URLError, OSError) as e:
+            print(f"❌ Errore embedding (Ollama raggiungibile?): {e}\n")
+            continue
+
+        print()
+        print_results(chunks, full=full)
+
+
+# ─── Entry point ──────────────────────────────────────────────────────────────
+
+def _build_epilog() -> str:
+    lines = [
+        "Uso:",
+        "  python retrieve.py --stem <nome>",
+        "",
+        "Nel loop interattivo:",
+        "  <query>       chunk più simili con score (testo troncato)",
+        "  <query> -f    testo completo dei chunk",
+        "  exit          termina",
+    ]
+    if CHROMA_DIR.exists():
+        try:
+            client = chromadb.PersistentClient(path=str(CHROMA_DIR))
+            names = [c.name for c in client.list_collections()]
+            if names:
+                lines += ["", f"Collection disponibili: {', '.join(names)}"]
+            else:
+                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
+        except Exception:
+            pass
+    return "\n".join(lines)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Retrieval puro (senza LLM)\n\n"
+            "Loop interattivo: inserisci una query e ottieni i chunk più simili\n"
+            "tramite embedding semantico, senza generazione LLM."
+        ),
+        epilog=_build_epilog(),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--stem",
+        required=True,
+        help="Nome della collection ChromaDB da interrogare.",
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=TOP_K,
+        metavar="N",
+        help=f"Numero di chunk da restituire per query (default: {TOP_K} da config.py).",
+    )
+    args = parser.parse_args()
+
+    print("─── Retrieval puro ──────────────────────────────────────────\n")
+    print(f"  Documento    : {args.stem}")
+    print(f"  Embed model  : {EMBED_MODEL}")
+    print(f"  Top-K        : {args.top_k}")
+    print()
+
+    if not CHROMA_DIR.exists():
+        print("❌ chroma_db/ non trovata — esegui prima step-8", file=sys.stderr)
+        return 1
+
+    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
+    collections = [c.name for c in client.list_collections()]
+    if args.stem not in collections:
+        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/", file=sys.stderr)
+        print(f"   → python step-8/ingest.py --stem {args.stem}", file=sys.stderr)
+        return 1
+
+    collection = client.get_collection(args.stem)
+    print(f"✅ Collection '{args.stem}' caricata ({collection.count()} chunk)\n")
+
+    run_loop(collection, args.top_k)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/step-8/README.md b/step-8/README.md
index 322b38a..afcef49 100644
--- a/step-8/README.md
+++ b/step-8/README.md
@@ -15,14 +15,14 @@ salva in ChromaDB (vector store persistente su disco).
 
 ## Configurazione modello
 
-Il modello di embedding viene letto da **`step-9/config.py`**:
+Il modello di embedding viene letto da **`config.py`**:
 
 ```python
-# step-9/config.py
+# config.py
 EMBED_MODEL = "nomic-embed-text"   # ← cambia qui
 ```
 
-> Il modello scelto qui deve corrispondere a quello usato in step-9.
+> Il modello scelto qui deve corrispondere a quello usato in rag.py.
 > Se lo cambi dopo aver già vettorizzato, devi rieseguire step-8 con `--force`.
 
 ---
@@ -54,7 +54,7 @@ distanza coseno. La directory è ignorata da git (generata automaticamente).
 
 ## Modelli supportati
 
-Stessi modelli raccomandati nel [README di step-7](../step-7/README.md).
+Stessi modelli raccomandati nel [README di ollama](../ollama/README.md).
 Il modello deve essere scaricato in Ollama prima di eseguire questo script
 (`ollama pull <modello>`).
 
@@ -81,16 +81,16 @@ Prima scelta: `qwen3-embedding:0.6b`.
 `qwen3-embedding` + `qwen3.5` condividono tokenizer e spazio semantico —
 il retrieval è più coerente rispetto a modelli di famiglie diverse.
 
-### Coerenza tra step-8 e step-9
+### Coerenza tra ingest e retrieval
 
-**`EMBED_MODEL` deve essere identico in step-8 e step-9.**
-ChromaDB memorizza i vettori generati con un certo modello. Se step-9 usa un
+**`EMBED_MODEL` deve essere identico in `ingest.py` e `rag.py`.**
+ChromaDB memorizza i vettori generati con un certo modello. Se `rag.py` usa un
 modello diverso per la query di ricerca, gli spazi vettoriali non corrispondono
 e il retrieval restituisce risultati casuali — senza alcun errore visibile.
 
 **Dopo aver cambiato `EMBED_MODEL`, riesegui sempre con `--force`.**
 Senza `--force` lo script salta la collection già esistente — i vecchi vettori
-(generati col modello precedente) restano e continuano a essere usati da step-9.
+(generati col modello precedente) restano e continuano a essere usati da `rag.py`.
 
 ```bash
 # Cambio modello → ricrea sempre la collection
diff --git a/step-8/ingest.py b/step-8/ingest.py
index 8db0329..7dda557 100644
--- a/step-8/ingest.py
+++ b/step-8/ingest.py
@@ -5,9 +5,9 @@ Step 8 — Vettorizzazione
 Legge i chunk prodotti da step-6, genera gli embedding tramite Ollama
 e li indicizza in ChromaDB (persistente).
 
-Il modello di embedding viene letto da step-9/config.py (EMBED_MODEL).
+Il modello di embedding viene letto da config.py (EMBED_MODEL).
 Puoi sovrascriverlo con --model, ma deve corrispondere al modello che
-userai in step-9 — altrimenti riesegui con --force dopo aver cambiato.
+userai in rag.py — altrimenti riesegui con --force dopo aver cambiato.
 
 Input:  step-6/<stem>/chunks.json
 Output: chroma_db/<stem> (collection ChromaDB)
@@ -36,9 +36,7 @@ project_root = Path(__file__).parent.parent
 CHUNKS_DIR = project_root / "step-6"
 CHROMA_DIR = project_root / "chroma_db"
 
-# Legge EMBED_MODEL e OLLAMA_URL da step-9/config.py (fonte di verità).
-# Per spostare config.py alla root: cambia solo la riga qui sotto.
-sys.path.insert(0, str(project_root / "step-9"))
+sys.path.insert(0, str(project_root))
 from config import EMBED_MODEL, OLLAMA_URL  # noqa: E402
 
 EMBED_ENDPOINT = f"{OLLAMA_URL}/api/embeddings"
@@ -205,7 +203,7 @@ def main() -> int:
     parser.add_argument("--force", action="store_true",
                         help="Sovrascrive la collection se già esistente")
     parser.add_argument("--model", default=EMBED_MODEL,
-                        help=f"Modello embedding Ollama (default da step-9/config.py: {EMBED_MODEL})")
+                        help=f"Modello embedding Ollama (default da config.py: {EMBED_MODEL})")
     args = parser.parse_args()
 
     print("─── Step 8 — Vettorizzazione ─────────────────────────────────────────\n")

From e02e3496a38bfc241b70472cf9489bbc19340dbf Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 18:50:50 +0200
Subject: [PATCH 18/22] chore(requirements): rimuovi commenti step-X obsoleti

---
 requirements.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6cc5bce..dc5da54 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,4 @@
-# Step 0-1 — Ispezione e verifica PDF
 pdfplumber==0.11.9
-
-# Step 2 — Conversione PDF → Markdown
 pymupdf4llm
-
-# conversione/ — Pipeline automatica PDF → clean Markdown (alternativa a step 0+1+2+3+4)
-# Richiede anche Java 11+ sul PATH: https://adoptium.net/
 opendataloader-pdf
-
-# Step 8 — Vettorizzazione
 chromadb

From af9ffc05593079c284b5452f0313c1c942e183d6 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 18:51:09 +0200
Subject: [PATCH 19/22] docs(README): riscrittura per struttura reale del
 progetto
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sostituisce la struttura step-0…step-10 con la pipeline
effettiva: conversione/, revisione /prepare-md, chunking,
verifica, ollama/, vettorizzazione, interrogazione
---
 README.md | 719 +++++++++++++++---------------------------------------
 1 file changed, 201 insertions(+), 518 deletions(-)

diff --git a/README.md b/README.md
index c35ffd5..32465cc 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 Sistema RAG (Retrieval-Augmented Generation) costruito da zero, senza framework di alto livello.
 Funziona su qualsiasi PDF digitale. Gira interamente in locale, senza GPU, senza cloud.
 
-**Stack:** Python · Ollama · nomic-embed-text · Qwen3.5 · ChromaDB  
+**Stack:** Python · Ollama · ChromaDB · Qwen3-embedding · Qwen3.5  
 **Compatibile con:** Linux · macOS · Windows (WSL2) · CPU Only · ~8 GB RAM libera
 
 ---
@@ -12,18 +12,14 @@ Funziona su qualsiasi PDF digitale. Gira interamente in locale, senza GPU, senza
 
 - [Panoramica](#panoramica)
 - [Struttura del progetto](#struttura-del-progetto)
-- [Gli step](#gli-step)
-  - [Step 0 — Scegli il PDF](#step-0--scegli-il-pdf)
-  - [Step 1 — Ispezione automatica](#step-1--ispezione-automatica)
-  - [Step 2 — Conversione in Markdown grezzo](#step-2--conversione-in-markdown-grezzo)
-  - [Step 3 — Rilevamento struttura](#step-3--rilevamento-struttura)
-  - [Step 4 — Revisione manuale](#step-4--revisione-manuale)
-  - [Step 5 — Chunking adattivo](#step-5--chunking-adattivo)
-  - [Step 6 — Verifica chunk](#step-6--verifica-chunk)
-  - [Step 7 — Installazione ambiente](#step-7--installazione-ambiente)
-  - [Step 8 — Vettorizzazione](#step-8--vettorizzazione)
-  - [Step 9 — Pipeline RAG](#step-9--pipeline-rag)
-  - [Step 10 — Test automatici](#step-10--test-automatici)
+- [Pipeline](#pipeline)
+  - [Conversione](#conversione)
+  - [Revisione Markdown](#revisione-markdown)
+  - [Chunking](#chunking)
+  - [Verifica chunk](#verifica-chunk)
+  - [Ambiente Ollama](#ambiente-ollama)
+  - [Vettorizzazione](#vettorizzazione)
+  - [Interrogazione](#interrogazione)
 - [Principi di progettazione](#principi-di-progettazione)
 
 ---
@@ -31,36 +27,35 @@ Funziona su qualsiasi PDF digitale. Gira interamente in locale, senza GPU, senza
 ## Panoramica
 
 ```
-PDF
- └─► STEP 1  Ispezione automatica
-      └─► STEP 2  Conversione in Markdown grezzo
-           └─► STEP 3  Rilevamento struttura
-                └─► STEP 4  Revisione manuale        ← step più importante
-                     └─► STEP 5  Chunking adattivo
-                          └─► STEP 6  Verifica chunk
-                               └─► STEP 8  Vettorizzazione
-                                    └─► STEP 9  Pipeline RAG
-                                         └─► STEP 10 Test automatici
-
-STEP 0  Prerequisito iniziale (PDF adatto)
-STEP 7  Prerequisito tecnico (ambiente locale)
+PDF (sources/)
+    │
+    ▼  conversione/pipeline.py
+clean.md  ←  revisiona con /prepare-md
+    │
+    ▼  step-5/chunker.py
+chunks.json
+    │
+    ▼  step-6/verify_chunks.py + fix_chunks.py
+chunks.json verificato
+    │
+    ▼  step-8/ingest.py
+ChromaDB
+    │
+    ▼  rag.py
+risposta
 ```
 
 ### Dove si concentra il rischio
 
-| Step | Rischio | Motivo |
+| Fase | Rischio | Motivo |
 |---|---|---|
-| Step 0 | 🔴 Alto | Un PDF inadatto invalida tutto il lavoro successivo |
-| Step 1 | 🟢 Basso | Automatico, solo osservazione |
-| Step 2 | 🟢 Basso | Automatico, tool maturo |
-| Step 3 | 🟢 Basso | Automatico, solo analisi |
-| Step 4 | 🔴 Alto | Manuale — la qualità del MD determina la qualità del RAG |
-| Step 5 | 🟡 Medio | Logica adattiva, dipende dalla qualità del MD |
-| Step 6 | 🟢 Basso | Automatico, solo verifica |
-| Step 7 | 🟢 Basso | Installazione standard |
-| Step 8 | 🟢 Basso | Meccanico, lento ma affidabile |
-| Step 9 | 🟡 Medio | Qualità del prompt |
-| Step 10 | 🟢 Basso | Test automatici |
+| Conversione | 🟡 Medio | Automatica, ma il PDF deve essere digitale e non protetto |
+| Revisione Markdown | 🔴 Alto | Manuale — la qualità del MD determina la qualità del RAG |
+| Chunking | 🟡 Medio | Logica adattiva, dipende dalla qualità del MD |
+| Verifica chunk | 🟢 Basso | Automatica, solo verifica |
+| Ambiente Ollama | 🟢 Basso | Installazione standard |
+| Vettorizzazione | 🟢 Basso | Meccanica, lenta ma affidabile |
+| Interrogazione | 🟡 Medio | Qualità del prompt e dei parametri in `config.py` |
 
 ---
 
@@ -72,54 +67,38 @@ rag-from-scratch/
 ├── sources/                        # PDF originali — non modificare mai
 │   └── documento.pdf
 │
-├── step-0/
-│   └── check_pdf.py                # Verifica requisiti del PDF
-│
-├── step-1/
-│   └── inspect_pdf.py              # Ispezione automatica del PDF
-│
-├── step-2/
-│   ├── convert_pdf.py              # Conversione PDF → Markdown grezzo
+├── conversione/                    # PDF → Markdown strutturato
+│   ├── pipeline.py                 # Conversione PDF → clean.md
+│   ├── validate.py                 # Validazione batch di tutti gli stem
 │   └── <stem>/
-│       └── raw.md                  # MD grezzo (non toccare)
+│       ├── raw.md                  # MD grezzo (non toccare)
+│       ├── clean.md                # MD pulito — copia di lavoro
+│       └── report.json             # Metriche qualità conversione
 │
-├── step-3/
-│   ├── detect_structure.py         # Rilevamento struttura MD
+├── step-5/                         # Chunking adattivo
+│   ├── chunker.py
 │   └── <stem>/
-│       └── structure_profile.json  # Profilo struttura
+│       └── chunks.json
 │
-├── step-4/
-│   ├── revise.py                   # Pre-processing automatico MD
-│   ├── revision_log.md             # Log modifiche manuali
-│   └── <stem>/
-│       ├── clean.md                # MD revisionato
-│       └── structure_profile.json  # Profilo aggiornato
-│
-├── step-5/
-│   ├── chunker.py                  # Chunking adattivo
-│   └── <stem>/
-│       └── chunks.json             # Chunk pronti per la vettorizzazione
-│
-├── step-6/
-│   ├── verify_chunks.py            # Verifica chunk
-│   ├── fix_chunks.py               # Fix chunk problematici
+├── step-6/                         # Verifica e fix chunk
+│   ├── verify_chunks.py
+│   ├── fix_chunks.py
 │   └── <stem>/
 │       └── chunks.json             # Chunk verificati
 │
-├── step-7/
-│   ├── check_env.py                # Verifica ambiente locale
-│   └── README.md                   # Guida installazione Ollama e dipendenze
-│
-├── step-8/
-│   └── ingest.py                   # Vettorizzazione → ChromaDB
-│
-├── step-9/
-│   ├── config.py                   # Configurazione pipeline RAG ← modifica qui
-│   ├── rag.py                      # Pipeline RAG interattiva
-│   ├── test_ollama.py              # Test chat Ollama senza RAG
+├── step-8/                         # Vettorizzazione → ChromaDB
+│   ├── ingest.py
 │   └── README.md
 │
-├── chroma_db/                      # Vector store — generato da step-8
+├── ollama/                         # Ambiente Ollama
+│   ├── check_env.py                # Verifica prerequisiti
+│   ├── test_ollama.py              # Test chat senza RAG
+│   └── README.md
+│
+├── chroma_db/                      # Vector store — generato da ingest.py
+├── config.py                       # Configurazione pipeline RAG ← modifica qui
+├── rag.py                          # Pipeline RAG interattiva
+├── retrieve.py                     # Retrieval puro (senza LLM)
 ├── requirements.txt
 ├── .gitignore
 └── README.md
@@ -127,305 +106,67 @@ rag-from-scratch/
 
 ---
 
-## Gli step
+## Pipeline
 
 ---
 
-### Step 0 — Scegli il PDF
-
-**Tipo:** prerequisito manuale  
-**Input:** nessuno  
-**Output:** un PDF adatto al sistema
-
-Il PDF deve soddisfare requisiti minimi prima di qualsiasi elaborazione.
-Un PDF inadatto rende tutto il lavoro successivo inutile.
-
-**Criteri obbligatori:**
-
-- Il testo è selezionabile nel PDF reader — se non riesci a copiare una parola,
-  pdfplumber non la leggerà
-- Non è protetto da password
-- È generato digitalmente, non scansionato — una foto di un libro non è un PDF di testo
-- Il contenuto importante è nel testo, non nelle immagini
-
-**Criteri desiderabili:**
-
-- Ha una struttura logica riconoscibile: capitoli, sezioni, paragrafi
-- Le sezioni hanno titoli espliciti
-- Non ha layout a colonne multiple
-- È in una lingua sola o prevalentemente una
-
-**Come verificarlo:**
-Apri il PDF nel tuo reader, seleziona del testo da pagine diverse e copialo.
-Se il testo copiato è leggibile e nell'ordine giusto, il PDF è adatto.
-Se ottieni caratteri strani o testo nell'ordine sbagliato, il PDF ha problemi.
-
----
-
-### Step 1 — Ispezione automatica
+### Conversione
 
 **Tipo:** automatico  
-**Input:** tutti i PDF in `sources/`  
-**Output:** `step-1/<stem>_step1_report.txt`  
-**Script:** `step-1/inspect_pdf.py`
+**Input:** `sources/<stem>.pdf`  
+**Output:** `conversione/<stem>/raw.md` + `clean.md` + `report.json`  
+**Script:** `conversione/pipeline.py`
 
 ```bash
-python step-1/inspect_pdf.py
+# Singolo documento
+python conversione/pipeline.py --stem <nome>
+
+# Tutti i PDF in sources/
+python conversione/pipeline.py
+
+# Forza riesecuzione (sovrascrive output esistente)
+python conversione/pipeline.py --stem <nome> --force
 ```
 
-Lo script scansiona automaticamente tutti i PDF in `sources/`, analizza ogni documento pagina per pagina e produce un report.
-Serve per capire la qualità del documento e mappare i problemi
-prima di affrontare la revisione manuale.
+Converte il PDF in Markdown strutturato in quattro fasi automatiche: validazione, estrazione testo (algoritmo XY-Cut++ per layout multi-colonna), pulizia strutturale e analisi della struttura del documento.
 
-**Cosa rileva:**
+Produce tre file in `conversione/<stem>/`:
 
-- Testo non estraibile (pagine con sole immagini)
-- Sillabazioni a fine riga
-- Layout a colonne (righe molto corte e numerose)
-- Intestazioni e piè di pagina ripetitivi
-- Caratteri Unicode anomali
-- Pagine vuote
-
-**Output del report:**
-
-```
-Score: 87/100
-Pagine totali:       243
-Pagine con problemi:  12
-
-  Pagina 14: sillabazione rilevata (3 occorrenze)
-  Pagina 67: possibile layout a colonne
-  Pagina 201: caratteri Unicode anomali
-
-PROSSIMI PASSI:
-  → conversione con marker funzionerà bene
-  → attenzione alle pagine 14 e 67 nella revisione manuale
-```
-
-**Decisione:**
-
-| Score | Azione |
+| File | Descrizione |
 |---|---|
-| ≥ 70 | Procedi allo step 2 |
-| 40–70 | Procedi con cautela, revisione estesa necessaria |
-| < 40 | Valuta una fonte PDF migliore |
+| `raw.md` | Markdown grezzo estratto dal PDF — **non modificare mai** |
+| `clean.md` | Markdown pulito e strutturato — input per il chunker |
+| `report.json` | Metriche qualità, anomalie, strategia di chunking suggerita |
 
----
+**Requisiti aggiuntivi:** Java 11+ nel PATH (`opendataloader-pdf` lo richiede).
 
-### Step 2 — Conversione in Markdown grezzo
-
-**Tipo:** automatico  
-**Input:** tutti i PDF in `sources/` (o uno solo con `--pdf`)  
-**Output:** `step-2/<stem>/raw.md` + `step-2/<stem>/clean.md`  
-**Script:** `step-2/convert_pdf.py`
+**Validazione batch:**
 
 ```bash
-python step-2/convert_pdf.py                         # tutti i PDF in sources/
-python step-2/convert_pdf.py --pdf sources/doc.pdf   # un solo PDF
+python conversione/validate.py
 ```
 
-Converte il PDF in Markdown usando `pymupdf4llm`. Il risultato non è perfetto — è la base
-su cui lavorerai nello step 4.
+Mostra una tabella di stato per tutti gli stem convertiti. Vedi [`conversione/README.md`](conversione/README.md) per dettagli completi.
 
-Lo script crea due file:
-- `raw.md` — conversione grezza, **non modificare mai**. È il punto di partenza di riferimento.
-- `clean.md` — copia di lavoro che verrà modificata negli step successivi.
-
-**Cosa produce la conversione:**
-
-- Titoli riconosciuti e convertiti in `#` `##` `###`
-- Paragrafi separati da righe vuote
-- Sillabazione parzialmente risolta
-
-**Cosa non produce:**
-
-- Rimozione intestazioni e piè di pagina
-- Correzione completa del layout a colonne
-- Descrizione del contenuto delle immagini
+**PDF supportati:** digitali con testo selezionabile. Non supportati: scansionati (solo immagini) e protetti da password.
 
 ---
 
-### Step 3 — Rilevamento struttura
+### Revisione Markdown
 
-**Tipo:** automatico  
-**Input:** `step-2/<stem>/`  
-**Output:** `step-3/<stem>/structure_profile.json`  
-**Script:** `step-3/detect_structure.py`
+**Tipo:** semi-automatico  
+**Input:** `conversione/<stem>/clean.md`  
+**Output:** `conversione/<stem>/clean.md` corretto in-place
 
-```bash
-python step-3/detect_structure.py                    # tutti i documenti in step-2/
-python step-3/detect_structure.py --stem <nome>      # un solo documento
-python step-3/detect_structure.py --force            # riesegui anche se già presente
-```
-
-Copia `raw.md` e `clean.md` da `step-2/<stem>/` e analizza la struttura del Markdown senza modificarla.
-Il profilo prodotto guida sia la revisione manuale che il chunker.
-
-**I quattro livelli strutturali:**
-
-```
-Livello 3 — struttura ricca
-  Il documento ha ### con regolarità.
-  Ogni ### è un'unità semantica chiara.
-  Esempi: opere filosofiche, manuali tecnici, leggi.
-  Strategia chunking: boundary su ###
-
-Livello 2 — struttura parziale
-  Il documento ha ## ma pochi o nessun ###.
-  Le sezioni sono i capitoli, non le sottosezioni.
-  Esempi: articoli scientifici, report, saggi.
-  Strategia chunking: boundary su ##, split interno su paragrafi
-
-Livello 1 — solo paragrafi
-  Il documento non ha titoli significativi.
-  La struttura è data dalle righe vuote.
-  Esempi: testi narrativi, lettere, trascrizioni.
-  Strategia chunking: boundary su paragrafo
-
-Livello 0 — testo piatto
-  Un blocco continuo senza struttura riconoscibile.
-  Esempi: PDF mal convertiti, testi antichi.
-  Strategia chunking: sliding window su frasi
-```
-
-**Profilo prodotto:**
-
-```json
-{
-  "livello_struttura": 3,
-  "n_h1": 1,
-  "n_h2": 9,
-  "n_h3": 296,
-  "n_paragrafi": 312,
-  "boundary_primario": "h3",
-  "lingua_rilevata": "it",
-  "lunghezza_media_sezione": 420,
-  "strategia_chunking": "h3_aware",
-  "avvertenze": [
-    "14 sezioni sotto i 200 caratteri — verranno accorpate",
-    "8 sezioni sopra i 800 caratteri — verranno divise"
-  ]
-}
-```
-
----
-
-### Step 4 — Revisione manuale
-
-**Tipo:** manuale (con pre-processing automatico)  
-**Input:** `step-3/<stem>/clean.md` + `step-3/<stem>/structure_profile.json`  
-**Output:** `step-4/<stem>/clean.md` — MD revisionato  
-**Script:** `step-4/revise.py`
-
-> Questo è lo step più importante dell'intera pipeline.
-> La qualità del RAG dipende da questo step più di qualsiasi
+> Questo è il passaggio più importante dell'intera pipeline.
+> La qualità del RAG dipende da questo passaggio più di qualsiasi
 > parametro tecnico o scelta di modello.
 
-#### Pre-processing automatico
-
-Prima di qualsiasi revisione manuale, esegui lo script di revisione automatica:
-
-```bash
-python step-4/revise.py --stem documento
+```
+/prepare-md conversione/<stem>/clean.md
 ```
 
-Lo script applica le seguenti trasformazioni euristiche, valide per qualsiasi documento:
-
-| Trasformazione | Descrizione |
-|---|---|
-| Rimozione TOC | Righe che iniziano con `INDICE`, `INDEX`, `CONTENTS`, ecc. |
-| ALL-CAPS → `##` | Righe standalone in maiuscolo convertite in header section-case |
-| `N.  testo` → `### N.` | Sezioni numerate (con 1+ spazio dopo il punto) convertite in h3 |
-| Unione paragrafi | Blocchi spezzati da salti pagina PDF uniti automaticamente |
-| Whitespace | Spazi multipli normalizzati, righe vuote ridotte |
-
-Il profilo strutturale aggiornato viene salvato in `step-4/<stem>/structure_profile.json`.
-
-#### Revisione assistita da Claude Code
-
-Dopo il pre-processing, usa la skill integrata per una revisione qualitativa:
-
-```
-/step4-review documento
-```
-
-La skill analizza `step-4/<stem>/clean.md` e produce un report strutturato:
-
-```
-🔴 BLOCCANTI  — problemi che compromettono il chunking
-🟡 MINORI     — artefatti visibili ma non bloccanti
-🟢 OK         — categorie senza problemi
-```
-
-Poi propone le correzioni e le applica solo su tua approvazione.
-
-#### Revisione manuale (senza Claude Code)
-
-Se non usi Claude Code, esegui questi 6 check dal terminale.
-In tutti i comandi sostituisci `<stem>` con il nome reale del documento.
-
-**Check 1 — Sillabazione residua**
-Parole spezzate a fine riga con trattino (artefatto da PDF non risolto):
-```bash
-grep -n "\-$" step-4/<stem>/clean.md | head -20
-```
-Se trovi risultati: unisci la riga con la successiva eliminando il trattino
-e il ritorno a capo.
-
-**Check 2 — Righe orfane**
-Righe brevi (<60 char) isolate che sembrano numeri di pagina, autori, intestazioni:
-```bash
-grep -n "^[^#\-\*\|].\{1,59\}$" step-4/<stem>/clean.md | grep -v "^\s*$" | head -30
-```
-Per ogni riga: valuta se è testo legittimo (frase breve) o artefatto
-(numero di pagina, nome autore ripetuto, intestazione PDF). Gli artefatti vanno eliminati.
-
-**Check 3 — Frasi spezzate**
-Paragrafi che terminano senza punteggiatura di fine frase:
-```bash
-grep -n "[^.!?»)\]\'\"]$" step-4/<stem>/clean.md \
-  | grep -v "^[0-9]*:#" \
-  | grep -v "^[0-9]*:\s*$" \
-  | grep -v "^\s*[-\*]" \
-  | head -20
-```
-Segnala le righe brevi che finiscono a metà concetto. Uniscile alla riga successiva.
-
-**Check 4 — Header sospetti**
-```bash
-grep -n "^##\? " step-4/<stem>/clean.md | head -40
-```
-Verifica:
-- Header con testo >80 caratteri → probabilmente è testo normale, non un header
-- Header in MAIUSCOLO non convertito → cambia in formato sentence-case
-- Header duplicati (stesso testo due volte) → valuta se unire o rinominare
-- `###` senza un `##` padre → salto di gerarchia anomalo
-
-**Check 5 — Sezioni quasi vuote**
-```bash
-python3 -c "
-import re
-text = open('step-4/<stem>/clean.md').read()
-sections = re.split(r'^(#{1,3} .+)$', text, flags=re.MULTILINE)
-for i in range(1, len(sections)-1, 2):
-    header = sections[i].strip()
-    body = sections[i+1].strip() if i+1 < len(sections) else ''
-    if not body:
-        print(f'VUOTA: {header!r}')
-    elif len(body) < 80:
-        print(f'CORTA ({len(body)} char): {header!r} → {body[:60]!r}')
-"
-```
-Le sezioni vuote generano chunk inutili. Eliminale o accorpale alla sezione precedente.
-
-**Check 6 — Gerarchia strutturale**
-```bash
-grep -n "^#\{1,3\} " step-4/<stem>/clean.md | head -50
-```
-Deve esserci un solo `# h1` all'inizio. Poi `## h2` e opzionalmente `### h3`.
-Segnala `###` prima del primo `##`, o più di un `#`.
-
----
+La skill analizza il `clean.md` e corregge automaticamente i problemi che compromettono il chunking: sillabazione, artefatti, header malformati, paragrafi spezzati, gerarchia incoerente, sezioni vuote.
 
 **Struttura target dopo la revisione:**
 
@@ -441,36 +182,31 @@ Ogni paragrafo è semanticamente autonomo.
 Una riga vuota separa le sezioni.
 ```
 
-**Criterio di qualità:**
-Leggi ogni sezione ad alta voce. Se suona naturale è corretta.
-Se si interrompe c'è una riga spezzata. Se suona ripetitiva c'è un artefatto.
+**Criterio di qualità:** leggi ogni sezione ad alta voce. Se suona naturale è corretta. Se si interrompe c'è una riga spezzata. Se suona ripetitiva c'è un artefatto.
 
 ---
 
-### Step 5 — Chunking adattivo
+### Chunking
 
 **Tipo:** automatico  
-**Input:** `step-4/<stem>/clean.md` + `step-4/<stem>/structure_profile.json`  
+**Input:** `conversione/<stem>/clean.md`  
 **Output:** `step-5/<stem>/chunks.json`  
 **Script:** `step-5/chunker.py`
 
 ```bash
-python step-5/chunker.py --stem documento
+python step-5/chunker.py --stem <stem>
 ```
 
-Divide il Markdown pulito in chunk. Usa il profilo strutturale
-per scegliere la strategia giusta. Non sa nulla del contenuto —
-si basa solo sulla struttura.
+Divide il Markdown pulito in chunk. Usa il profilo strutturale da `report.json` per scegliere la strategia giusta. Non sa nulla del contenuto — si basa solo sulla struttura.
 
 **Regole invarianti per qualsiasi documento:**
 
 - Un chunk non attraversa mai il confine tra due sezioni diverse
 - Un chunk non spezza mai una frase a metà
 - Ogni chunk porta il suo contesto nel prefisso
-- L'overlap tra chunk avviene solo su frasi intere,
-  mai tra sezioni diverse
+- L'overlap tra chunk avviene solo su frasi intere, mai tra sezioni diverse
 
-**Parametri:**
+**Parametri (in `step-5/chunker.py`):**
 
 | Parametro | Default | Significato |
 |---|---|---|
@@ -491,95 +227,54 @@ si basa solo sulla struttura.
 }
 ```
 
-Il prefisso `[Sezione > Titolo]` è fondamentale: permette all'embedding
-di catturare il contesto topico del chunk anche quando il testo
-da solo sarebbe ambiguo.
+Il prefisso `[Sezione > Titolo]` è fondamentale: permette all'embedding di catturare il contesto topico del chunk anche quando il testo da solo sarebbe ambiguo.
 
 ---
 
-### Step 6 — Verifica e fix chunk
+### Verifica chunk
 
 **Tipo:** automatico  
 **Input:** `step-5/<stem>/chunks.json`  
 **Output:** `step-6/<stem>/chunks.json` verificato + `report.json`  
 **Script:** `step-6/verify_chunks.py`, `step-6/fix_chunks.py`
 
-Questo step si articola in un ciclo: verifica → fix automatico → ri-verifica. Non si va allo step 8 finché non ci sono 🔴.
+Questo passaggio si articola in un ciclo: verifica → fix automatico → ri-verifica. Non si procede alla vettorizzazione finché restano 🔴.
 
-**Workflow completo:**
+**Workflow:**
 
 ```
 1. Verifica
-   python step-6/verify_chunks.py --stem documento
+   python step-6/verify_chunks.py --stem <stem>
 
-2a. Se ✅ OK o solo 🟡 → vai allo step 8
+2a. Se ✅ OK o solo 🟡 → vai alla vettorizzazione
 
 2b. Se ci sono 🔴 → prova il fix automatico:
-   python step-6/fix_chunks.py --stem documento --dry-run   # anteprima
-   python step-6/fix_chunks.py --stem documento             # applica
+   python step-6/fix_chunks.py --stem <stem> --dry-run   # anteprima
+   python step-6/fix_chunks.py --stem <stem>             # applica
 
 3. Ri-verifica dopo il fix:
-   python step-6/verify_chunks.py --stem documento
+   python step-6/verify_chunks.py --stem <stem>
 
-4. Se rimangono 🔴 → torna allo step 4 e correggi clean.md,
+4. Se rimangono 🔴 → torna alla revisione Markdown e correggi clean.md,
    poi riesegui dall'inizio:
-   python step-5/chunker.py --stem documento --force
-   python step-6/verify_chunks.py --stem documento
+   python step-5/chunker.py --stem <stem> --force
+   python step-6/verify_chunks.py --stem <stem>
 ```
 
 > **Shortcut con Claude:** usa `/step6-fix <stem>` — esegue dry-run, spiega le operazioni, chiede conferma e ri-verifica automaticamente.
 
-#### Senza Claude Code — come leggere l'output e decidere
-
-**1. Leggi l'output di `verify_chunks.py`**
-
-L'output termina con una delle tre condizioni:
+**Output di `verify_chunks.py` — tre condizioni finali:**
 
 | Condizione | Significato | Cosa fare |
 |---|---|---|
-| `✅ N/N documenti senza problemi` | Nessun problema | Vai allo step 8 |
-| `🟡 Solo avvisi minori` | Chunk corti o lunghi, non bloccanti | Puoi andare allo step 8 oppure ottimizzare con `fix_chunks.py` |
+| `✅ N/N documenti senza problemi` | Nessun problema | Procedi |
+| `🟡 Solo avvisi minori` | Chunk corti o lunghi, non bloccanti | Puoi procedere o ottimizzare con `fix_chunks.py` |
 | `⚠️ 0/N documenti senza problemi` + 🔴 | Frasi spezzate o chunk vuoti | Esegui `fix_chunks.py`, poi ri-verifica |
 
-**2. Prima di applicare il fix: leggi il dry-run**
+**Cosa verifica:**
 
-```bash
-python step-6/fix_chunks.py --stem <stem> --dry-run
-```
-
-L'output elenca le operazioni pianificate. Significato:
-
-| Operazione | Cosa fa | Sicurezza |
-|---|---|---|
-| `fondi N chunk incompleti` | Unisce il chunk troncato col successivo | Sempre sicura |
-| `fondi N chunk troppo corti` | Unisce chunk <200 char col successivo | Sicura se il risultato non supera MAX×1.5 |
-| `spezza N chunk troppo lunghi` | Divide chunk >1200 char su frasi | Sicura solo se esistono frasi naturali dove spezzare |
-| `rimuovi N chunk vuoti` | Elimina chunk senza testo | Sempre sicura |
-
-**3. Se i 🔴 persistono dopo il fix**
-
-`fix_chunks.py` non riesce ad autocorreggersi quando il problema
-è nella struttura del testo sorgente. I casi tipici e la soluzione in `clean.md`:
-
-| Sintomo nel report | Causa in `clean.md` | Correzione |
-|---|---|---|
-| Chunk finisce con `:` | Intro di un elenco separata dall'elenco da una riga vuota | Rimuovi la riga vuota tra l'intro e la lista |
-| Chunk finisce a metà parola | Salto di pagina PDF con numero di pagina nel mezzo | Trova e rimuovi il numero di pagina, unisci le righe |
-| Chunk con testo artefatto (URL, watermark) | Artefatto non rimosso allo step 4 | Elimina la sezione in `clean.md` |
-| Chunk con frase enorme non spezzabile | Singolo paragrafo >MAX_CHARS senza frasi intermedie | Spezza manualmente il paragrafo su frasi logiche |
-
-Dopo ogni correzione in `clean.md` riesegui dall'inizio dello step 5:
-
-```bash
-python step-5/chunker.py --stem <stem> --force
-rm -f step-6/<stem>/chunks.json          # forza la rilettura da step-5
-python step-6/verify_chunks.py --stem <stem>
-```
-
-**Cosa verifica `verify_chunks.py`:**
-
-- Nessun chunk è sotto `MIN_CHARS` 🟡
-- Nessun chunk è sopra `MAX_CHARS × 1.5` 🟡
+- Nessun chunk sotto `MIN_CHARS` 🟡
+- Nessun chunk sopra `MAX_CHARS × 1.5` 🟡
 - Ogni chunk finisce con punteggiatura di fine frase 🔴
 
 **Cosa corregge `fix_chunks.py`:**
@@ -591,52 +286,39 @@ python step-6/verify_chunks.py --stem <stem>
 | Fondi chunk troppo corto col successivo | Chunk sotto `MIN_CHARS` |
 | Spezza chunk troppo lungo | Chunk sopra `MAX_CHARS × 1.5` |
 
-**Tabella diagnosi — problemi non risolvibili con fix_chunks:**
+**Se i 🔴 persistono dopo il fix** — i casi tipici e la soluzione in `clean.md`:
 
-| Sintomo | Causa probabile | Soluzione |
+| Sintomo nel report | Causa in `clean.md` | Correzione |
 |---|---|---|
-| Molti chunk corti dopo il fix | `MIN_CHARS` troppo alto o testo frammentato nel MD | Abbassa `MIN_CHARS` o correggi step 4 |
-| Chunk spezzato creato dal fix stesso | Frase singola > `MAX_CHARS` non spezzabile | Spezza manualmente il paragrafo in step 4 |
-| Chunk che finisce a metà frase non risolvibile | Salto di pagina PDF non sanato nel MD | Correggi la riga spezzata in `clean.md` |
-
-**Output se tutto ok:**
-
-```
-Totale chunk:     301
-✅ OK:            301
-
-Distribuzione lunghezze:
-  Min:    187 char
-  Max:    923 char
-  Media:  401 char
-
-✅ 1/1 documenti senza problemi
-```
+| Chunk finisce con `:` | Intro di un elenco separata dall'elenco da una riga vuota | Rimuovi la riga vuota tra l'intro e la lista |
+| Chunk finisce a metà parola | Numero di pagina nel mezzo del testo | Trova e rimuovi il numero di pagina, unisci le righe |
+| Chunk con testo artefatto | Artefatto non rimosso nella revisione | Elimina la sezione in `clean.md` |
+| Chunk con frase enorme non spezzabile | Paragrafo >MAX_CHARS senza frasi intermedie | Spezza manualmente il paragrafo |
 
 ---
 
-### Step 7 — Installazione ambiente
+### Ambiente Ollama
 
 **Tipo:** manuale (una volta sola)  
 **Input:** nessuno  
 **Output:** ambiente locale funzionante  
-**Script:** `step-7/check_env.py`
+**Script:** `ollama/check_env.py`
 
-Installa Ollama, scarica i modelli e verifica l'ambiente. Si esegue una volta sola.
+Installa Ollama, scarica i modelli e verifica l'ambiente. Si esegue una volta sola prima della vettorizzazione.
 
-Vedi [`step-7/README.md`](step-7/README.md) per istruzioni dettagliate e scelta dei modelli.
+Vedi [`ollama/README.md`](ollama/README.md) per istruzioni dettagliate e scelta dei modelli.
 
 ```bash
-python step-7/check_env.py
+python ollama/check_env.py
 ```
 
 ---
 
-### Step 8 — Vettorizzazione
+### Vettorizzazione
 
 **Tipo:** automatico (lento)  
 **Input:** `step-6/<stem>/chunks.json`  
-**Output:** `chroma_db/` popolato  
+**Output:** `chroma_db/<stem>` popolato  
 **Script:** `step-8/ingest.py`
 
 ```bash
@@ -653,125 +335,126 @@ Per 900 chunk aspetta circa 15 minuti.
 | Argomento | Descrizione |
 |---|---|
 | `--stem <nome>` | Processa un singolo documento. Senza questo argomento processa tutti gli stem trovati in `step-6/` |
-| `--force` | Cancella e ricrea la collection se esiste già. Senza `--force`, se la collection è presente lo step viene saltato |
+| `--force` | Cancella e ricrea la collection se esiste già |
 
 **Quando usare `--force`:**
-Se hai modificato i chunk (es. hai rieseguito step-6 dopo correzioni), la collection in ChromaDB
-contiene ancora i vecchi vettori. `--force` la cancella e la ricrea da zero con i chunk aggiornati.
+Se hai modificato i chunk o cambiato `EMBED_MODEL` in `config.py`, la collection in ChromaDB contiene i vecchi vettori. `--force` la cancella e ricrea da zero.
 
 **Cosa succede per ogni chunk:**
 
 ```
 testo del chunk
     │
-    ▼  Ollama (nomic-embed-text)
-vettore di 768 numeri
-[0.23, -0.41, 0.87, 0.12, ...]
+    ▼  Ollama (EMBED_MODEL)
+vettore N-dim
     │
     ▼  ChromaDB
 salva: testo + vettore + metadati (sezione, titolo, sub_index)
 ```
 
-**Perché 768 numeri:**
-Ogni numero rappresenta una dimensione semantica.
-Testi con significato simile producono vettori simili —
-i loro numeri sono vicini nello spazio a 768 dimensioni.
-Questo è ciò che permette il retrieval semantico.
-
-**Output durante l'esecuzione:**
-
-```
-✅ Ollama OK — nomic-embed-text disponibile
-
-📦 872 chunk da ingestire
-
-  [  1/872] ✓ sezione_1__sotto_1__s0                ETA: 870s
-  [  2/872] ✓ sezione_1__sotto_2__s0                ETA: 867s
-  ...
-  [872/872] ✓ sezione_9__sotto_42__s0               ETA: 0s
-
-✅ Ingestione completata in 718s — 872/872 chunk salvati
-   Collection 'nietzsche' in chroma_db/
-```
-
-`chroma_db/` contiene ora tutti i vettori su disco.
-Non è necessario ripetere questo step a meno che il documento cambi.
+Vedi [`step-8/README.md`](step-8/README.md) per la scelta del modello di embedding e le regole di coerenza con la fase di interrogazione.
 
 ---
 
-### Step 9 — Pipeline RAG
+### Interrogazione
 
 **Tipo:** interattivo  
-**Input:** `chroma_db/` + domanda dell'utente  
-**Output:** risposta basata sul documento  
-**Script:** `step-9/rag.py`
+**Input:** `chroma_db/<stem>` + domanda dell'utente  
+**Output:** risposta basata sul documento
+
+Due modalità:
+
+| Script | Modalità | Quando usarlo |
+|---|---|---|
+| `rag.py` | Retrieval + generazione LLM | Risposta in linguaggio naturale |
+| `retrieve.py` | Solo retrieval (no LLM) | Debug, verifica chunk, ricerca semantica |
+
+#### rag.py — Risposta in linguaggio naturale
 
 ```bash
 source .venv/bin/activate
-python step-9/rag.py --stem <nome>
+python rag.py --stem <nome>
 ```
 
-Loop interattivo che risponde a domande sul documento. Configura i parametri in `step-9/config.py` prima di avviare.
+| Sintassi | Comportamento |
+|---|---|
+| `<testo>` | Risposta basata sul documento |
+| `<testo> -v` | Risposta + chunk recuperati con score di similarità |
+| `exit` | Esce dal programma |
 
-Vedi [`step-9/README.md`](step-9/README.md) per la configurazione completa.
+Flusso interno:
 
----
+```
+domanda
+    │
+    ▼  embed (EMBED_MODEL, Ollama)
+vettore N-dim
+    │
+    ▼  query ChromaDB — similarità coseno, top-K
+chunk rilevanti
+    │
+    ▼  build_prompt (SYSTEM_PROMPT + contesti + domanda)
+    │
+    ▼  generate (OLLAMA_MODEL, Ollama)
+risposta
+```
 
-### Step 10 — Test automatici
-
-**Tipo:** automatico  
-**Input:** sistema completo  
-**Output:** tutti i test verdi  
-**Script:** `step-10/test_pipeline.py` *(da implementare)*
+#### retrieve.py — Retrieval puro (senza LLM)
 
 ```bash
-python step-10/test_pipeline.py --stem <nome>
+source .venv/bin/activate
+python retrieve.py --stem <nome>
 ```
 
-Verifica ogni componente in isolamento e poi nel sistema completo.
-I test non dipendono dal contenuto del documento — usano dati
-fittizi creati e distrutti in memoria.
+Vettorizza la query e restituisce i chunk più simili con score di similarità — senza chiamare Ollama per la generazione. Utile per verificare la qualità del retrieval e diagnosticare risposte sbagliate.
 
-**Struttura dei test:**
+| Sintassi | Comportamento |
+|---|---|
+| `<testo>` | Chunk più simili con score (testo troncato a 200 car.) |
+| `<testo> -f` | Chunk più simili con testo completo |
+| `exit` | Esce dal programma |
 
+Accetta `--top-k N` per sovrascrivere il valore di `config.py` per quella sessione.
+
+#### Configurazione (`config.py`)
+
+| Parametro | Default | Descrizione |
+|---|---|---|
+| `TOP_K` | `6` | Chunk recuperati per ogni domanda. Valori consigliati: `3`–`10` |
+| `TEMPERATURE` | `0.0` | Deterministico a `0.0`, creativo verso `1.0`. Per RAG consigliato `0.0` |
+| `NO_THINK` | `True` | Disabilita il chain-of-thought interno dei modelli Qwen3/Qwen3.5. `True` = risposta diretta, più veloce |
+| `EMBED_MODEL` | `"nomic-embed-text"` | Deve corrispondere al modello usato in `ingest.py`. Se cambiato, rieseguire `step-8/ingest.py` con `--force` |
+| `OLLAMA_URL` | `"http://localhost:11434"` | Modifica solo se Ollama gira su porta o host diversi |
+| `OLLAMA_MODEL` | `"qwen3.5:0.8b"` | Modello LLM. Vedi [`ollama/README.md`](ollama/README.md) per la scelta |
+| `SYSTEM_PROMPT` | *(vedi file)* | Istruzioni di comportamento inviate al LLM. Modifica per cambiare tono, lingua o condizione di fallback |
+
+#### Test senza RAG
+
+Per verificare che Ollama risponda correttamente prima di interrogare il documento:
+
+```bash
+python ollama/test_ollama.py
 ```
-Test unitari — ogni componente isolato
-  ✓ split_sentences non spezza le frasi
-  ✓ parse_markdown rileva la struttura corretta
-  ✓ chunk_sezione rispetta i boundary
-  ✓ il prefisso è sempre presente in ogni chunk
 
-Test integrazione — i componenti parlano tra loro
-  ✓ Ollama è raggiungibile
-  ✓ i modelli sono disponibili
-  ✓ l'embedding produce 768 dimensioni
-  ✓ testi diversi producono vettori diversi
-  ✓ ChromaDB scrive e legge correttamente
-
-Test qualità — il sistema si comporta bene
-  ✓ il retrieval trova il chunk pertinente
-  ✓ il retrieval non trova il chunk non pertinente
-  ✓ il LLM usa il contesto fornito
-  ✓ il LLM ammette quando la risposta non è nel contesto
-```
+Chat diretta con il modello, senza ChromaDB. Usa gli stessi parametri di `config.py`.
 
 ---
 
 ## Principi di progettazione
 
 **Atomico**
-Ogni step fa una cosa sola. Il chunker non sa niente di Ollama.
+Ogni fase fa una cosa sola. Il chunker non sa niente di Ollama.
 L'ingestione non sa niente del MD originale.
 Se un pezzo si rompe, sai esattamente dove.
 
 **Verificabile**
-Ogni step ha un criterio di completamento oggettivo.
-Non si passa allo step successivo finché il precedente non è verificato.
+Ogni fase ha un criterio di completamento oggettivo.
+Non si passa alla fase successiva finché la precedente non è verificata.
 
 **Reversibile**
-Puoi tornare indietro senza perdere il lavoro degli altri step.
-Cambi il MD? Riesegui solo step 5 e 8.
-Cambi i parametri del chunker? Riesegui solo step 5 e 8.
+Puoi tornare indietro senza perdere il lavoro delle altre fasi.
+Cambi il MD? Riesegui solo chunking e vettorizzazione.
+Cambi i parametri del chunker? Riesegui solo chunking e vettorizzazione.
 Non si riparte mai da zero.
 
 **Senza assunzioni**

From e4dc0856bb11f066aa678444e68f6800ae424c1d Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Fri, 17 Apr 2026 18:52:13 +0200
Subject: [PATCH 20/22] refactor: pulizia files

---
 step-7/README.md      | 147 ------------------------
 step-7/check_env.py   | 208 ----------------------------------
 step-9/README.md      | 109 ------------------
 step-9/config.py      |  54 ---------
 step-9/rag.py         | 252 ------------------------------------------
 step-9/retrieve.py    | 217 ------------------------------------
 step-9/test_ollama.py |  66 -----------
 7 files changed, 1053 deletions(-)
 delete mode 100644 step-7/README.md
 delete mode 100644 step-7/check_env.py
 delete mode 100644 step-9/README.md
 delete mode 100644 step-9/config.py
 delete mode 100644 step-9/rag.py
 delete mode 100644 step-9/retrieve.py
 delete mode 100644 step-9/test_ollama.py

diff --git a/step-7/README.md b/step-7/README.md
deleted file mode 100644
index c5a5b72..0000000
--- a/step-7/README.md
+++ /dev/null
@@ -1,147 +0,0 @@
-# Step 7 — Verifica ambiente
-
-Prima di procedere con la vettorizzazione (step 8) devi avere installato:
-
-- **Ollama** — server locale per LLM e embedding
-- un **modello di embedding** (es. `nomic-embed-text`, `bge-m3`)
-- un **modello LLM** (es. `qwen3.5:4b`, `qwen3:4b`)
-- **chromadb** — libreria Python per il vector store
-
----
-
-## 1. Installa Ollama
-
-```bash
-curl -fsSL https://ollama.com/install.sh | sh
-```
-
-Verifica che il servizio sia attivo:
-
-```bash
-ollama list
-```
-
-### Disinstalla Ollama
-
-```bash
-# Ferma e rimuovi il servizio systemd
-sudo systemctl stop ollama
-sudo systemctl disable ollama
-sudo rm /etc/systemd/system/ollama.service
-sudo systemctl daemon-reload
-
-# Rimuovi il binario
-sudo rm /usr/local/bin/ollama
-
-# Rimuovi modelli e dati (opzionale — occupa spazio su disco)
-# I modelli sono salvati sotto l'utente di sistema "ollama", non nella tua home
-sudo rm -rf /usr/share/ollama
-
-# Rimuovi l'utente e il gruppo di sistema creati dall'installer (opzionale)
-sudo userdel ollama
-sudo groupdel ollama
-```
-
----
-
-## 2. Scarica i modelli
-
-### Modello di embedding
-
-Per testi in italiano serve un modello multilingue — i modelli English-first producono embeddings di qualità inferiore su lingue diverse dall'inglese, con retrieval meno preciso.
-
-Prima scelta consigliata:
-
-```bash
-ollama pull qwen3-embedding:0.6b
-```
-
-Stessa famiglia del LLM in uso (`qwen3.5`), multilingue, recente, gira comodamente in CPU.
-
-| Modello | Dim | Dimensione | Lingue | Consigliato |
-|---|---|---|---|---|
-| `qwen3-embedding:0.6b` | 1024 | ~522 MB | multilingue | ✅ prima scelta |
-| `nomic-embed-text-v2-moe` | 768 | ~523 MB | multilingue | ✅ seconda scelta |
-| `bge-m3` | 1024 | ~1.2 GB | 100+ lingue incl. IT | ✅ terza scelta |
-| `nomic-embed-text` | 768 | ~274 MB | principalmente EN | ⚠️ default corrente |
-| `mxbai-embed-large` | 1024 | ~670 MB | principalmente EN | ❌ |
-| `paraphrase-multilingual` | 768 | ~278 MB | multilingue | ❌ obsoleto |
-| `all-minilm` | 384 | ~46 MB | principalmente EN | ❌ troppo piccolo |
-
-Se cambi modello rispetto a quello usato in step-8, devi rieseguire la vettorizzazione con `--force` e aggiornare `EMBED_MODEL` in `step-9/config.py`.
-
-### Modello LLM
-
-Per RAG su testi italiani servono: buon instruction following, supporto multilingue e context window ampia (i prompt RAG includono più chunk).
-
-Prima scelta consigliata per 8 GB RAM:
-
-```bash
-ollama pull qwen3.5:4b
-```
-
-Il progetto è pensato per la famiglia **Qwen3.5** — stessa famiglia dell'embedding consigliato (`qwen3-embedding`), context window 256K, ottimo italiano. Altri modelli sono compatibili ma non testati.
-
-| Modello | RAM | Note |
-|---|---|---|
-| `qwen3.5:0.8b` | ≥ 1 GB | minimo assoluto |
-| `qwen3.5:2b` | ≥ 3 GB | leggero |
-| `qwen3.5:4b` | ≥ 5 GB | **consigliato per 8 GB** |
-| `qwen3.5:9b` | ≥ 8 GB | lento su CPU, meglio con GPU |
-
-Se usi un modello diverso da `qwen3.5:4b`, aggiorna `OLLAMA_MODEL` in `step-9/config.py`.
-
-### Disinstalla un modello
-
-```bash
-ollama rm qwen3.5:4b
-ollama rm nomic-embed-text
-```
-
-Per vedere tutti i modelli installati:
-
-```bash
-ollama list
-```
-
----
-
-## 3. Installa le dipendenze nel venv
-
-Assicurati di avere `chromadb` installato nel `.venv`:
-
-```bash
-source .venv/bin/activate
-pip install -r requirements.txt
-```
-
----
-
-## 4. Verifica tutto
-
-```bash
-source .venv/bin/activate
-python step-7/check_env.py
-```
-
-Output atteso se tutto è a posto:
-
-```
-✅ ollama trovato nel PATH
-✅ ollama risponde correttamente
-✅ modello embedding trovato: nomic-embed-text:latest
-✅ modello LLM trovato: qwen3.5:4b
-
-✅ chromadb importabile
-
-✅ Ambiente pronto — procedi con la vettorizzazione:
-   python step-8/ingest.py --stem <nome>
-```
-
----
-
-## Prossimo step
-
-```bash
-python step-8/ingest.py --stem <nome>
-```
diff --git a/step-7/check_env.py b/step-7/check_env.py
deleted file mode 100644
index 071c48d..0000000
--- a/step-7/check_env.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 7 — Verifica ambiente
-
-Controlla che tutti i prerequisiti per la vettorizzazione siano soddisfatti:
-  1. ollama è nel PATH e risponde
-  2. Almeno un modello di embedding è scaricato
-  3. Almeno un modello LLM è scaricato
-  4. chromadb è importabile
-
-Output: report a schermo con ✅ / ❌ per ogni componente.
-Nessun file scritto. Exit 0 se tutto OK, 1 altrimenti.
-
-Uso:
-    python step-7/check_env.py
-"""
-
-import shutil
-import subprocess
-import sys
-from pathlib import Path
-
-
-# ─── Lista canonica di modelli embedding supportati ───────────────────────────
-# Ordine: prima scelta → ultima scelta (come da README step-7)
-EMBED_MODELS = [
-    "qwen3-embedding",
-    "nomic-embed-text-v2-moe",
-    "bge-m3",
-    "nomic-embed-text",
-    "mxbai-embed-large",
-    "paraphrase-multilingual",
-    "all-minilm",
-]
-
-
-def _is_embed(model_name: str) -> bool:
-    """True se il modello è riconosciuto come embedding (lista canonica o keyword)."""
-    base = model_name.split(":")[0].lower()
-    return any(base == e or base.startswith(e) for e in EMBED_MODELS) or "embed" in base
-
-
-# ─── Modelli configurati in step-9/config.py ─────────────────────────────────
-# Per spostare config.py alla root: cambia solo la riga qui sotto.
-sys.path.insert(0, str(Path(__file__).parent.parent / "step-9"))
-try:
-    from config import EMBED_MODEL as CONFIGURED_EMBED, OLLAMA_MODEL as CONFIGURED_LLM
-except Exception:
-    CONFIGURED_EMBED = None
-    CONFIGURED_LLM = None
-
-REQUIRED_LIBS = ["chromadb"]
-
-
-# ─── Checks ───────────────────────────────────────────────────────────────────
-
-def check_ollama_in_path() -> bool:
-    """Verifica che ollama sia nel PATH."""
-    found = shutil.which("ollama") is not None
-    if found:
-        print("✅ ollama trovato nel PATH")
-    else:
-        print("❌ ollama non trovato nel PATH")
-        print("   → Installa con: curl -fsSL https://ollama.com/install.sh | sh")
-    return found
-
-
-def check_ollama_running() -> list[str] | None:
-    """
-    Esegue 'ollama list' e ritorna la lista dei modelli disponibili.
-    Ritorna None se ollama non risponde.
-    """
-    try:
-        result = subprocess.run(
-            ["ollama", "list"],
-            capture_output=True, text=True, timeout=10
-        )
-        if result.returncode != 0:
-            print("❌ ollama non risponde (errore all'avvio)")
-            print("   → Avvia il servizio con: ollama serve")
-            return None
-        lines = result.stdout.strip().splitlines()
-        models = []
-        for line in lines[1:]:  # salta l'header
-            parts = line.split()
-            if parts:
-                models.append(parts[0])
-        print("✅ ollama risponde correttamente")
-        return models
-    except FileNotFoundError:
-        print("❌ ollama non trovato (FileNotFoundError)")
-        return None
-    except subprocess.TimeoutExpired:
-        print("❌ ollama non risponde (timeout)")
-        print("   → Avvia il servizio con: ollama serve")
-        return None
-
-
-def _match(model_name: str, available: list[str]) -> str | None:
-    """
-    Ritorna il nome completo del modello trovato in 'available' che corrisponde
-    a 'model_name' (confronto per prefisso), oppure None.
-    """
-    for m in available:
-        if m == model_name or m.startswith(model_name + ":") or m.startswith(model_name + "-"):
-            return m
-    return None
-
-
-def check_embed_model(available: list[str]) -> bool:
-    """Verifica che il modello di embedding configurato sia disponibile."""
-    if CONFIGURED_EMBED:
-        print(f"   modello configurato (step-9/config.py): {CONFIGURED_EMBED}")
-        found = _match(CONFIGURED_EMBED, available)
-        if found:
-            print(f"✅ embedding disponibile: {found}")
-            return True
-        print(f"❌ {CONFIGURED_EMBED} non trovato in Ollama")
-        print(f"   → ollama pull {CONFIGURED_EMBED}")
-        return False
-    # fallback: config.py non leggibile
-    found = next((m for m in available if _is_embed(m)), None)
-    if found:
-        print(f"✅ modello embedding trovato: {found}")
-        return True
-    print("❌ nessun modello di embedding trovato")
-    print(f"   → Prima scelta: ollama pull qwen3-embedding:0.6b")
-    return False
-
-
-def check_llm_model(available: list[str]) -> bool:
-    """Verifica che il modello LLM configurato sia disponibile."""
-    if CONFIGURED_LLM:
-        print(f"   modello configurato (step-9/config.py): {CONFIGURED_LLM}")
-        found = _match(CONFIGURED_LLM, available)
-        if found:
-            print(f"✅ LLM disponibile: {found}")
-            return True
-        print(f"❌ {CONFIGURED_LLM} non trovato in Ollama")
-        print(f"   → ollama pull {CONFIGURED_LLM}")
-        return False
-    # fallback: config.py non leggibile
-    llm_candidates = [m for m in available if not _is_embed(m)]
-    if llm_candidates:
-        print(f"✅ modello LLM trovato: {llm_candidates[0]}")
-        return True
-    print("❌ nessun modello LLM trovato")
-    print(f"   → Consigliato per 8 GB RAM: ollama pull qwen3.5:4b")
-    return False
-
-
-def check_library(lib: str) -> bool:
-    """Verifica che una libreria Python sia importabile."""
-    try:
-        __import__(lib)
-        print(f"✅ {lib} importabile")
-        return True
-    except ImportError:
-        print(f"❌ {lib} non importabile")
-        print(f"   → Installa con: pip install {lib}")
-        return False
-
-
-# ─── Entry point ──────────────────────────────────────────────────────────────
-
-def main() -> int:
-    print("─── Step 7 — Verifica ambiente ───────────────────────────────────────\n")
-
-    results: list[bool] = []
-
-    # 1. ollama nel PATH
-    in_path = check_ollama_in_path()
-    results.append(in_path)
-
-    # 2. ollama risponde + modelli
-    if in_path:
-        available = check_ollama_running()
-        if available is None:
-            results.extend([False, False, False])
-        else:
-            results.append(True)
-            results.append(check_embed_model(available))
-            results.append(check_llm_model(available))
-    else:
-        results.extend([False, False, False])
-        print("⚠️  modelli non verificabili (ollama non trovato)")
-
-    # 3. Librerie Python
-    print()
-    for lib in REQUIRED_LIBS:
-        results.append(check_library(lib))
-
-    # ── Riepilogo ─────────────────────────────────────────────────────────────
-    print()
-    print("──────────────────────────────────────────────────────────────────────")
-    all_ok = all(results)
-    if all_ok:
-        print("✅ Ambiente pronto — procedi con la vettorizzazione:")
-        print("   python step-8/ingest.py --stem <nome>")
-    else:
-        n_fail = sum(1 for r in results if not r)
-        print(f"⚠️  {n_fail} problema/i rilevato/i — risolvi prima di procedere con step-8.")
-
-    return 0 if all_ok else 1
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/step-9/README.md b/step-9/README.md
deleted file mode 100644
index 48fd47d..0000000
--- a/step-9/README.md
+++ /dev/null
@@ -1,109 +0,0 @@
-# Step 9 — Interrogazione del documento
-
-Due modalità di interrogazione, entrambe con loop interattivo:
-
-| Script | Modalità | Quando usarlo |
-|---|---|---|
-| `rag.py` | Retrieval + generazione LLM | Risposta in linguaggio naturale |
-| `retrieve.py` | Solo retrieval (no LLM) | Debug, verifica chunk, ricerca semantica |
-
----
-
-## Prerequisiti
-
-- Step 8 completato (`chroma_db/` popolata)
-- Ollama attivo con il modello di embedding scaricato
-- Per `rag.py`: anche il modello LLM scaricato
-
----
-
-## rag.py — Risposta in linguaggio naturale
-
-```bash
-source .venv/bin/activate
-python step-9/rag.py --stem <nome>
-```
-
-Per ogni domanda: vettorizza la query, recupera i chunk più rilevanti da ChromaDB e genera la risposta tramite Ollama.
-
-```
-── Loop RAG ─────────────────────────────────────── (exit per uscire)
-
-Domanda:
-```
-
-| Sintassi | Comportamento |
-|---|---|
-| `<testo>` | Risposta basata sul documento |
-| `<testo> -v` | Risposta + chunk recuperati con score di similarità |
-| `exit` | Esce dal programma |
-
-Flusso interno:
-
-```
-domanda
-    │
-    ▼  embed (EMBED_MODEL, Ollama)
-vettore N-dim
-    │
-    ▼  query ChromaDB — similarità coseno, top-K
-chunk rilevanti
-    │
-    ▼  build_prompt (SYSTEM_PROMPT + contesti + domanda)
-    │
-    ▼  generate (OLLAMA_MODEL, Ollama)
-risposta
-```
-
-Il LLM risponde esclusivamente dal contesto fornito. Se il contesto è irrilevante rispetto alla domanda, lo dichiara esplicitamente.
-
----
-
-## retrieve.py — Retrieval puro (senza LLM)
-
-```bash
-source .venv/bin/activate
-python step-9/retrieve.py --stem <nome>
-```
-
-Vettorizza la query e restituisce i chunk più simili con score di similarità — senza chiamare Ollama per la generation. Utile per verificare la qualità del retrieval e diagnosticare risposte sbagliate.
-
-```
-── Loop retrieval ──────────────────────── (exit per uscire, -f per testo completo)
-
-Query:
-```
-
-| Sintassi | Comportamento |
-|---|---|
-| `<testo>` | Chunk più simili con score di similarità (testo troncato a 200 car.) |
-| `<testo> -f` | Chunk più simili con testo completo |
-| `exit` | Esce dal programma |
-
-Accetta `--top-k N` per sovrascrivere il valore di `config.py` per quella sessione.
-
----
-
-## Configurazione (`config.py`)
-
-| Parametro | Default | Descrizione |
-|---|---|---|
-| `TOP_K` | `6` | Chunk recuperati per ogni domanda. Valori consigliati: `3`–`10` |
-| `TEMPERATURE` | `0.0` | Deterministico a `0.0`, creativo verso `1.0`. Per RAG consigliato `0.0` |
-| `NO_THINK` | `True` | Disabilita il chain-of-thought interno dei modelli Qwen3/Qwen3.5. `True` = risposta diretta, più veloce |
-| `EMBED_MODEL` | `"nomic-embed-text"` | Deve corrispondere al modello usato in step-8. Se cambiato, rieseguire step-8 con `--force` |
-| `OLLAMA_URL` | `"http://localhost:11434"` | Modifica solo se Ollama gira su porta o host diversi |
-| `OLLAMA_MODEL` | `"qwen3.5:0.8b"` | Modello LLM. Vedi `step-7/README.md` per la scelta |
-| `SYSTEM_PROMPT` | *(vedi file)* | Istruzioni di comportamento inviate al LLM. Modifica per cambiare tono, lingua o condizione di fallback |
-
----
-
-## Test senza RAG
-
-Per verificare che Ollama risponda correttamente prima di interrogare il documento:
-
-```bash
-python step-9/test_ollama.py
-```
-
-Chat diretta con il modello, senza ChromaDB. Usa gli stessi parametri di `config.py`.
diff --git a/step-9/config.py b/step-9/config.py
deleted file mode 100644
index 8d83cd9..0000000
--- a/step-9/config.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# ─── Step 9 — Configurazione RAG ─────────────────────────────────────────────
-#
-# Modifica questo file per cambiare i parametri della pipeline.
-#
-# Uso:
-#   python step-9/rag.py --stem nietzsche
-# ──────────────────────────────────────────────────────────────────────────────
-
-# ── Retrieval ─────────────────────────────────────────────────────────────────
-
-# Numero di chunk da recuperare per ogni domanda.
-# Valori più alti = più contesto, risposte potenzialmente più complete,
-# ma prompt più lunghi e generazione più lenta.
-TOP_K = 6
-
-# ── Generazione ───────────────────────────────────────────────────────────────
-
-# Temperatura del modello LLM.
-# 0.0 = completamente deterministico (stessa risposta ad ogni run)
-# 0.7 = più creativo e vario
-TEMPERATURE = 0.0
-
-# Disabilita il "thinking" (ragionamento interno) nei modelli Qwen3/Qwen3.5.
-# True  = risposta diretta, più veloce
-# False = ragionamento interno abilitato (più lento ma potenzialmente più accurato)
-NO_THINK = True
-
-# ── Embedding ─────────────────────────────────────────────────────────────────
-
-# Modello di embedding usato da Ollama.
-# Deve corrispondere al modello usato durante la vettorizzazione (step-8).
-# Se cambi questo, devi rieseguire step-8 con --force.
-EMBED_MODEL = "nomic-embed-text"
-
-# ── Ollama ────────────────────────────────────────────────────────────────────
-
-# URL del server Ollama (default: locale sulla porta 11434).
-OLLAMA_URL = "http://localhost:11434"
-
-# Modello LLM. Scegli in base alla RAM disponibile (vedi README).
-OLLAMA_MODEL = "qwen3.5:0.8b"
-
-# ── Prompt di sistema ─────────────────────────────────────────────────────────
-
-# Istruzioni di comportamento inviate al LLM prima del contesto e della domanda.
-# Modifica per cambiare il tono, la lingua, il grado di libertà interpretativa
-# o le condizioni di fallback ("non so rispondere").
-SYSTEM_PROMPT = (
-    "Sei un assistente che risponde usando il contesto fornito. "
-    "Sintetizza e interpreta liberamente i passaggi del contesto per rispondere alla domanda. "
-    "Se il contesto contiene informazioni pertinenti, anche indirette, usale per costruire una risposta. "
-    "Solo se il contesto è completamente irrilevante, rispondi: "
-    "\"Non trovo questa informazione nel documento.\""
-)
diff --git a/step-9/rag.py b/step-9/rag.py
deleted file mode 100644
index ffd0402..0000000
--- a/step-9/rag.py
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 9 — Pipeline RAG interattiva
-
-Riceve una domanda, recupera i chunk più rilevanti da ChromaDB (retrieval)
-e genera una risposta tramite Ollama (generation).
-
-Input:  chroma_db/<stem> (collection ChromaDB)
-Output: risposta a schermo
-
-Uso:
-    python step-9/rag.py --stem <nome>
-
-Nel loop interattivo:
-    Domanda: <testo>       → risposta
-    Domanda: <testo> -v    → risposta + chunk recuperati
-    Domanda: exit          → uscita
-"""
-
-import argparse
-import json
-import sys
-import urllib.error
-import urllib.request
-from pathlib import Path
-
-import chromadb
-
-# ─── Configurazione ───────────────────────────────────────────────────────────
-
-sys.path.insert(0, str(Path(__file__).parent))
-import config as _cfg
-
-project_root = Path(__file__).parent.parent
-CHROMA_DIR   = project_root / "chroma_db"
-
-OLLAMA_URL    = _cfg.OLLAMA_URL
-EMBED_MODEL   = _cfg.EMBED_MODEL
-LLM_MODEL     = _cfg.OLLAMA_MODEL
-TOP_K         = _cfg.TOP_K
-TEMPERATURE   = _cfg.TEMPERATURE
-NO_THINK      = _cfg.NO_THINK
-SYSTEM_PROMPT = _cfg.SYSTEM_PROMPT
-
-
-# ─── Embedding ────────────────────────────────────────────────────────────────
-
-def embed(text: str) -> list[float]:
-    """Genera il vettore della domanda tramite Ollama."""
-    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
-    req = urllib.request.Request(
-        f"{OLLAMA_URL}/api/embeddings",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return json.loads(resp.read())["embedding"]
-
-
-# ─── Generazione ──────────────────────────────────────────────────────────────
-
-def call_ollama(prompt: str, system: str = "") -> str:
-    """Chiama Ollama /api/generate e ritorna la risposta."""
-    payload = json.dumps({
-        "model": LLM_MODEL,
-        "system": system,
-        "prompt": prompt,
-        "stream": False,
-        "think": not NO_THINK,
-        "options": {"temperature": TEMPERATURE},
-    }).encode()
-    req = urllib.request.Request(
-        f"{OLLAMA_URL}/api/generate",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    with urllib.request.urlopen(req, timeout=300) as resp:
-        return json.loads(resp.read())["response"].strip()
-
-
-# ─── Retrieval ────────────────────────────────────────────────────────────────
-
-def retrieve(collection: chromadb.Collection, question: str) -> list[dict]:
-    """
-    Genera l'embedding della domanda e recupera i TOP_K chunk più simili.
-    Ritorna lista di dict con chiavi: text, sezione, titolo, distance.
-    """
-    vector = embed(question)
-    results = collection.query(
-        query_embeddings=[vector],
-        n_results=TOP_K,
-        include=["documents", "metadatas", "distances"],
-    )
-    chunks = []
-    for text, meta, dist in zip(
-        results["documents"][0],
-        results["metadatas"][0],
-        results["distances"][0],
-    ):
-        chunks.append({
-            "text":     text,
-            "sezione":  meta.get("sezione", ""),
-            "titolo":   meta.get("titolo", ""),
-            "distance": dist,
-        })
-    return chunks
-
-
-# ─── Prompt ───────────────────────────────────────────────────────────────────
-
-def build_prompt(question: str, chunks: list[dict]) -> str:
-    """Ritorna (system, user_prompt) separati per l'API Ollama."""
-    context_parts = []
-    for i, c in enumerate(chunks, start=1):
-        header = f"[Contesto {i}"
-        if c["sezione"]:
-            header += f" — {c['sezione']}"
-            if c["titolo"]:
-                header += f" > {c['titolo']}"
-        header += "]"
-        context_parts.append(f"{header}\n{c['text']}")
-
-    context = "\n\n".join(context_parts)
-    user_prompt = f"{context}\n\nDomanda: {question}"
-    return SYSTEM_PROMPT, user_prompt
-
-
-# ─── Loop interattivo ─────────────────────────────────────────────────────────
-
-def answer(question: str, collection: chromadb.Collection, verbose: bool) -> None:
-    try:
-        chunks = retrieve(collection, question)
-    except (urllib.error.URLError, OSError) as e:
-        print(f"❌ Errore embedding: {e}")
-        return
-
-    if verbose:
-        print("\n── Chunk recuperati ──────────────────────────────────────────")
-        for i, c in enumerate(chunks, start=1):
-            loc = c["sezione"]
-            if c["titolo"]:
-                loc += f" > {c['titolo']}"
-            sim = 1 - c["distance"]
-            print(f"  [{i}] {loc}  (similarità: {sim:.3f})")
-            print(f"      {c['text'][:120].replace(chr(10), ' ')}...")
-        print("──────────────────────────────────────────────────────────────\n")
-
-    system, prompt = build_prompt(question, chunks)
-
-    try:
-        response = call_ollama(prompt, system=system)
-    except (urllib.error.URLError, OSError) as e:
-        print(f"❌ Errore generazione: {e}")
-        return
-
-    print(f"\n{response}\n")
-
-
-def run_loop(collection: chromadb.Collection) -> None:
-    print("── Loop RAG ─────────────────────────────────────── (exit per uscire)\n")
-    while True:
-        try:
-            raw = input("Domanda: ").strip()
-        except (EOFError, KeyboardInterrupt):
-            print("\nUscita.")
-            break
-
-        if not raw:
-            continue
-        if raw.lower() == "exit":
-            break
-
-        verbose = raw.endswith(" -v")
-        question = raw[:-3].strip() if verbose else raw
-
-        answer(question, collection, verbose)
-
-
-# ─── Entry point ──────────────────────────────────────────────────────────────
-
-def _build_epilog() -> str:
-    lines = [
-        "Uso:",
-        "  python step-9/rag.py --stem <nome>",
-        "",
-        "Loop interattivo:",
-        "  <domanda>       risposta basata sul documento",
-        "  <domanda> -v    risposta + chunk recuperati con score di similarità",
-        "  exit            termina",
-    ]
-    if CHROMA_DIR.exists():
-        try:
-            client = chromadb.PersistentClient(path=str(CHROMA_DIR))
-            names = [c.name for c in client.list_collections()]
-            if names:
-                lines += ["", f"Collection disponibili: {', '.join(names)}"]
-            else:
-                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
-        except Exception:
-            pass
-    return "\n".join(lines)
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Step 9 — Pipeline RAG interattiva\n\n"
-            "Risponde a domande in linguaggio naturale su un documento\n"
-            "indicizzato in ChromaDB da step-8/ingest.py."
-        ),
-        epilog=_build_epilog(),
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument(
-        "--stem",
-        required=True,
-        help=(
-            "Nome della collection ChromaDB da interrogare. "
-            "Le collection vengono create da: python step-8/ingest.py --stem <nome>"
-        ),
-    )
-    args = parser.parse_args()
-
-    print("─── Step 9 — Pipeline RAG ────────────────────────────────────────────\n")
-    print(f"  Documento : {args.stem}")
-    print(f"  Modello   : {LLM_MODEL}")
-    print(f"  Top-K     : {TOP_K}")
-    print(f"  Thinking  : {'off' if NO_THINK else 'on'}")
-    print()
-
-    if not CHROMA_DIR.exists():
-        print("❌ chroma_db/ non trovata — esegui prima step-8")
-        return 1
-
-    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
-    collections = [c.name for c in client.list_collections()]
-    if args.stem not in collections:
-        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/")
-        print(f"   → python step-8/ingest.py --stem {args.stem}")
-        return 1
-
-    collection = client.get_collection(args.stem)
-    print(f"✅ Collection '{args.stem}' caricata ({collection.count()} chunk)\n")
-
-    run_loop(collection)
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/step-9/retrieve.py b/step-9/retrieve.py
deleted file mode 100644
index aad0d63..0000000
--- a/step-9/retrieve.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env python3
-"""
-Step 9 — Retrieval puro (senza generazione LLM)
-
-Loop interattivo: inserisci una query, ottieni i chunk più simili dalla
-collection ChromaDB tramite embedding semantico — senza chiamare Ollama
-per la generation.
-
-Utile per:
-  - verificare la qualità del retrieval prima di diagnosticare risposte sbagliate
-  - controllare che i chunk giusti vengano recuperati per una query
-  - usare la pipeline come motore di ricerca semantica
-
-Input:  chroma_db/<stem> (collection ChromaDB)
-Output: lista chunk con score di similarità
-
-Uso:
-    python step-9/retrieve.py --stem <nome>
-
-Nel loop interattivo:
-    Query: <testo>      → chunk più simili con score
-    Query: <testo> -f   → testo completo dei chunk
-    Query: exit         → uscita
-"""
-
-import argparse
-import json
-import sys
-import urllib.error
-import urllib.request
-from pathlib import Path
-
-import chromadb
-
-# ─── Configurazione ───────────────────────────────────────────────────────────
-
-sys.path.insert(0, str(Path(__file__).parent))
-import config as _cfg
-
-project_root = Path(__file__).parent.parent
-CHROMA_DIR   = project_root / "chroma_db"
-
-OLLAMA_URL  = _cfg.OLLAMA_URL
-EMBED_MODEL = _cfg.EMBED_MODEL
-TOP_K       = _cfg.TOP_K
-
-
-# ─── Embedding ────────────────────────────────────────────────────────────────
-
-def embed(text: str) -> list[float]:
-    """Genera il vettore della query tramite Ollama."""
-    payload = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode()
-    req = urllib.request.Request(
-        f"{OLLAMA_URL}/api/embeddings",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return json.loads(resp.read())["embedding"]
-
-
-# ─── Retrieval ────────────────────────────────────────────────────────────────
-
-def retrieve(collection: chromadb.Collection, query: str, top_k: int) -> list[dict]:
-    """
-    Genera l'embedding della query e recupera i top_k chunk più simili.
-    Ritorna lista di dict con chiavi: rank, similarity, sezione, titolo, text.
-    """
-    vector = embed(query)
-    results = collection.query(
-        query_embeddings=[vector],
-        n_results=top_k,
-        include=["documents", "metadatas", "distances"],
-    )
-    chunks = []
-    for rank, (text, meta, dist) in enumerate(
-        zip(
-            results["documents"][0],
-            results["metadatas"][0],
-            results["distances"][0],
-        ),
-        start=1,
-    ):
-        chunks.append({
-            "rank":       rank,
-            "similarity": round(1 - dist, 4),
-            "sezione":    meta.get("sezione", ""),
-            "titolo":     meta.get("titolo", ""),
-            "text":       text,
-        })
-    return chunks
-
-
-# ─── Output ───────────────────────────────────────────────────────────────────
-
-def print_results(chunks: list[dict], full: bool = False) -> None:
-    print(f"── {len(chunks)} chunk recuperati ─────────────────────────────────\n")
-    for c in chunks:
-        loc = c["sezione"]
-        if c["titolo"]:
-            loc += f" > {c['titolo']}"
-        print(f"  [{c['rank']}] similarità: {c['similarity']:.4f}  |  {loc}")
-        if full:
-            print()
-            print(c["text"])
-        else:
-            print(f"      {c['text'][:200].replace(chr(10), ' ')}")
-            if len(c["text"]) > 200:
-                print(f"      … ({len(c['text'])} caratteri totali)")
-        print()
-
-
-# ─── Loop interattivo ─────────────────────────────────────────────────────────
-
-def run_loop(collection: chromadb.Collection, top_k: int) -> None:
-    print("── Loop retrieval ──────────────────────── (exit per uscire, -f per testo completo)\n")
-    while True:
-        try:
-            raw = input("Query: ").strip()
-        except (EOFError, KeyboardInterrupt):
-            print("\nUscita.")
-            break
-
-        if not raw:
-            continue
-        if raw.lower() == "exit":
-            break
-
-        full = raw.endswith(" -f")
-        query = raw[:-3].strip() if full else raw
-
-        try:
-            chunks = retrieve(collection, query, top_k)
-        except (urllib.error.URLError, OSError) as e:
-            print(f"❌ Errore embedding (Ollama raggiungibile?): {e}\n")
-            continue
-
-        print()
-        print_results(chunks, full=full)
-
-
-# ─── Entry point ──────────────────────────────────────────────────────────────
-
-def _build_epilog() -> str:
-    lines = [
-        "Uso:",
-        "  python step-9/retrieve.py --stem <nome>",
-        "",
-        "Nel loop interattivo:",
-        "  <query>       chunk più simili con score (testo troncato)",
-        "  <query> -f    testo completo dei chunk",
-        "  exit          termina",
-    ]
-    if CHROMA_DIR.exists():
-        try:
-            client = chromadb.PersistentClient(path=str(CHROMA_DIR))
-            names = [c.name for c in client.list_collections()]
-            if names:
-                lines += ["", f"Collection disponibili: {', '.join(names)}"]
-            else:
-                lines += ["", "Nessuna collection trovata — eseguire prima: python step-8/ingest.py"]
-        except Exception:
-            pass
-    return "\n".join(lines)
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Step 9 — Retrieval puro (senza LLM)\n\n"
-            "Loop interattivo: inserisci una query e ottieni i chunk più simili\n"
-            "tramite embedding semantico, senza generazione LLM."
-        ),
-        epilog=_build_epilog(),
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument(
-        "--stem",
-        required=True,
-        help="Nome della collection ChromaDB da interrogare.",
-    )
-    parser.add_argument(
-        "--top-k",
-        type=int,
-        default=TOP_K,
-        metavar="N",
-        help=f"Numero di chunk da restituire per query (default: {TOP_K} da config.py).",
-    )
-    args = parser.parse_args()
-
-    print("─── Step 9 — Retrieval puro ──────────────────────────────────────────\n")
-    print(f"  Documento    : {args.stem}")
-    print(f"  Embed model  : {EMBED_MODEL}")
-    print(f"  Top-K        : {args.top_k}")
-    print()
-
-    if not CHROMA_DIR.exists():
-        print("❌ chroma_db/ non trovata — esegui prima step-8", file=sys.stderr)
-        return 1
-
-    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
-    collections = [c.name for c in client.list_collections()]
-    if args.stem not in collections:
-        print(f"❌ Collection '{args.stem}' non trovata in chroma_db/", file=sys.stderr)
-        print(f"   → python step-8/ingest.py --stem {args.stem}", file=sys.stderr)
-        return 1
-
-    collection = client.get_collection(args.stem)
-    print(f"✅ Collection '{args.stem}' caricata ({collection.count()} chunk)\n")
-
-    run_loop(collection, args.top_k)
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/step-9/test_ollama.py b/step-9/test_ollama.py
deleted file mode 100644
index 8b683f1..0000000
--- a/step-9/test_ollama.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test chat locale Ollama — senza RAG, senza ChromaDB.
-Uso: python step-9/test_ollama.py
-"""
-
-import json
-import sys
-import urllib.error
-import urllib.request
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-import config as _cfg
-
-OLLAMA_URL  = _cfg.OLLAMA_URL
-MODEL       = _cfg.OLLAMA_MODEL
-TEMPERATURE = _cfg.TEMPERATURE
-NO_THINK    = _cfg.NO_THINK
-
-
-def chat(prompt: str) -> str:
-    payload = json.dumps({
-        "model": MODEL,
-        "prompt": prompt,
-        "stream": False,
-        "think": not NO_THINK,
-        "options": {"temperature": TEMPERATURE},
-    }).encode()
-    req = urllib.request.Request(
-        f"{OLLAMA_URL}/api/generate",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    with urllib.request.urlopen(req, timeout=300) as resp:
-        return json.loads(resp.read())["response"].strip()
-
-
-def main() -> int:
-    print(f"─── Chat Ollama ──────────────────────────────── (exit per uscire)")
-    print(f"  Modello   : {MODEL}")
-    print(f"  Thinking  : {'off' if NO_THINK else 'on'}")
-    print()
-
-    while True:
-        try:
-            user = input("Tu: ").strip()
-        except (EOFError, KeyboardInterrupt):
-            print("\nUscita.")
-            break
-        if not user:
-            continue
-        if user.lower() == "exit":
-            break
-        try:
-            reply = chat(user)
-            print(f"\nAssistente: {reply}\n")
-        except (urllib.error.URLError, OSError) as e:
-            print(f"❌ Errore: {e}")
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())

From c8167d4f01463d90f0df9df6934b9ce8bd54d23c Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Sun, 19 Apr 2026 00:03:43 +0200
Subject: [PATCH 21/22] =?UTF-8?q?fix:=20aggiorna=20path=20step-4/=20?=
 =?UTF-8?q?=E2=86=92=20conversione/=20e=20riferimenti=20step-X?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- chunker.py: input da conversione/<stem>/ (era step-4/, non esistente)
- verify_chunks.py: messaggi errore aggiornati a conversione/
- config.py: commenti step-8 → ingest.py
---
 config.py               |  4 ++--
 step-5/chunker.py       | 32 ++++++++++++++++----------------
 step-6/verify_chunks.py |  8 ++++----
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/config.py b/config.py
index 870c2d3..efd9d55 100644
--- a/config.py
+++ b/config.py
@@ -28,8 +28,8 @@ NO_THINK = True
 # ── Embedding ─────────────────────────────────────────────────────────────────
 
 # Modello di embedding usato da Ollama.
-# Deve corrispondere al modello usato durante la vettorizzazione (step-8).
-# Se cambi questo, devi rieseguire step-8 con --force.
+# Deve corrispondere al modello usato durante la vettorizzazione (ingest.py).
+# Se cambi questo, devi rieseguire ingest.py con --force.
 EMBED_MODEL = "nomic-embed-text"
 
 # ── Ollama ────────────────────────────────────────────────────────────────────
diff --git a/step-5/chunker.py b/step-5/chunker.py
index f8af623..b5ba539 100644
--- a/step-5/chunker.py
+++ b/step-5/chunker.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
 """
-Step 5 — Chunking adattivo
+Chunking adattivo
 
-Divide il Markdown revisionato (step 4) in chunk semantici pronti per la
+Divide il Markdown revisionato in chunk semantici pronti per la
 vettorizzazione. La strategia dipende dal profilo strutturale del documento.
 
-Input:  step-4/<stem>/clean.md + step-4/<stem>/structure_profile.json
+Input:  conversione/<stem>/clean.md + conversione/<stem>/structure_profile.json
 Output: step-5/<stem>/chunks.json
 
 Uso:
-    python step-5/chunker.py                    # tutti i documenti in step-4/
+    python step-5/chunker.py                    # tutti i documenti in conversione/
     python step-5/chunker.py --stem documento   # un solo documento
     python step-5/chunker.py --stem documento --force
 """
@@ -375,19 +375,19 @@ def chunk_document(clean_md: Path, profile: dict, stem: str) -> list[dict]:
 # ─── Per-document processing ──────────────────────────────────────────────────
 
 def process_stem(stem: str, project_root: Path, force: bool) -> bool:
-    step4_dir = project_root / "step-4" / stem
+    conv_dir = project_root / "conversione" / stem
     out_dir = project_root / "step-5" / stem
-    clean_md = step4_dir / "clean.md"
-    profile_path = step4_dir / "structure_profile.json"
+    clean_md = conv_dir / "clean.md"
+    profile_path = conv_dir / "structure_profile.json"
     out_file = out_dir / "chunks.json"
 
     print(f"\nDocumento: {stem}")
 
     if not clean_md.exists():
-        print(f"  ✗ clean.md non trovato in step-4/{stem}/ — skip")
+        print(f"  ✗ clean.md non trovato in conversione/{stem}/ — skip")
         return False
     if not profile_path.exists():
-        print(f"  ✗ structure_profile.json non trovato in step-4/{stem}/ — skip")
+        print(f"  ✗ structure_profile.json non trovato in conversione/{stem}/ — skip")
         return False
 
     if out_file.exists() and not force:
@@ -432,21 +432,21 @@ def process_stem(stem: str, project_root: Path, force: bool) -> bool:
 if __name__ == "__main__":
     project_root = Path(__file__).parent.parent
 
-    parser = argparse.ArgumentParser(description="Step 5 — Chunking adattivo")
-    parser.add_argument("--stem", help="Nome del documento (sottocartella di step-4/)")
+    parser = argparse.ArgumentParser(description="Chunking adattivo")
+    parser.add_argument("--stem", help="Nome del documento (sottocartella di conversione/)")
     parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
     args = parser.parse_args()
 
     if args.stem:
         stems = [args.stem]
     else:
-        step4_dir = project_root / "step-4"
-        if not step4_dir.exists():
-            print(f"Errore: cartella step-4/ non trovata in {project_root}")
+        conv_dir = project_root / "conversione"
+        if not conv_dir.exists():
+            print(f"Errore: cartella conversione/ non trovata in {project_root}")
             sys.exit(1)
-        stems = sorted(p.name for p in step4_dir.iterdir() if p.is_dir())
+        stems = sorted(p.name for p in conv_dir.iterdir() if p.is_dir() and (p / "clean.md").exists())
         if not stems:
-            print(f"Errore: nessun documento trovato in step-4/")
+            print(f"Errore: nessun documento trovato in conversione/")
             sys.exit(1)
 
     results = [process_stem(s, project_root, args.force) for s in stems]
diff --git a/step-6/verify_chunks.py b/step-6/verify_chunks.py
index d8eb125..cf881af 100644
--- a/step-6/verify_chunks.py
+++ b/step-6/verify_chunks.py
@@ -179,8 +179,8 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
             print(f"  [{c.get('chunk_id', '?')}] ...{last_line!r}")
         if len(incomplete) > 5:
             print(f"  ... e altri {len(incomplete) - 5}")
-        print(f"  → Causa probabile: paragrafo spezzato nel MD (step 4)")
-        print(f"  → Soluzione: correggi le righe spezzate in step-4/{stem}/clean.md")
+        print(f"  → Causa probabile: paragrafo spezzato nel MD")
+        print(f"  → Soluzione: correggi le righe spezzate in conversione/{stem}/clean.md")
 
     # ── Costruisci e salva report.json ───────────────────────────────────────
 
@@ -263,10 +263,10 @@ def verify_stem(stem: str, project_root: Path, min_chars: int, max_chars: int) -
         print()
         if empty_chunks:
             print(f"    • {len(empty_chunks)} chunk vuoti")
-            print(f"      → Controlla step-4/{stem}/clean.md per sezioni prive di testo")
+            print(f"      → Controlla conversione/{stem}/clean.md per sezioni prive di testo")
         if no_prefix:
             print(f"    • {len(no_prefix)} chunk senza prefisso di contesto")
-            print(f"      → Controlla che gli header ### siano corretti in step-4/{stem}/clean.md")
+            print(f"      → Controlla che gli header ### siano corretti in conversione/{stem}/clean.md")
         if incomplete:
             print(f"    • {len(incomplete)} chunk con frase spezzata")
             print(f"      → Esegui: python step-6/fix_chunks.py --stem {stem}")

From 6f8785d90ab0197ad6f31b2f1fe6d8c62ca790b7 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Mon, 20 Apr 2026 11:05:20 +0200
Subject: [PATCH 22/22] docs(CLAUDE.md): semplifica istruzioni, rimuovi path
 step-X hardcoded

---
 CLAUDE.md | 75 ++++++++++++++++---------------------------------------
 1 file changed, 21 insertions(+), 54 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 698d0e4..fc0e27b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -3,77 +3,44 @@
 ## Regole invarianti
 
 - **Lingua:** Rispondi sempre in italiano.
-- **Venv obbligatorio:** Usa `.venv/bin/python` o attiva con `source .venv/bin/activate`. Mai `pip`/`python` di sistema.
-- **Non modificare `raw.md`:** Il file `raw.md` di ogni stem è immutabile. La copia di lavoro è sempre `clean.md`.
+- **Venv:** Usa `.venv/bin/python` o `source .venv/bin/activate`. Mai `pip`/`python` di sistema.
+- **`raw.md` immutabile:** La copia di lavoro è sempre `clean.md`.
 
 ---
 
-## Pipeline (operazioni in ordine)
+## Pipeline
 
 ```
-PDF (sources/)
-  → conversione    (PDF → clean.md + structure_profile.json)
-  → chunking       (clean.md → chunks.json)
-  → verifica       (chunks.json → report + fix automatici)
-  → vettorizzazione (chunks.json → ChromaDB)
-  → retrieval      (query → risposta via Ollama)
+PDF → conversione → chunking → verifica → vettorizzazione → retrieval
 ```
 
-Il parametro `--stem` identifica il documento (nome PDF senza `.pdf`). Lo stem è anche il nome della collection ChromaDB.
+`--stem` = nome PDF senza estensione = nome collection ChromaDB.
+
+Per i path degli script e degli output usa `git ls-files` o esplora la root: la struttura è in evoluzione verso un programma unico.
 
 ---
 
-## File critici
+## Configurazione
 
-| File | Ruolo |
-|---|---|
-| `config.py` | Fonte di verità: `EMBED_MODEL`, `OLLAMA_MODEL`, `TOP_K`, `TEMPERATURE`, `SYSTEM_PROMPT` |
-| `chunker.py` | Chunking adattivo — `MIN_CHARS=200`, `MAX_CHARS=800`, `OVERLAP_S=2` |
-| `verify_chunks.py` | Verifica chunk — stesse soglie di `chunker.py` |
-| `fix_chunks.py` | Fix automatici su chunk anomali |
-| `ingest.py` | Vettorizzazione ChromaDB — legge `EMBED_MODEL` da `config.py` |
-| `rag.py` | Pipeline RAG interattiva |
-| `conversione/pipeline.py` | Conversione PDF → clean Markdown strutturato |
+`config.py` è la fonte di verità: `EMBED_MODEL`, `OLLAMA_MODEL`, `TOP_K`, `TEMPERATURE`, `SYSTEM_PROMPT`.
+
+**Se cambi `EMBED_MODEL`:** riesegui ingest con `--force` — embedding incoerenti non producono errori ma risposte insensate.
+
+**Se cambi `MIN_CHARS` / `MAX_CHARS`:** cerca tutte le occorrenze nel repo e sincronizza.
 
 ---
 
-## Regole di assistenza
+## Workflow consigliato
 
-**Modifica `EMBED_MODEL` in `config.py`:**
-Avvisa sempre che serve rieseguire la vettorizzazione:
-```bash
-python ingest.py --stem <stem> --force
-```
-`ingest.py` importa `EMBED_MODEL` direttamente da `config.py` — la coerenza è critica: se violata non produce errori ma restituisce risultati insensati.
-
-**Modifica soglie chunking (`MIN_CHARS`, `MAX_CHARS`, `OVERLAP_S`):**
-I valori compaiono in più file che vanno sincronizzati manualmente:
-- `chunker.py`
-- `verify_chunks.py`
-- `fix_chunks.py`
-
-**Conversione PDF → Markdown:**
-`conversione/pipeline.py` produce `raw.md` e `clean.md`. Il `clean.md` va sempre revisionato dopo la conversione automatica — la qualità del RAG dipende da esso più di qualsiasi parametro tecnico. Suggerisci sempre `/prepare-md conversione/<stem>/clean.md` dopo la conversione.
-
-**Verifica chunk:**
-Dopo `verify_chunks.py`, usa `/step6-fix <stem>` prima di procedere con la vettorizzazione.
+1. Converti il PDF con lo script di conversione
+2. `/prepare-md conversione/<stem>/clean.md`
+3. Chunking
+4. Vettorizza con `--stem <stem>`
+5. `python rag.py --stem <stem>`
 
 ---
 
 ## Skills custom
 
-- `/prepare-md <path>` — Revisione e correzione automatica di qualsiasi `clean.md`: sillabazione, artefatti, header malformati, paragrafi spezzati, gerarchia, sezioni vuote. Accetta path completo (`conversione/bitcoin/clean.md`) o stem (`bitcoin`).
-- `/step6-fix <stem>` — Dry-run e applicazione fix chunk tramite `fix_chunks.py`.
-
----
-
-## Output per stem
-
-```
-conversione/<stem>/raw.md              ← immutabile
-conversione/<stem>/clean.md            ← copia di lavoro
-conversione/<stem>/structure_profile.json
-<stem>/chunks.json
-<stem>/report.json
-chroma_db/<stem>/                      ← collection ChromaDB
-```
+- `/prepare-md <path|stem>` — corregge `clean.md`: sillabazione, artefatti, header, paragrafi spezzati, gerarchia.
+- `/step6-fix <stem>` — verifica chunk, dry-run e applicazione fix via `fix_chunks.py`.