# step-4/revise.py

#!/usr/bin/env python3
"""
Step 4 — Revisione automatica del Markdown

Trasforma clean.md da step-3 rivelando la struttura latente del documento.
Le trasformazioni sono euristiche universali che funzionano su qualsiasi PDF:

  - Normalizza whitespace multiplo (artefatto PDF)
  - Riduce righe vuote multiple
  - Rimuove marcatori **bold** nelle intestazioni esistenti
  - Converte righe ALL-CAPS standalone → ## header (euristico, qualsiasi lingua)
  - Converte sezioni numerate "N.  testo" → ### N. (qualsiasi numerazione)
  - Rimuove blocchi TOC (righe che iniziano con parole-chiave indice)

Per ogni documento viene ricalcolato il profilo strutturale: il livello può
salire (es. livello 1 → 3) se le strutture latenti vengono rilevate.

Output in step-4/<stem>/:
  raw.md                  — copia da step-3 (non modificare mai)
  clean.md                — MD revisionato
  structure_profile.json  — profilo aggiornato dopo la revisione

Uso:
    python step-4/revise.py                    # tutti i documenti in step-3/
    python step-4/revise.py --stem nietzsche   # un solo documento
    python step-4/revise.py --force            # riesegui anche se già presente
"""

import argparse
import json
import re
import shutil
import sys
from datetime import date
from pathlib import Path

# Riusa la funzione analyze() già scritta nello step 3
sys.path.insert(0, str(Path(__file__).parent.parent / "step-3"))
from detect_structure import analyze  # noqa: E402


# ─── Costanti ─────────────────────────────────────────────────────────────────

# Parole-chiave che identificano blocchi TOC (da rimuovere)
_TOC_KEYWORDS = frozenset([
    "indice", "index", "contents", "table of contents",
    "sommario", "inhaltsverzeichnis", "inhalt",
])

# Preposizioni/articoli da non capitalizzare nel title-case
_STOP_IT_EN = frozenset([
    # italiano
    "di", "del", "della", "dei", "delle", "da", "in", "e", "il", "la",
    "lo", "le", "gli", "un", "una", "per", "a", "al", "alla", "ai",
    "alle", "con", "su", "sul", "sulla", "che", "o",
    # inglese
    "of", "the", "a", "an", "and", "or", "but", "in", "on", "at",
    "to", "for", "with", "by", "from", "as",
])

# Ordinali italiani → romani (per titoli come "CAPITOLO PRIMO")
_ORDINALS_IT = {
    "PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",
    "QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",
    "NONO": "IX", "DECIMO": "X",
}

# Ordinali inglesi → arabici (per "CHAPTER ONE")
_ORDINALS_EN = {
    "ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",
    "SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",
}


# ─── Utilità ──────────────────────────────────────────────────────────────────

def _sentence_case(s: str) -> str:
    """
    Sentence-case: prima lettera maiuscola, resto minuscolo.
    Corretto per l'italiano e accettabile per l'inglese accademico.
    """
    if not s:
        return s
    lower = s.lower()
    return lower[0].upper() + lower[1:]


def _is_allcaps_line(line: str) -> bool:
    """
    True se la riga è una candidata per conversione a ## header.
    Criterio: tutti i caratteri alfabetici sono maiuscoli, lunghezza >= 3.
    """
    stripped = line.strip()
    letters = [c for c in stripped if c.isalpha()]
    return (
        len(letters) >= 3
        and all(c.isupper() for c in letters)
        and not stripped.startswith("#")
    )


def _allcaps_to_header(raw_line: str) -> str:
    """
    Turn an ALL-CAPS line into a level-2 ("## ") Markdown header.

    Trailing "." and "?" are stripped from the line, then the first rule that
    matches is applied:

      1. "CAPITOLO <ORDINAL>. TITLE"       → "## Capitolo <roman> — Title"
      2. "CHAPTER <WORD|digits>[.] TITLE"  → "## Chapter <n> — Title"
      3. "<roman|digits>. TITLE"           → "## <n>. Title"
      4. anything else                     → "## " + the sentence-cased text

    Titles are converted with _sentence_case (sentence-case, not title-case).
    """
    cleaned = raw_line.strip().rstrip('.').rstrip('?').strip()

    # Rule 1 — Italian chapter heading: "CAPITOLO PRIMO. TITOLO DEL CAPITOLO"
    italian_words = "|".join(_ORDINALS_IT.keys())
    hit = re.match(rf'^CAPITOLO ({italian_words})\. (.+)', cleaned)
    if hit:
        ordinal_word, rest = hit.groups()
        rest = rest.rstrip('.').rstrip('?').strip()
        return f"## Capitolo {_ORDINALS_IT[ordinal_word]} — {_sentence_case(rest)}"

    # Rule 2 — English chapter heading: "CHAPTER ONE. TITLE" or "CHAPTER 1. TITLE"
    english_words = "|".join(_ORDINALS_EN.keys())
    hit = re.match(rf'^CHAPTER ({english_words}|\d+)\.? (.+)', cleaned)
    if hit:
        token, rest = hit.groups()
        # Number words are mapped to digits; plain digits pass through as-is.
        number = _ORDINALS_EN.get(token, token)
        rest = rest.rstrip('.').rstrip('?').strip()
        return f"## Chapter {number} — {_sentence_case(rest)}"

    # Rule 3 — generic Roman or Arabic numbering prefix: "IV. TITLE", "12. TITLE"
    hit = re.match(r'^([IVXLCDM]+|[0-9]+)\. (.+)', cleaned)
    if hit:
        number, rest = hit.groups()
        rest = rest.rstrip('.').strip()
        return f"## {number}. {_sentence_case(rest)}"

    # Rule 4 — no recognisable pattern: sentence-case the whole line
    return f"## {_sentence_case(cleaned)}"


def _is_toc_line(line: str) -> bool:
    """
    True if the line is the heading of an index/TOC block.

    The text before the first "." (trimmed and lowercased) must be exactly one
    of the entries in _TOC_KEYWORDS; a line with no "." is compared whole.
    """
    leading, _, _ = line.strip().partition('.')
    return leading.strip().lower() in _TOC_KEYWORDS


# ─── Trasformazioni ────────────────────────────────────────────────────────────

def apply_transforms(text: str) -> tuple[str, dict]:
    """
    Apply every structural transformation to the Markdown text.

    Steps run in this order:
      1.  strip **bold** markers wrapping the text of existing headers;
      1b. sentence-case existing headers whose letters are all uppercase;
      2.  drop lines recognised as TOC headings by _is_toc_line;
      3.  turn ALL-CAPS lines into "## " headers;
      4.  turn numbered sections ("N. text", "Na. text") into a "### N."
          header followed by the text as its own paragraph;
      5.  merge paragraphs that look split mid-sentence;
      6.  collapse runs of spaces inside non-blank lines;
      7.  collapse three or more consecutive newlines into two.

    Args:
        text: Markdown content to revise.

    Returns:
        (revised_text, stats) where stats has the keys "toc_rimosso" (bool)
        and the counters "n_header_allcaps", "n_sezioni_numerate" and
        "n_paragrafi_uniti".
    """
    stats = {
        "toc_rimosso": False,
        "n_header_allcaps": 0,
        "n_sezioni_numerate": 0,
        "n_paragrafi_uniti": 0,
    }

    # ── 1. Strip **bold** markers wrapping the text of existing headers
    #       ## **Title** → ## Title
    #       Only horizontal whitespace ([^\S\n]) is matched: with \s the
    #       trailing part would swallow the newline that ends the header line
    #       and delete the blank line separating it from the next paragraph.
    text = re.sub(
        r'^(#{1,6})[^\S\n]+\*\*(.+?)\*\*[^\S\n]*$',
        r'\1 \2',
        text, flags=re.MULTILINE,
    )

    # ── 1b. Normalise existing headers with ALL-CAPS content → sentence-case
    #        ## AL DI LA' DEL BENE E DEL MALE → ## Al di la' del bene e del male
    def _norm_allcaps_header(m: re.Match) -> str:
        hashes = m.group(1)
        content = m.group(2).strip()
        letters = [c for c in content if c.isalpha()]
        if letters and all(c.isupper() for c in letters):
            return f"{hashes} {_sentence_case(content)}"
        # Mixed-case headers are left exactly as they were.
        return m.group(0)

    text = re.sub(
        r'^(#{1,6}) (.+)$',
        _norm_allcaps_header,
        text, flags=re.MULTILINE,
    )

    # ── 2. Remove TOC lines (index keyword + anything inline on the same line)
    #       "INDICE. Capitolo 1 Capitolo 2 ..."  → removed
    kept_lines = []
    for line in text.split('\n'):
        if _is_toc_line(line):
            stats["toc_rimosso"] = True
            continue
        kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # ── 3. Convert ALL-CAPS lines → ## header
    #       A blank-line-delimited block made of a single ALL-CAPS line is
    #       converted as a whole; inside multi-line blocks each ALL-CAPS line
    #       longer than 3 characters is converted in place.
    new_blocks = []
    for block in text.split('\n\n'):
        stripped = block.strip()
        if '\n' not in stripped and _is_allcaps_line(stripped):
            new_blocks.append(_allcaps_to_header(stripped))
            stats["n_header_allcaps"] += 1
            continue
        converted = []
        for ln in block.split('\n'):
            if _is_allcaps_line(ln) and len(ln.strip()) > 3:
                converted.append(_allcaps_to_header(ln))
                stats["n_header_allcaps"] += 1
            else:
                converted.append(ln)
        new_blocks.append('\n'.join(converted))
    text = '\n\n'.join(new_blocks)

    # ── 4. Convert numbered sections "N. text" → "### N.\n\ntext"
    #       One or more whitespace characters are accepted after the period.
    #       NOTE: \s also matches newlines, so a number alone on its line is
    #       paired with the text that follows it on a later line.
    def _num_repl(m: re.Match) -> str:
        num = m.group(1)
        testo = m.group(2).strip()
        stats["n_sezioni_numerate"] += 1
        return f"### {num}.\n\n{testo}"

    text = re.sub(
        r'^(\d+)\.\s+(.+)$',
        _num_repl,
        text, flags=re.MULTILINE,
    )

    # Variant with a letter suffix: "65 a. text" or "65a. text" → "### 65a."
    def _num_letter_repl(m: re.Match) -> str:
        num = m.group(1) + m.group(2)
        testo = m.group(3).strip()
        stats["n_sezioni_numerate"] += 1
        return f"### {num}.\n\n{testo}"

    text = re.sub(
        r'^(\d+)\s*([a-z])\.\s+(.+)$',
        _num_letter_repl,
        text, flags=re.MULTILINE,
    )

    # ── 5. Merge paragraphs split by PDF page breaks
    #       Block A is merged with the following block B when A does not end
    #       with sentence-ending punctuation. Merging never crosses a header
    #       boundary: not when A starts with a header, not when A's LAST line
    #       is a header (step 3 can produce one at the end of a multi-line
    #       block — merging would glue B onto the header text), and not when
    #       B starts with a header.
    _SENTENCE_END = set('.?!»)\'"')
    blocks = text.split('\n\n')
    merged = []
    i = 0
    while i < len(blocks):
        b = blocks[i]
        stripped = b.strip()
        while (
            i + 1 < len(blocks)
            and stripped
            and not stripped.startswith('#')
            and not stripped.rsplit('\n', 1)[-1].lstrip().startswith('#')
            and stripped[-1] not in _SENTENCE_END
        ):
            nxt = blocks[i + 1].strip()
            # Do not merge into an empty block or into a header
            if not nxt or nxt.startswith('#'):
                break
            # Do not merge when the next block starts with "<digits>."
            # (it would be the start of a numbered section not yet converted)
            if re.match(r'^\d+\.', nxt):
                break
            b = stripped + ' ' + nxt
            stripped = b.strip()
            stats["n_paragrafi_uniti"] += 1
            i += 1
        merged.append(b)
        i += 1
    text = '\n\n'.join(merged)

    # ── 6. Collapse runs of spaces inside lines (headers included)
    #       "word  word" → "word word"; whitespace-only lines are kept as-is
    text = '\n'.join(
        re.sub(r'  +', ' ', line) if line.strip() else line
        for line in text.split('\n')
    )

    # ── 7. Reduce three or more consecutive newlines to exactly two
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text, stats


# ─── Aggiornamento revision log ────────────────────────────────────────────────

def update_revision_log(
    log_path: Path,
    stem: str,
    profile_before: dict,
    profile_after: dict,
    t_stats: dict,
) -> None:
    """
    Append one revision entry for a document to the Markdown revision log.

    When the log file is missing or empty it is created with a
    "# Revision log" title; otherwise the existing content is kept and the
    new entry is added after it.

    Args:
        log_path: Path of the revision log file.
        stem: Document name used in the entry heading.
        profile_before: Structure profile before the revision; only
            "livello_struttura" is read ("?" when absent).
        profile_after: Structure profile after the revision; "avvertenze" and
            "livello_struttura" are read, both optional.
        t_stats: Transformation statistics; the keys "toc_rimosso",
            "n_header_allcaps", "n_sezioni_numerate" and "n_paragrafi_uniti"
            are required.
    """
    residual = profile_after.get("avvertenze", [])
    if residual:
        avv_str = "; ".join(residual)
    else:
        avv_str = "nessuna"

    entry = f"""
## {stem} — {date.today().isoformat()}

**Trasformazioni automatiche:**
- Normalizzazione whitespace multiplo e righe vuote
- Blocco TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}
- Righe ALL-CAPS → ## header: {t_stats['n_header_allcaps']}
- Sezioni numerate → ### header: {t_stats['n_sezioni_numerate']}
- Paragrafi uniti (salti pagina PDF): {t_stats['n_paragrafi_uniti']}
- Livello struttura: {profile_before.get('livello_struttura', '?')} → {profile_after.get('livello_struttura', '?')}

**Avvertenze residue:** {avv_str}

**Revisioni manuali pendenti:**
- [ ] Verificare conversioni ALL-CAPS errate
- [ ] Controllare sezioni troppo corte o troppo lunghe
"""

    # A non-empty log already carries the title: keep its content as the prefix.
    if log_path.exists() and log_path.stat().st_size > 0:
        prefix = log_path.read_text(encoding="utf-8")
    else:
        prefix = "# Revision log\n"
    log_path.write_text(prefix + entry, encoding="utf-8")


# ─── Per-document processing ─────────────────────────────────────────────────

def process_stem(stem: str, project_root: Path, force: bool) -> bool:
    """
    Revise one document: read step-3/<stem>/, write step-4/<stem>/.

    Copies raw.md when present, applies apply_transforms to clean.md, writes
    the revised clean.md, recomputes the structure profile with analyze(),
    prints a report and appends an entry to step-4/revision_log.md.

    Args:
        stem: Document name (sub-folder of step-3/).
        project_root: Folder containing step-3/ and step-4/.
        force: Re-run even when step-4/<stem>/clean.md already exists.

    Returns:
        False when step-3/<stem>/clean.md is missing; True when the document
        was processed or skipped because its output already exists.
    """
    step3_dir = project_root / "step-3" / stem
    step4_dir = project_root / "step-4" / stem
    source_md = step3_dir / "clean.md"
    revised_md = step4_dir / "clean.md"

    print(f"\nDocumento: {stem}")

    if not source_md.exists():
        print(f"  ✗ clean.md non trovato in step-3/{stem}/ — skip")
        return False

    already_done = revised_md.exists()
    if already_done and not force:
        print("  ⚠️  clean.md già presente — skip")
        print("       (usa --force per rieseguire)")
        return True

    step4_dir.mkdir(parents=True, exist_ok=True)

    # Keep an untouched copy of raw.md next to the revised file (reference)
    raw_md = step3_dir / "raw.md"
    if raw_md.exists():
        shutil.copy2(raw_md, step4_dir / "raw.md")
        print(f"  Copiato raw.md da step-3/{stem}/")

    # Step-3 profile, used only for the before/after comparison in the report
    old_profile_path = step3_dir / "structure_profile.json"
    profile_before: dict = {}
    if old_profile_path.exists():
        profile_before = json.loads(old_profile_path.read_text(encoding="utf-8"))

    # Apply the transformations
    print("  Applicazione trasformazioni strutturali...")
    original_text = source_md.read_text(encoding="utf-8")
    text_revised, t_stats = apply_transforms(original_text)

    # Save the revised clean.md
    revised_md.write_text(text_revised, encoding="utf-8")

    # Recompute the profile on the file just written and persist it
    profile_after = analyze(revised_md)
    serialized = json.dumps(profile_after, ensure_ascii=False, indent=2)
    (step4_dir / "structure_profile.json").write_text(serialized, encoding="utf-8")

    # Report
    strategy_by_level = {
        3: "h3_aware",
        2: "h2_paragraph_split",
        1: "paragraph",
        0: "sliding_window",
    }
    level_before = profile_before.get("livello_struttura", "?")
    level_after = profile_after["livello_struttura"]
    strategy = strategy_by_level.get(level_after, '?')
    print(f"  ✅ Livello struttura: {level_before} → {level_after}  ({strategy})")
    for counter in ("n_h2", "n_h3"):
        # "n_h2" → label "h2", "n_h3" → label "h3"
        print(f"     {counter[2:]}: {profile_before.get(counter, '?')} → {profile_after[counter]}")
    toc_answer = 'sì' if t_stats['toc_rimosso'] else 'no'
    print(f"     TOC rimosso: {toc_answer}")
    print(f"     Righe ALL-CAPS → ##: {t_stats['n_header_allcaps']}")
    print(f"     Sezioni numerate → ###: {t_stats['n_sezioni_numerate']}")
    print(f"     Paragrafi uniti (salti pagina): {t_stats['n_paragrafi_uniti']}")
    for warning in profile_after["avvertenze"]:
        print(f"     ⚠️  {warning}")

    # The revision log lives directly in step-4/, not in the document folder
    update_revision_log(
        project_root / "step-4" / "revision_log.md",
        stem,
        profile_before,
        profile_after,
        t_stats,
    )
    print("  ✅ step-4/revision_log.md aggiornato")
    print("  ✅ structure_profile.json salvato")
    return True


# ─── Entry point ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # The project root is the parent of the folder containing this script.
    project_root = Path(__file__).parent.parent

    cli = argparse.ArgumentParser(description="Step 4 — Revisione automatica Markdown")
    cli.add_argument("--stem", help="Nome del documento (sottocartella di step-3/)")
    cli.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
    options = cli.parse_args()

    if not options.stem:
        # No explicit document: process every sub-folder of step-3/, sorted by name
        step3_dir = project_root / "step-3"
        if not step3_dir.exists():
            print(f"Errore: cartella step-3/ non trovata in {project_root}")
            sys.exit(1)
        stems = sorted(entry.name for entry in step3_dir.iterdir() if entry.is_dir())
        if not stems:
            print("Errore: nessun documento trovato in step-3/")
            sys.exit(1)
    else:
        stems = [options.stem]

    results = []
    for name in stems:
        results.append(process_stem(name, project_root, options.force))

    everything_ok = all(results)
    marker = '✅' if everything_ok else '⚠️ '
    print(f"\n{marker} {sum(results)}/{len(results)} documenti revisionati")

    # Exit status 0 only when every document returned True
    sys.exit(0 if everything_ok else 1)
step-4: add revise.py, step4-review skill, README update 2026-04-13 12:21:26 +02:00			`#!/usr/bin/env python3`
			`"""`
			`Step 4 — Revisione automatica del Markdown`

			`Trasforma clean.md da step-3 rivelando la struttura latente del documento.`
			`Le trasformazioni sono euristiche universali che funzionano su qualsiasi PDF:`

			`- Normalizza whitespace multiplo (artefatto PDF)`
			`- Riduce righe vuote multiple`
			`- Rimuove marcatori bold nelle intestazioni esistenti`
			`- Converte righe ALL-CAPS standalone → ## header (euristico, qualsiasi lingua)`
			`- Converte sezioni numerate "N. testo" → ### N. (qualsiasi numerazione)`
			`- Rimuove blocchi TOC (righe che iniziano con parole-chiave indice)`

			`Per ogni documento viene ricalcolato il profilo strutturale: il livello può`
			`salire (es. livello 1 → 3) se le strutture latenti vengono rilevate.`

			`Output in step-4/<stem>/:`
			`raw.md — copia da step-3 (non modificare mai)`
			`clean.md — MD revisionato`
			`structure_profile.json — profilo aggiornato dopo la revisione`

			`Uso:`
			`python step-4/revise.py # tutti i documenti in step-3/`
			`python step-4/revise.py --stem nietzsche # un solo documento`
			`python step-4/revise.py --force # riesegui anche se già presente`
			`"""`

			`import argparse`
			`import json`
			`import re`
			`import shutil`
			`import sys`
			`from datetime import date`
			`from pathlib import Path`

			`# Riusa la funzione analyze() già scritta nello step 3`
			`sys.path.insert(0, str(Path(__file__).parent.parent / "step-3"))`
			`from detect_structure import analyze # noqa: E402`


			`# ─── Costanti ─────────────────────────────────────────────────────────────────`

			`# Parole-chiave che identificano blocchi TOC (da rimuovere)`
			`_TOC_KEYWORDS = frozenset([`
			`"indice", "index", "contents", "table of contents",`
			`"sommario", "inhaltsverzeichnis", "inhalt",`
			`])`

			`# Preposizioni/articoli da non capitalizzare nel title-case`
			`_STOP_IT_EN = frozenset([`
			`# italiano`
			`"di", "del", "della", "dei", "delle", "da", "in", "e", "il", "la",`
			`"lo", "le", "gli", "un", "una", "per", "a", "al", "alla", "ai",`
			`"alle", "con", "su", "sul", "sulla", "che", "o",`
			`# inglese`
			`"of", "the", "a", "an", "and", "or", "but", "in", "on", "at",`
			`"to", "for", "with", "by", "from", "as",`
			`])`

			`# Ordinali italiani → romani (per titoli come "CAPITOLO PRIMO")`
			`_ORDINALS_IT = {`
			`"PRIMO": "I", "SECONDO": "II", "TERZO": "III", "QUARTO": "IV",`
			`"QUINTO": "V", "SESTO": "VI", "SETTIMO": "VII", "OTTAVO": "VIII",`
			`"NONO": "IX", "DECIMO": "X",`
			`}`

			`# Ordinali inglesi → arabici (per "CHAPTER ONE")`
			`_ORDINALS_EN = {`
			`"ONE": "1", "TWO": "2", "THREE": "3", "FOUR": "4", "FIVE": "5",`
			`"SIX": "6", "SEVEN": "7", "EIGHT": "8", "NINE": "9", "TEN": "10",`
			`}`


			`# ─── Utilità ──────────────────────────────────────────────────────────────────`

			`def _sentence_case(s: str) -> str:`
			`"""`
			`Sentence-case: prima lettera maiuscola, resto minuscolo.`
			`Corretto per l'italiano e accettabile per l'inglese accademico.`
			`"""`
			`if not s:`
			`return s`
			`lower = s.lower()`
			`return lower[0].upper() + lower[1:]`


			`def _is_allcaps_line(line: str) -> bool:`
			`"""`
			`True se la riga è una candidata per conversione a ## header.`
			`Criterio: tutti i caratteri alfabetici sono maiuscoli, lunghezza >= 3.`
			`"""`
			`stripped = line.strip()`
			`letters = [c for c in stripped if c.isalpha()]`
			`return (`
			`len(letters) >= 3`
			`and all(c.isupper() for c in letters)`
			`and not stripped.startswith("#")`
			`)`


			`def _allcaps_to_header(raw_line: str) -> str:`
			`"""`
			`Converte una riga ALL-CAPS in un ## header title-case.`
			`Riconosce pattern specifici (CAPITOLO ORDINE, CHAPTER N) come bonus,`
			`ma funziona in modalità generica su qualsiasi testo.`
			`"""`
			`text = raw_line.strip().rstrip('.').rstrip('?').strip()`

			`# ── Pattern italiano: "CAPITOLO PRIMO. TITOLO DEL CAPITOLO"`
			`_ORD_IT_PAT = "\|".join(_ORDINALS_IT.keys())`
			`m = re.match(rf'^CAPITOLO ({_ORD_IT_PAT})\. (.+)', text)`
			`if m:`
			`roman = _ORDINALS_IT[m.group(1)]`
			`titolo = m.group(2).rstrip('.').rstrip('?').strip()`
			`return f"## Capitolo {roman} — {_sentence_case(titolo)}"`

			`# ── Pattern inglese: "CHAPTER ONE. TITLE" o "CHAPTER 1. TITLE"`
			`_ORD_EN_PAT = "\|".join(_ORDINALS_EN.keys())`
			`m = re.match(rf'^CHAPTER ({_ORD_EN_PAT}\|\d+)\.? (.+)', text)`
			`if m:`
			`n = _ORDINALS_EN.get(m.group(1), m.group(1))`
			`titolo = m.group(2).rstrip('.').rstrip('?').strip()`
			`return f"## Chapter {n} — {_sentence_case(titolo)}"`

			`# ── Pattern generico con numerazione romana o arabica nel prefisso`
			`m = re.match(r'^([IVXLCDM]+\|[0-9]+)\. (.+)', text)`
			`if m:`
			`n = m.group(1)`
			`titolo = m.group(2).rstrip('.').strip()`
			`return f"## {n}. {_sentence_case(titolo)}"`

			`# ── Caso generico: tutto maiuscolo senza pattern riconoscibile`
			`return f"## {_sentence_case(text)}"`


			`def _is_toc_line(line: str) -> bool:`
			`"""True se la riga è l'intestazione di un blocco indice/TOC."""`
			`first_word = line.strip().split('.')[0].strip().lower()`
			`return first_word in _TOC_KEYWORDS`


			`# ─── Trasformazioni ────────────────────────────────────────────────────────────`

			`def apply_transforms(text: str) -> tuple[str, dict]:`
			`"""`
			`Applica tutte le trasformazioni strutturali al testo MD.`
			`Restituisce (testo_modificato, statistiche).`
			`"""`
			`stats = {`
			`"toc_rimosso": False,`
			`"n_header_allcaps": 0,`
			`"n_sezioni_numerate": 0,`
			`"n_paragrafi_uniti": 0,`
			`}`

			`# ── 1. Rimuovi marcatori bold nelle intestazioni esistenti`
			`# ## Titolo → ## Titolo`
			`text = re.sub(`
			`r'^(#{1,6})\s+\\(.+?)\\\s*$',`
			`r'\1 \2',`
			`text, flags=re.MULTILINE,`
			`)`

			`# ── 1b. Normalizza header esistenti con contenuto ALL-CAPS → sentence-case`
			`# ## AL DI LA' DEL BENE E DEL MALE → ## Al di la' del bene e del male`
			`def _norm_allcaps_header(m: re.Match) -> str:`
			`hashes = m.group(1)`
			`content = m.group(2).strip()`
			`letters = [c for c in content if c.isalpha()]`
			`if letters and all(c.isupper() for c in letters):`
			`return f"{hashes} {_sentence_case(content)}"`
			`return m.group(0)`

			`text = re.sub(`
			`r'^(#{1,6}) (.+)$',`
			`_norm_allcaps_header,`
			`text, flags=re.MULTILINE,`
			`)`

			`# ── 2. Rimuovi blocco TOC (riga indice + contenuto inline sulla stessa riga)`
			`# "INDICE. Capitolo 1 Capitolo 2 ..." → rimossa`
			`lines = text.split('\n')`
			`new_lines = []`
			`for line in lines:`
			`if _is_toc_line(line):`
			`stats["toc_rimosso"] = True`
			`else:`
			`new_lines.append(line)`
			`text = '\n'.join(new_lines)`

			`# ── 3. Converti righe ALL-CAPS standalone → ## header`
			`# Una riga è "standalone" se è preceduta/seguita da riga vuota`
			`# oppure si trova all'inizio/fine del documento.`
			`blocks = text.split('\n\n')`
			`new_blocks = []`
			`for block in blocks:`
			`stripped = block.strip()`
			`# Blocco standalone = un'unica riga (nessun \n interno rilevante)`
			`if '\n' not in stripped and _is_allcaps_line(stripped):`
			`new_blocks.append(_allcaps_to_header(stripped))`
			`stats["n_header_allcaps"] += 1`
			`else:`
			`# Controlla riga per riga per righe ALL-CAPS seguite da altri contenuti`
			`sub_lines = block.split('\n')`
			`converted = []`
			`for ln in sub_lines:`
			`if _is_allcaps_line(ln) and len(ln.strip()) > 3:`
			`converted.append(_allcaps_to_header(ln))`
			`stats["n_header_allcaps"] += 1`
			`else:`
			`converted.append(ln)`
			`new_blocks.append('\n'.join(converted))`
			`text = '\n\n'.join(new_blocks)`

			`# ── 4. Converti sezioni numerate "N. testo" → "### N.\n\ntesto"`
			`# Riconosce: "1. Testo", "42. Testo" (due o più spazi dopo il punto)`
			`def _num_repl(m: re.Match) -> str:`
			`num = m.group(1)`
			`testo = m.group(2).strip()`
			`stats["n_sezioni_numerate"] += 1`
			`return f"### {num}.\n\n{testo}"`

			`# Pattern standard: "1. testo" o "1. testo"`
			`text = re.sub(`
			`r'^(\d+)\.\s+(.+)$',`
			`_num_repl,`
			`text, flags=re.MULTILINE,`
			`)`

			`# Pattern con lettera-suffisso: "65 a. testo" o "65a. testo"`
			`def _num_letter_repl(m: re.Match) -> str:`
			`num = m.group(1) + m.group(2)`
			`testo = m.group(3).strip()`
			`stats["n_sezioni_numerate"] += 1`
			`return f"### {num}.\n\n{testo}"`

			`text = re.sub(`
			`r'^(\d+)\s*([a-z])\.\s+(.+)$',`
			`_num_letter_repl,`
			`text, flags=re.MULTILINE,`
			`)`

			`# ── 5. Unisci paragrafi spezzati da salti pagina PDF`
			`# Criterio: blocco A non finisce con punteggiatura di fine frase,`
			`# blocco B non inizia con maiuscola "di sezione" né è un header.`
			`# Unione sicura: mai attraverso confini ###/##.`
			`_SENTENCE_END = set('.?!»)\'"')`
			`blocks = text.split('\n\n')`
			`merged = []`
			`i = 0`
			`while i < len(blocks):`
			`b = blocks[i]`
			`stripped = b.strip()`
			`# Prova a unire con il successivo se la frase è spezzata`
			`while (`
			`i + 1 < len(blocks)`
			`and stripped`
			`and not stripped.startswith('#')`
			`and stripped[-1] not in _SENTENCE_END`
			`):`
			`nxt = blocks[i + 1].strip()`
			`# Non unire se il successivo è un header o è vuoto`
			`if not nxt or nxt.startswith('#'):`
			`break`
			`# Non unire se il successivo inizia con una cifra seguita da punto`
			`# (sarebbe l'inizio di un nuovo aforisma non ancora convertito)`
			`if re.match(r'^\d+\.', nxt):`
			`break`
			`b = stripped + ' ' + nxt`
			`stripped = b.strip()`
			`stats["n_paragrafi_uniti"] += 1`
			`i += 1`
			`merged.append(b)`
			`i += 1`
			`text = '\n\n'.join(merged)`

			`# ── 6. Normalizza whitespace multiplo interno alle righe`
			`# "parola parola" → "parola parola" (inclusi gli header)`
			`lines = text.split('\n')`
			`normalized = []`
			`for line in lines:`
			`if not line.strip():`
			`normalized.append(line)`
			`else:`
			`normalized.append(re.sub(r' +', ' ', line))`
			`text = '\n'.join(normalized)`

			`# ── 7. Riduci righe vuote multiple a doppie`
			`text = re.sub(r'\n{3,}', '\n\n', text)`

			`return text, stats`


			`# ─── Aggiornamento revision log ────────────────────────────────────────────────`

			`def update_revision_log(`
			`log_path: Path,`
			`stem: str,`
			`profile_before: dict,`
			`profile_after: dict,`
			`t_stats: dict,`
			`) -> None:`
			`header_exists = log_path.exists() and log_path.stat().st_size > 0`

			`avv = profile_after.get("avvertenze", [])`
			`avv_str = "; ".join(avv) if avv else "nessuna"`

			`entry = f"""`
			`## {stem} — {date.today().isoformat()}`

			`Trasformazioni automatiche:`
			`- Normalizzazione whitespace multiplo e righe vuote`
			`- Blocco TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}`
			`- Righe ALL-CAPS → ## header: {t_stats['n_header_allcaps']}`
			`- Sezioni numerate → ### header: {t_stats['n_sezioni_numerate']}`
			`- Paragrafi uniti (salti pagina PDF): {t_stats['n_paragrafi_uniti']}`
			`- Livello struttura: {profile_before.get('livello_struttura', '?')} → {profile_after.get('livello_struttura', '?')}`

			`Avvertenze residue: {avv_str}`

			`Revisioni manuali pendenti:`
			`- [ ] Verificare conversioni ALL-CAPS errate`
			`- [ ] Controllare sezioni troppo corte o troppo lunghe`
			`"""`

			`if not header_exists:`
			`log_path.write_text("# Revision log\n" + entry, encoding="utf-8")`
			`else:`
			`existing = log_path.read_text(encoding="utf-8")`
			`log_path.write_text(existing + entry, encoding="utf-8")`


			`# ─── Per-document processing ─────────────────────────────────────────────────`

			`def process_stem(stem: str, project_root: Path, force: bool) -> bool:`
			`src_dir = project_root / "step-3" / stem`
			`out_dir = project_root / "step-4" / stem`
			`raw_src = src_dir / "raw.md"`
			`clean_src = src_dir / "clean.md"`
			`profile_src = src_dir / "structure_profile.json"`
			`clean_out = out_dir / "clean.md"`
			`profile_out = out_dir / "structure_profile.json"`

			`print(f"\nDocumento: {stem}")`

			`if not clean_src.exists():`
			`print(f" ✗ clean.md non trovato in step-3/{stem}/ — skip")`
			`return False`

			`if clean_out.exists() and not force:`
			`print(f" ⚠️ clean.md già presente — skip")`
			`print(f" (usa --force per rieseguire)")`
			`return True`

			`out_dir.mkdir(parents=True, exist_ok=True)`

			`# Copia raw.md immutabile (riferimento)`
			`if raw_src.exists():`
			`shutil.copy2(raw_src, out_dir / "raw.md")`
			`print(f" Copiato raw.md da step-3/{stem}/")`

			`# Leggi profilo step-3 (per confronto nel report)`
			`profile_before: dict = {}`
			`if profile_src.exists():`
			`profile_before = json.loads(profile_src.read_text(encoding="utf-8"))`

			`# Applica trasformazioni`
			`print(f" Applicazione trasformazioni strutturali...")`
			`text = clean_src.read_text(encoding="utf-8")`
			`text_revised, t_stats = apply_transforms(text)`

			`# Salva clean.md revisionato`
			`clean_out.write_text(text_revised, encoding="utf-8")`

			`# Ricalcola profilo sul nuovo clean.md`
			`profile_after = analyze(clean_out)`
			`profile_out.write_text(`
			`json.dumps(profile_after, ensure_ascii=False, indent=2),`
			`encoding="utf-8",`
			`)`

			`# Report`
			`lv_b = profile_before.get("livello_struttura", "?")`
			`lv_a = profile_after["livello_struttura"]`
			`_STRAT = {3: "h3_aware", 2: "h2_paragraph_split", 1: "paragraph", 0: "sliding_window"}`
			`print(f" ✅ Livello struttura: {lv_b} → {lv_a} ({_STRAT.get(lv_a, '?')})")`
			`print(f" h2: {profile_before.get('n_h2','?')} → {profile_after['n_h2']}")`
			`print(f" h3: {profile_before.get('n_h3','?')} → {profile_after['n_h3']}")`
			`print(f" TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}")`
			`print(f" Righe ALL-CAPS → ##: {t_stats['n_header_allcaps']}")`
			`print(f" Sezioni numerate → ###: {t_stats['n_sezioni_numerate']}")`
			`print(f" Paragrafi uniti (salti pagina): {t_stats['n_paragrafi_uniti']}")`
			`for w in profile_after["avvertenze"]:`
			`print(f" ⚠️ {w}")`

			`# Aggiorna revision log (direttamente in step-4/, non in sottocartella)`
			`log_path = project_root / "step-4" / "revision_log.md"`
			`update_revision_log(log_path, stem, profile_before, profile_after, t_stats)`
			`print(f" ✅ step-4/revision_log.md aggiornato")`
			`print(f" ✅ structure_profile.json salvato")`
			`return True`


			`# ─── Entry point ─────────────────────────────────────────────────────────────`

			`if __name__ == "__main__":`
			`project_root = Path(__file__).parent.parent`

			`parser = argparse.ArgumentParser(description="Step 4 — Revisione automatica Markdown")`
			`parser.add_argument("--stem", help="Nome del documento (sottocartella di step-3/)")`
			`parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")`
			`args = parser.parse_args()`

			`if args.stem:`
			`stems = [args.stem]`
			`else:`
			`step3_dir = project_root / "step-3"`
			`if not step3_dir.exists():`
			`print(f"Errore: cartella step-3/ non trovata in {project_root}")`
			`sys.exit(1)`
			`stems = sorted(p.name for p in step3_dir.iterdir() if p.is_dir())`
			`if not stems:`
			`print(f"Errore: nessun documento trovato in step-3/")`
			`sys.exit(1)`

			`results = [process_stem(s, project_root, args.force) for s in stems]`

			`ok = sum(results)`
			`total = len(results)`
			`print(f"\n{'✅' if all(results) else '⚠️ '} {ok}/{total} documenti revisionati")`

			`sys.exit(0 if all(results) else 1)`