rag-from-scratch/step-4/revise.py

#!/usr/bin/env python3
"""
Step 4 — Revisione automatica del Markdown

Trasforma clean.md da step-3 rivelando la struttura latente del documento.
Le trasformazioni sono euristiche universali che funzionano su qualsiasi PDF:

  - Normalizza whitespace multiplo (artefatto PDF)
  - Riduce righe vuote multiple
  - Rimuove marcatori **bold** nelle intestazioni esistenti
  - Converte righe ALL-CAPS standalone → ## header (euristico, qualsiasi lingua)
  - Converte sezioni numerate "N.  testo" → ### N. (qualsiasi numerazione)
  - Rimuove blocchi TOC (righe che iniziano con parole-chiave indice)

Per ogni documento viene ricalcolato il profilo strutturale: il livello può
salire (es. livello 1 → 3) se le strutture latenti vengono rilevate.

Output in step-4/<stem>/:
  raw.md                  — copia da step-3 (non modificare mai)
  clean.md                — MD revisionato
  structure_profile.json  — profilo aggiornato dopo la revisione

Uso:
    python step-4/revise.py                    # tutti i documenti in step-3/
    python step-4/revise.py --stem nietzsche   # un solo documento
    python step-4/revise.py --force            # riesegui anche se già presente
"""

import argparse
import json
import re
import shutil
import sys
from datetime import date
from pathlib import Path

# Riusa la funzione analyze() già scritta nello step 3
sys.path.insert(0, str(Path(__file__).parent.parent / "step-3"))
from detect_structure import analyze  # noqa: E402


# ─── Costanti ─────────────────────────────────────────────────────────────────

# Keywords that identify a table-of-contents line (such lines are removed).
_TOC_KEYWORDS = frozenset({
    "indice", "index", "contents", "table of contents",
    "sommario", "inhaltsverzeichnis", "inhalt",
})

# Prepositions/articles that should stay lowercase when title-casing.
_STOP_IT_EN = frozenset((
    # Italian
    "di del della dei delle da in e il la "
    "lo le gli un una per a al alla ai "
    "alle con su sul sulla che o "
    # English
    "of the a an and or but in on at "
    "to for with by from as"
).split())

# Italian ordinals → Roman numerals (for headings such as "CAPITOLO PRIMO").
# Insertion order is kept: the keys are joined into a regex alternation.
_ORDINALS_IT = dict(zip(
    ("PRIMO", "SECONDO", "TERZO", "QUARTO", "QUINTO",
     "SESTO", "SETTIMO", "OTTAVO", "NONO", "DECIMO"),
    ("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"),
))

# English number words → Arabic digits (for headings such as "CHAPTER ONE").
_ORDINALS_EN = {
    word: str(position)
    for position, word in enumerate(
        ("ONE", "TWO", "THREE", "FOUR", "FIVE",
         "SIX", "SEVEN", "EIGHT", "NINE", "TEN"),
        start=1,
    )
}


# ─── Utilità ──────────────────────────────────────────────────────────────────

def _sentence_case(s: str) -> str:
    """
    Return *s* in sentence case: everything lowercased, then the first
    character uppercased. Falsy input (e.g. the empty string) is returned
    unchanged.
    """
    if s:
        lowered = s.lower()
        return f"{lowered[0].upper()}{lowered[1:]}"
    return s


def _is_allcaps_line(line: str) -> bool:
    """
    Tell whether *line* is a candidate for conversion to a ## header.

    A candidate has at least three alphabetic characters, every alphabetic
    character is uppercase, and the stripped line does not already start
    with '#'. Non-alphabetic characters (digits, punctuation) are ignored.
    """
    text = line.strip()
    if text.startswith("#"):
        return False
    alpha = [ch for ch in text if ch.isalpha()]
    if len(alpha) < 3:
        return False
    return all(ch.isupper() for ch in alpha)


def _allcaps_to_header(raw_line: str) -> str:
    """
    Convert an ALL-CAPS line into a sentence-case ``##`` header.

    The line is stripped and one run of trailing '.' followed by one run of
    trailing '?' is removed. Patterns are then tried in this order:

      1. ``CAPITOLO <ordinal>. TITLE``  → ``## Capitolo <roman> — Title``
         ``CAPITOLO <ordinal>``         → ``## Capitolo <roman>``
      2. ``CHAPTER <word|digits>[.] TITLE`` → ``## Chapter <n> — Title``
         ``CHAPTER <word|digits>``          → ``## Chapter <n>``
      3. ``<roman|digits>. TITLE``      → ``## <n>. Title``
      4. anything else                  → ``## <sentence-cased text>``

    Ordinals are looked up in ``_ORDINALS_IT`` / ``_ORDINALS_EN``.

    Args:
        raw_line: the candidate line (expected to be a single line).

    Returns:
        The Markdown header line, without a trailing newline.
    """
    text = raw_line.strip().rstrip('.').rstrip('?').strip()

    # ── Italian pattern: "CAPITOLO PRIMO. TITOLO DEL CAPITOLO"
    ord_it_pat = "|".join(_ORDINALS_IT.keys())
    m = re.match(rf'^CAPITOLO ({ord_it_pat})\. (.+)', text)
    if m:
        roman = _ORDINALS_IT[m.group(1)]
        titolo = m.group(2).rstrip('.').rstrip('?').strip()
        return f"## Capitolo {roman} — {_sentence_case(titolo)}"
    # Bare heading with no title ("CAPITOLO PRIMO"): previously this fell
    # through to the generic branch and the ordinal was merely lowercased.
    # fullmatch keeps words such as "PRIMOGENITO" from matching.
    m = re.fullmatch(rf'CAPITOLO ({ord_it_pat})', text)
    if m:
        return f"## Capitolo {_ORDINALS_IT[m.group(1)]}"

    # ── English pattern: "CHAPTER ONE. TITLE" or "CHAPTER 1. TITLE"
    ord_en_pat = "|".join(_ORDINALS_EN.keys())
    m = re.match(rf'^CHAPTER ({ord_en_pat}|\d+)\.? (.+)', text)
    if m:
        # Number words map to digits; plain digits pass through unchanged.
        n = _ORDINALS_EN.get(m.group(1), m.group(1))
        titolo = m.group(2).rstrip('.').rstrip('?').strip()
        return f"## Chapter {n} — {_sentence_case(titolo)}"
    # Bare heading with no title ("CHAPTER ONE", "CHAPTER 12").
    m = re.fullmatch(rf'CHAPTER ({ord_en_pat}|\d+)', text)
    if m:
        return f"## Chapter {_ORDINALS_EN.get(m.group(1), m.group(1))}"

    # ── Generic pattern with a Roman or Arabic numeral prefix; the numeral
    #    is kept verbatim so Roman numerals are not lowercased.
    m = re.match(r'^([IVXLCDM]+|[0-9]+)\. (.+)', text)
    if m:
        n = m.group(1)
        titolo = m.group(2).rstrip('.').strip()
        return f"## {n}. {_sentence_case(titolo)}"

    # ── Fallback: all-caps text with no recognisable pattern
    return f"## {_sentence_case(text)}"


def _is_toc_line(line: str) -> bool:
    """
    Tell whether *line* opens a table-of-contents block.

    The text before the first '.' (or the whole line if there is none) is
    stripped, lowercased and looked up in ``_TOC_KEYWORDS``.
    """
    head, _, _ = line.strip().partition('.')
    return head.strip().lower() in _TOC_KEYWORDS


# ─── Trasformazioni ────────────────────────────────────────────────────────────

def apply_transforms(text: str) -> tuple[str, dict]:
    """
    Apply every structural transformation to the Markdown text.

    Steps, in order:
      1.  strip ``**bold**`` markers wrapping the text of existing headers;
      1b. sentence-case existing headers whose letters are all uppercase;
      2.  drop table-of-contents lines (see ``_is_toc_line``);
      3.  turn ALL-CAPS lines into ``##`` headers;
      4.  turn numbered lines ("12. text", "65 a. text") into a "### 12."
          header followed by the text as its own paragraph;
      5.  re-join paragraphs that look split mid-sentence, never extending
          a block whose first or last line is a header;
      6.  collapse runs of spaces inside non-blank lines;
      7.  collapse three or more consecutive newlines into two.

    Args:
        text: the Markdown document as a single string.

    Returns:
        ``(revised_text, stats)`` where ``stats`` has the keys
        ``toc_rimosso`` (bool), ``n_header_allcaps``,
        ``n_sezioni_numerate`` and ``n_paragrafi_uniti`` (ints).
    """
    stats = {
        "toc_rimosso": False,
        "n_header_allcaps": 0,
        "n_sezioni_numerate": 0,
        "n_paragrafi_uniti": 0,
    }

    # ── 1. Strip **bold** markers wrapping the text of existing headers
    #       ## **Titolo** → ## Titolo
    text = re.sub(
        r'^(#{1,6})\s+\*\*(.+?)\*\*\s*$',
        r'\1 \2',
        text, flags=re.MULTILINE,
    )

    # ── 1b. Normalise existing headers with ALL-CAPS content → sentence case
    #        ## AL DI LA' DEL BENE E DEL MALE → ## Al di la' del bene e del male
    def _norm_allcaps_header(m: re.Match) -> str:
        hashes = m.group(1)
        content = m.group(2).strip()
        letters = [c for c in content if c.isalpha()]
        if letters and all(c.isupper() for c in letters):
            return f"{hashes} {_sentence_case(content)}"
        # Mixed-case headers are left exactly as they were.
        return m.group(0)

    text = re.sub(
        r'^(#{1,6}) (.+)$',
        _norm_allcaps_header,
        text, flags=re.MULTILINE,
    )

    # ── 2. Remove TOC lines (keyword plus any inline content on the same line)
    #       "INDICE. Capitolo 1 Capitolo 2 ..."  → removed
    lines = text.split('\n')
    new_lines = []
    for line in lines:
        if _is_toc_line(line):
            stats["toc_rimosso"] = True
        else:
            new_lines.append(line)
    text = '\n'.join(new_lines)

    # ── 3. Convert ALL-CAPS lines → ## header
    #       A block (text between blank lines) made of a single ALL-CAPS
    #       line is converted as a whole. In any other block each line is
    #       checked on its own and must also be longer than 3 characters
    #       once stripped.
    blocks = text.split('\n\n')
    new_blocks = []
    for block in blocks:
        stripped = block.strip()
        if '\n' not in stripped and _is_allcaps_line(stripped):
            new_blocks.append(_allcaps_to_header(stripped))
            stats["n_header_allcaps"] += 1
        else:
            sub_lines = block.split('\n')
            converted = []
            for ln in sub_lines:
                if _is_allcaps_line(ln) and len(ln.strip()) > 3:
                    converted.append(_allcaps_to_header(ln))
                    stats["n_header_allcaps"] += 1
                else:
                    converted.append(ln)
            new_blocks.append('\n'.join(converted))
    text = '\n\n'.join(new_blocks)

    # ── 4. Convert numbered sections "N. text" → "### N.\n\ntext"
    #       The number must open the line and be followed by a dot and at
    #       least one whitespace character (the pattern uses \s+).
    def _num_repl(m: re.Match) -> str:
        num = m.group(1)
        testo = m.group(2).strip()
        stats["n_sezioni_numerate"] += 1
        return f"### {num}.\n\n{testo}"

    # Standard pattern: "1.  text" or "1. text"
    text = re.sub(
        r'^(\d+)\.\s+(.+)$',
        _num_repl,
        text, flags=re.MULTILINE,
    )

    # Letter-suffix pattern: "65 a. text" or "65a. text" → "### 65a."
    def _num_letter_repl(m: re.Match) -> str:
        num = m.group(1) + m.group(2)
        testo = m.group(3).strip()
        stats["n_sezioni_numerate"] += 1
        return f"### {num}.\n\n{testo}"

    text = re.sub(
        r'^(\d+)\s*([a-z])\.\s+(.+)$',
        _num_letter_repl,
        text, flags=re.MULTILINE,
    )

    # ── 5. Re-join paragraphs split by PDF page breaks
    #       Block A is extended with block B when A does not end with
    #       sentence-closing punctuation, and B is neither empty, nor a
    #       header, nor the start of a numbered item ("12.").
    #       Merging never crosses a ##/### boundary: A is left alone when
    #       its first OR its last line is a header. The last-line check
    #       matters because step 3 can convert an ALL-CAPS line at the end
    #       of a multi-line block; without it the following paragraph
    #       would be appended to the header line itself.
    _SENTENCE_END = set('.?!»)\'"')
    blocks = text.split('\n\n')
    merged = []
    i = 0
    while i < len(blocks):
        b = blocks[i]
        stripped = b.strip()
        # Keep absorbing following blocks while the sentence looks unfinished.
        while (
            i + 1 < len(blocks)
            and stripped
            and not stripped.startswith('#')
            and not stripped.rsplit('\n', 1)[-1].lstrip().startswith('#')
            and stripped[-1] not in _SENTENCE_END
        ):
            nxt = blocks[i + 1].strip()
            # Do not merge when the next block is empty or a header.
            if not nxt or nxt.startswith('#'):
                break
            # Do not merge when the next block starts with digits and a dot
            # (it would be a numbered item that step 4 did not convert).
            if re.match(r'^\d+\.', nxt):
                break
            b = stripped + ' ' + nxt
            stripped = b.strip()
            stats["n_paragrafi_uniti"] += 1
            i += 1
        merged.append(b)
        i += 1
    text = '\n\n'.join(merged)

    # ── 6. Collapse runs of two or more spaces inside non-blank lines
    #       "parola  parola" → "parola parola"  (headers included)
    lines = text.split('\n')
    normalized = []
    for line in lines:
        if not line.strip():
            # Blank / whitespace-only lines are kept untouched.
            normalized.append(line)
        else:
            normalized.append(re.sub(r'  +', ' ', line))
    text = '\n'.join(normalized)

    # ── 7. Reduce three or more consecutive newlines to exactly two
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text, stats


# ─── Aggiornamento revision log ────────────────────────────────────────────────

def update_revision_log(
    log_path: Path,
    stem: str,
    profile_before: dict,
    profile_after: dict,
    t_stats: dict,
) -> None:
    """
    Append one dated entry for *stem* to the Markdown revision log.

    If the log is missing or empty, a "# Revision log" title is written
    first. The entry is appended in place rather than re-reading and
    rewriting the whole file.

    Args:
        log_path: path of the log file; its parent directory must exist.
        stem: document name used in the entry heading.
        profile_before: profile prior to revision; only
            ``livello_struttura`` is read ("?" when absent).
        profile_after: profile after revision; ``livello_struttura`` and
            ``avvertenze`` (a list of strings) are read, both optional.
        t_stats: transformation statistics; must contain ``toc_rimosso``,
            ``n_header_allcaps``, ``n_sezioni_numerate`` and
            ``n_paragrafi_uniti``.

    Raises:
        KeyError: if one of the required ``t_stats`` keys is missing.
    """
    # A missing or zero-length file still needs the top-level title.
    needs_title = not (log_path.exists() and log_path.stat().st_size > 0)

    avv = profile_after.get("avvertenze", [])
    avv_str = "; ".join(avv) if avv else "nessuna"

    entry = f"""
## {stem} — {date.today().isoformat()}

**Trasformazioni automatiche:**
- Normalizzazione whitespace multiplo e righe vuote
- Blocco TOC rimosso: {'sì' if t_stats['toc_rimosso'] else 'no'}
- Righe ALL-CAPS → ## header: {t_stats['n_header_allcaps']}
- Sezioni numerate → ### header: {t_stats['n_sezioni_numerate']}
- Paragrafi uniti (salti pagina PDF): {t_stats['n_paragrafi_uniti']}
- Livello struttura: {profile_before.get('livello_struttura', '?')} → {profile_after.get('livello_struttura', '?')}

**Avvertenze residue:** {avv_str}

**Revisioni manuali pendenti:**
- [ ] Verificare conversioni ALL-CAPS errate
- [ ] Controllare sezioni troppo corte o troppo lunghe
"""

    # Append mode creates the file when needed and leaves earlier entries
    # untouched.
    with log_path.open("a", encoding="utf-8") as log_file:
        if needs_title:
            log_file.write("# Revision log\n")
        log_file.write(entry)


# ─── Per-document processing ─────────────────────────────────────────────────

def process_stem(stem: str, project_root: Path, force: bool) -> bool:
    """
    Revise one document: read ``step-3/<stem>/`` and write ``step-4/<stem>/``.

    Copies ``raw.md`` if present, writes the revised ``clean.md``, recomputes
    ``structure_profile.json`` via ``analyze`` and appends an entry to
    ``step-4/revision_log.md``. Progress is printed to stdout.

    Args:
        stem: document name (sub-directory of ``step-3/``).
        project_root: directory containing ``step-3/`` and ``step-4/``.
        force: re-run even if ``step-4/<stem>/clean.md`` already exists.

    Returns:
        False when ``step-3/<stem>/clean.md`` is missing; True otherwise,
        including when the document is skipped because output already exists.
    """
    step3_dir = project_root / "step-3" / stem
    step4_dir = project_root / "step-4" / stem
    source_md = step3_dir / "clean.md"
    revised_md = step4_dir / "clean.md"

    print(f"\nDocumento: {stem}")

    if not source_md.exists():
        print(f"  ✗ clean.md non trovato in step-3/{stem}/ — skip")
        return False

    if not force and revised_md.exists():
        print("  ⚠️  clean.md già presente — skip")
        print("       (usa --force per rieseguire)")
        return True

    step4_dir.mkdir(parents=True, exist_ok=True)

    # Keep an untouched copy of raw.md next to the revised output.
    raw_md = step3_dir / "raw.md"
    if raw_md.exists():
        shutil.copy2(raw_md, step4_dir / "raw.md")
        print(f"  Copiato raw.md da step-3/{stem}/")

    # The step-3 profile is only used for the before/after comparison.
    old_profile_path = step3_dir / "structure_profile.json"
    profile_before: dict = (
        json.loads(old_profile_path.read_text(encoding="utf-8"))
        if old_profile_path.exists()
        else {}
    )

    print("  Applicazione trasformazioni strutturali...")
    text_revised, t_stats = apply_transforms(source_md.read_text(encoding="utf-8"))
    revised_md.write_text(text_revised, encoding="utf-8")

    # Recompute the profile from the file just written. analyze() comes from
    # step-3's detect_structure; this function reads its keys
    # livello_struttura, n_h2, n_h3 and avvertenze.
    profile_after = analyze(revised_md)
    serialized = json.dumps(profile_after, ensure_ascii=False, indent=2)
    (step4_dir / "structure_profile.json").write_text(serialized, encoding="utf-8")

    # Console report
    level_before = profile_before.get("livello_struttura", "?")
    level_after = profile_after["livello_struttura"]
    strategies = {3: "h3_aware", 2: "h2_paragraph_split", 1: "paragraph", 0: "sliding_window"}
    print(f"  ✅ Livello struttura: {level_before} → {level_after}  ({strategies.get(level_after, '?')})")
    for key in ("n_h2", "n_h3"):
        # key[2:] is "h2" / "h3", the label shown in the report.
        print(f"     {key[2:]}: {profile_before.get(key, '?')} → {profile_after[key]}")
    toc_label = 'sì' if t_stats['toc_rimosso'] else 'no'
    print(f"     TOC rimosso: {toc_label}")
    print(f"     Righe ALL-CAPS → ##: {t_stats['n_header_allcaps']}")
    print(f"     Sezioni numerate → ###: {t_stats['n_sezioni_numerate']}")
    print(f"     Paragrafi uniti (salti pagina): {t_stats['n_paragrafi_uniti']}")
    for warning in profile_after["avvertenze"]:
        print(f"     ⚠️  {warning}")

    # The revision log lives directly in step-4/, shared by all documents.
    update_revision_log(
        project_root / "step-4" / "revision_log.md",
        stem, profile_before, profile_after, t_stats,
    )
    print("  ✅ step-4/revision_log.md aggiornato")
    print("  ✅ structure_profile.json salvato")
    return True


# ─── Entry point ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Command-line entry point: revise one document (--stem) or every
    # sub-directory of step-3/, then exit 0 only if all of them succeeded.
    parser = argparse.ArgumentParser(description="Step 4 — Revisione automatica Markdown")
    parser.add_argument("--stem", help="Nome del documento (sottocartella di step-3/)")
    parser.add_argument("--force", action="store_true", help="Riesegui anche se già presente")
    args = parser.parse_args()

    # The script lives in <project_root>/step-4/.
    project_root = Path(__file__).parent.parent

    stems = [args.stem] if args.stem else None
    if stems is None:
        step3_dir = project_root / "step-3"
        if not step3_dir.exists():
            print(f"Errore: cartella step-3/ non trovata in {project_root}")
            sys.exit(1)
        stems = sorted(entry.name for entry in step3_dir.iterdir() if entry.is_dir())
        if not stems:
            print("Errore: nessun documento trovato in step-3/")
            sys.exit(1)

    results = []
    for name in stems:
        results.append(process_stem(name, project_root, args.force))

    all_ok = all(results)
    ok = sum(results)  # True counts as 1
    total = len(results)
    print(f"\n{'✅' if all_ok else '⚠️ '} {ok}/{total} documenti revisionati")

    sys.exit(0 if all_ok else 1)