From 2c0b7a462e980264617a81bdab3f6f4085e2a464 Mon Sep 17 00:00:00 2001
From: Davide Grilli <davide.grilli@outlook.com>
Date: Thu, 7 May 2026 14:58:09 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20migliora=20pipeline=20PDF=E2=86=92MD=20?=
 =?UTF-8?q?per=20RAG=20=E2=80=94=20frontmatter=20e=20page=20marker?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- extract.py: aggiunge extract_metadata() — title, author, year, pages via fitz
- extract.py: aggiunge markdown_page_separator con <!-- page: N --> tra pagine
- extract.py: aggiunge replace_invalid_chars=" " per testo più pulito
- runner.py: prepend YAML frontmatter (source/title/author/year/pages) al clean.md
- runner.py: mostra title e author rilevati durante validazione

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 conversione/_pipeline/extract.py | 55 +++++++++++++++++++++++++++-----
 conversione/_pipeline/runner.py  | 24 ++++++++++++--
 2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/conversione/_pipeline/extract.py b/conversione/_pipeline/extract.py
index 4876d85..a3018ac 100644
--- a/conversione/_pipeline/extract.py
+++ b/conversione/_pipeline/extract.py
@@ -1,4 +1,5 @@
-"""Estrazione PDF: verifica dipendenze, validazione, conversione → raw Markdown."""
+"""Estrazione PDF: verifica dipendenze, validazione, metadati, conversione → raw Markdown."""
+import re
 import subprocess
 import sys
 from pathlib import Path
@@ -76,6 +77,40 @@ def validate_pdf(pdf_path: Path) -> tuple[bool, str]:
         return False, f"Impossibile aprire: {e}"
 
 
+# ─── Metadati PDF ────────────────────────────────────────────────────────────
+
+def extract_metadata(pdf_path: Path) -> dict:
+    """
+    Extract title, author, year and page count from the PDF via fitz.
+
+    Best effort: the returned dict always has the keys source, title, author,
+    year and pages; text fields fall back to "" and pages to 0 whenever the
+    document (or a single field) cannot be read.
+    """
+    info = {"source": pdf_path.name, "title": "", "author": "", "year": "", "pages": 0}
+    try:
+        import fitz
+        # The context manager closes the document even if reading it raises.
+        with fitz.open(str(pdf_path)) as doc:
+            meta = doc.metadata or {}  # guard: metadata may be None, not a dict
+            info["pages"] = len(doc)
+    except Exception:
+        return info
+
+    def _clean(s) -> str:
+        return s.strip() if isinstance(s, str) else ""
+
+    info["title"] = _clean(meta.get("title"))
+    info["author"] = _clean(meta.get("author"))
+    # A missing/None creationDate must not reach re.match: the TypeError used
+    # to hit the blanket except and throw away title, author and pages too.
+    creation = meta.get("creationDate")
+    m = re.match(r"D:(\d{4})", creation) if isinstance(creation, str) else None
+    if m:
+        info["year"] = m.group(1)
+    return info
+
+
 # ─── Conversione PDF → Markdown ───────────────────────────────────────────────
 
 def _is_tagged_pdf(pdf_path: Path) -> bool:
@@ -94,13 +129,15 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
     Converte il PDF in Markdown tramite opendataloader-pdf (XY-Cut++).
 
     Parametri per output RAG-ottimale:
-      keep_line_breaks=False   → testo fluente, elimina hard-wrap del PDF
-      reading_order="xycut"    → ricostruisce ordine di lettura multi-colonna
-      sanitize=False           → preserva il testo originale senza filtri
-      image_output="off"       → nessuna immagine estratta né referenziata
-      table_method="cluster"   → rileva tabelle anche senza bordi visibili
-      content_safety_off       → non scarta footnote (tiny) né layer OCG nascosti
-      use_struct_tree          → attivo solo per PDF taggati (Word/InDesign)
+      keep_line_breaks=False        → testo fluente, elimina hard-wrap del PDF
+      reading_order="xycut"         → ricostruisce ordine di lettura multi-colonna
+      sanitize=False                → preserva il testo originale senza filtri
+      image_output="off"            → nessuna immagine estratta né referenziata
+      table_method="cluster"        → rileva tabelle anche senza bordi visibili
+      content_safety_off            → non scarta footnote (tiny) né layer OCG nascosti
+      use_struct_tree               → attivo solo per PDF taggati (Word/InDesign)
+      markdown_page_separator       → inserisce separatore + marker pagina tra pagine
+      replace_invalid_chars         → sostituisce caratteri non validi con spazio
     """
     import opendataloader_pdf
 
@@ -118,6 +155,8 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
         table_method="cluster",
         content_safety_off=["tiny", "hidden-ocg"],
         use_struct_tree=tagged,
+        markdown_page_separator="\n\n---\n<!-- page: %page-number% -->\n\n",
+        replace_invalid_chars=" ",
         quiet=True,
     )
 
diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py
index 07b9b2b..6768f25 100644
--- a/conversione/_pipeline/runner.py
+++ b/conversione/_pipeline/runner.py
@@ -5,7 +5,7 @@ import threading
 import time
 from pathlib import Path
 
-from .extract    import validate_pdf, convert_pdf
+from .extract    import validate_pdf, convert_pdf, extract_metadata
 from ._apply     import apply_transforms
 from .structure  import analyze
 from .report     import build_report
@@ -14,6 +14,20 @@ from .validator  import _score, _grade
 
 _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}
 
+
+def _build_frontmatter(meta: dict) -> str:
+    """Render ``meta`` as a YAML frontmatter block, omitting empty fields."""
+    import json
+    # Strings are written as JSON literals, which are valid YAML double-quoted
+    # scalars: a quote, backslash, ":" or "#" coming from the PDF metadata or
+    # from the file name can no longer produce a broken frontmatter block.
+    lines = ["---", f"source: {json.dumps(meta['source'], ensure_ascii=False)}"]
+    for key in ("title", "author"):
+        if meta[key]:
+            lines.append(f"{key}: {json.dumps(meta[key], ensure_ascii=False)}")
+    lines += [f"{key}: {meta[key]}" for key in ("year", "pages") if meta[key]]
+    return "\n".join(lines + ["---", ""]) + "\n"
+
 _SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
 
 
@@ -62,7 +76,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
         print(f"      (usa --force per rieseguire)")
         return True
 
-    # [1] Validazione
+    # [1] Validazione + metadati
     print("  [1/4] Validazione PDF...")
     pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0
     print(f"     File: {pdf_path.name}  ({pdf_mb:.1f} MB)")
@@ -71,6 +85,11 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
         print(f"  ✗ {msg}")
         return False
     print(f"  ✅ {msg}")
+    meta = extract_metadata(pdf_path)
+    if meta["title"]:
+        print(f"     Titolo:  {meta['title']}")
+    if meta["author"]:
+        print(f"     Autore:  {meta['author']}")
 
     # [2] Conversione
     print("  [2/4] Conversione PDF → Markdown (opendataloader-pdf)...")
@@ -106,6 +125,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
     clean_text, t = apply_transforms(raw_text, on_step=_on_step)
     sys.stdout.write("\r" + " " * 72 + "\r")
     sys.stdout.flush()
+    clean_text = _build_frontmatter(meta) + clean_text
     reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
     print(f"  ✅ Encoding")
     print(f"     Simboli PUA corretti:  {t['n_simboli_pua_corretti']}")