feat: migliora pipeline PDF→MD per RAG — frontmatter e page marker

- extract.py: aggiunge extract_metadata() — title, author, year, pages via fitz
- extract.py: aggiunge markdown_page_separator con separatore `---` e marker di pagina (commento HTML "page: N") tra pagine
- extract.py: aggiunge replace_invalid_chars=" " per testo più pulito
- runner.py: prepend YAML frontmatter (source/title/author/year/pages) al clean.md
- runner.py: mostra title e author rilevati durante validazione

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 14:58:09 +02:00
parent 6e755c0b6c
commit 2c0b7a462e
2 changed files with 69 additions and 10 deletions
@@ -1,4 +1,5 @@
-"""Estrazione PDF: verifica dipendenze, validazione, conversione → raw Markdown."""
+"""Estrazione PDF: verifica dipendenze, validazione, metadati, conversione → raw Markdown."""
+import re
 import subprocess
 import sys
 from pathlib import Path
@@ -76,6 +77,40 @@ def validate_pdf(pdf_path: Path) -> tuple[bool, str]:
        return False, f"Impossibile aprire: {e}"


+# ─── Metadati PDF ────────────────────────────────────────────────────────────
+
+def extract_metadata(pdf_path: Path) -> dict:
+    """Extract title, author, year and page count from a PDF via fitz.
+
+    Best effort: the returned dict always has the keys "source", "title",
+    "author", "year" (strings) and "pages" (int). Failures while opening or
+    reading the PDF are swallowed and the affected fields keep "" or 0.
+    """
+    info = {"source": pdf_path.name, "title": "", "author": "", "year": "", "pages": 0}
+    try:
+        import fitz
+        doc = fitz.open(str(pdf_path))
+        try:
+            meta = doc.metadata or {}  # guard: metadata itself may be None
+            info["pages"] = len(doc)
+        finally:
+            doc.close()  # release the document even if reading it failed
+    except Exception:
+        return info
+
+    def _clean(s) -> str:
+        return s.strip() if isinstance(s, str) else ""
+
+    info["title"] = _clean(meta.get("title"))
+    info["author"] = _clean(meta.get("author"))
+    # The year is read from a "D:YYYY..." creation date; _clean() maps a
+    # missing/None date to "" so re.match cannot raise and lose the other fields.
+    m = re.match(r"D:(\d{4})", _clean(meta.get("creationDate")))
+    if m:
+        info["year"] = m.group(1)
+    return info
+
+
 # ─── Conversione PDF → Markdown ───────────────────────────────────────────────

 def _is_tagged_pdf(pdf_path: Path) -> bool:
@@ -94,13 +129,15 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
    Converte il PDF in Markdown tramite opendataloader-pdf (XY-Cut++).

    Parametri per output RAG-ottimale:
-      keep_line_breaks=False   → testo fluente, elimina hard-wrap del PDF
-      reading_order="xycut"    → ricostruisce ordine di lettura multi-colonna
-      sanitize=False           → preserva il testo originale senza filtri
-      image_output="off"       → nessuna immagine estratta né referenziata
-      table_method="cluster"   → rileva tabelle anche senza bordi visibili
-      content_safety_off       → non scarta footnote (tiny) né layer OCG nascosti
-      use_struct_tree          → attivo solo per PDF taggati (Word/InDesign)
+      keep_line_breaks=False        → testo fluente, elimina hard-wrap del PDF
+      reading_order="xycut"         → ricostruisce ordine di lettura multi-colonna
+      sanitize=False                → preserva il testo originale senza filtri
+      image_output="off"            → nessuna immagine estratta né referenziata
+      table_method="cluster"        → rileva tabelle anche senza bordi visibili
+      content_safety_off            → non scarta footnote (tiny) né layer OCG nascosti
+      use_struct_tree               → attivo solo per PDF taggati (Word/InDesign)
+      markdown_page_separator       → inserisce separatore + marker pagina tra pagine
+      replace_invalid_chars         → sostituisce caratteri non validi con spazio
    """
    import opendataloader_pdf

@@ -118,6 +155,8 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path:
        table_method="cluster",
        content_safety_off=["tiny", "hidden-ocg"],
        use_struct_tree=tagged,
+        markdown_page_separator="\n\n---\n<!-- page: %page-number% -->\n\n",
+        replace_invalid_chars=" ",
        quiet=True,
    )

@@ -5,7 +5,7 @@ import threading
 import time
 from pathlib import Path

-from .extract    import validate_pdf, convert_pdf
+from .extract    import validate_pdf, convert_pdf, extract_metadata
 from ._apply     import apply_transforms
 from .structure  import analyze
 from .report     import build_report
@@ -14,6 +14,20 @@ from .validator  import _score, _grade

 _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"}

+
+def _build_frontmatter(meta: dict) -> str:
+    """Render `meta` as a YAML frontmatter block followed by a blank line.
+
+    String values are written as JSON strings (valid YAML double-quoted scalars)
+    so quotes, backslashes or colons in them cannot break the YAML. Empty
+    title/author/year and zero pages are omitted; "source" is always emitted.
+    """
+    import json  # function-scope import: stdlib, needed only by this helper
+    lines = ["---", f"source: {json.dumps(str(meta['source']), ensure_ascii=False)}"]
+    lines += [f"{k}: {json.dumps(str(meta[k]), ensure_ascii=False)}" for k in ("title", "author") if meta[k]]
+    lines += [f"{k}: {meta[k]}" for k in ("year", "pages") if meta[k]]
+    return "\n".join(lines + ["---", ""]) + "\n"
+
 _SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"


@@ -62,7 +76,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
        print(f"      (usa --force per rieseguire)")
        return True

-    # [1] Validazione
+    # [1] Validazione + metadati
    print("  [1/4] Validazione PDF...")
    pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0
    print(f"     File: {pdf_path.name}  ({pdf_mb:.1f} MB)")
@@ -71,6 +85,11 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
        print(f"  ✗ {msg}")
        return False
    print(f"  ✅ {msg}")
+    meta = extract_metadata(pdf_path)
+    if meta["title"]:
+        print(f"     Titolo:  {meta['title']}")
+    if meta["author"]:
+        print(f"     Autore:  {meta['author']}")

    # [2] Conversione
    print("  [2/4] Conversione PDF → Markdown (opendataloader-pdf)...")
@@ -106,6 +125,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool:
    clean_text, t = apply_transforms(raw_text, on_step=_on_step)
    sys.stdout.write("\r" + " " * 72 + "\r")
    sys.stdout.flush()
+    clean_text = _build_frontmatter(meta) + clean_text
    reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0
    print(f"  ✅ Encoding")
    print(f"     Simboli PUA corretti:  {t['n_simboli_pua_corretti']}")