def extract_metadata(pdf_path: Path) -> dict:
    """Extract title, author, year and page count from a PDF via ``fitz``.

    This is a best-effort helper: it never raises. The returned dict always
    contains the keys ``source``, ``title``, ``author``, ``year`` and
    ``pages``. Text fields are empty strings and ``pages`` is 0 when the
    information is missing or the document cannot be read.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A dict with ``source`` (file name), ``title``, ``author``, ``year``
        (four-digit string taken from the creation date, or ``""``) and
        ``pages`` (int).
    """
    fallback = {
        "source": pdf_path.name,
        "title": "",
        "author": "",
        "year": "",
        "pages": 0,
    }

    try:
        import fitz

        doc = fitz.open(str(pdf_path))
        try:
            # The metadata mapping (or individual values) may be missing.
            meta = doc.metadata or {}
            pages = len(doc)
        finally:
            # Release the file handle even when reading metadata fails.
            doc.close()
    except Exception:
        # Deliberate best-effort: a missing dependency or an unreadable
        # file must not abort the pipeline.
        return fallback

    def _clean(value) -> str:
        """Return a stripped string, or "" for None / non-string values."""
        return value.strip() if isinstance(value, str) else ""

    # Creation dates look like "D:YYYYMMDDHHmmSS..."; only the year is kept.
    # The value is normalised first so a None date cannot discard the
    # title/author that were read successfully.
    year_match = re.match(r"D:(\d{4})", _clean(meta.get("creationDate")))

    return {
        "source": pdf_path.name,
        "title": _clean(meta.get("title")),
        "author": _clean(meta.get("author")),
        "year": year_match.group(1) if year_match else "",
        "pages": pages,
    }
-118,6 +155,8 @@ def convert_pdf(pdf_path: Path, out_dir: Path) -> Path: table_method="cluster", content_safety_off=["tiny", "hidden-ocg"], use_struct_tree=tagged, + markdown_page_separator="\n\n---\n\n\n", + replace_invalid_chars=" ", quiet=True, ) diff --git a/conversione/_pipeline/runner.py b/conversione/_pipeline/runner.py index 07b9b2b..6768f25 100644 --- a/conversione/_pipeline/runner.py +++ b/conversione/_pipeline/runner.py @@ -5,7 +5,7 @@ import threading import time from pathlib import Path -from .extract import validate_pdf, convert_pdf +from .extract import validate_pdf, convert_pdf, extract_metadata from ._apply import apply_transforms from .structure import analyze from .report import build_report @@ -14,6 +14,20 @@ from .validator import _score, _grade _LIVELLO_DESC = {3: "ricca (h3)", 2: "parziale (h2)", 1: "paragrafi", 0: "testo piatto"} + +def _build_frontmatter(meta: dict) -> str: + lines = ["---", f"source: {meta['source']}"] + if meta["title"]: + lines.append(f'title: "{meta["title"]}"') + if meta["author"]: + lines.append(f'author: "{meta["author"]}"') + if meta["year"]: + lines.append(f"year: {meta['year']}") + if meta["pages"]: + lines.append(f"pages: {meta['pages']}") + lines += ["---", ""] + return "\n".join(lines) + "\n" + _SPIN_FRAMES = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏" @@ -62,7 +76,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" (usa --force per rieseguire)") return True - # [1] Validazione + # [1] Validazione + metadati print(" [1/4] Validazione PDF...") pdf_mb = pdf_path.stat().st_size / (1024 * 1024) if pdf_path.exists() else 0 print(f" File: {pdf_path.name} ({pdf_mb:.1f} MB)") @@ -71,6 +85,11 @@ def run(stem: str, project_root: Path, force: bool) -> bool: print(f" ✗ {msg}") return False print(f" ✅ {msg}") + meta = extract_metadata(pdf_path) + if meta["title"]: + print(f" Titolo: {meta['title']}") + if meta["author"]: + print(f" Autore: {meta['author']}") # [2] Conversione print(" [2/4] Conversione PDF → Markdown 
(opendataloader-pdf)...") @@ -106,6 +125,7 @@ def run(stem: str, project_root: Path, force: bool) -> bool: clean_text, t = apply_transforms(raw_text, on_step=_on_step) sys.stdout.write("\r" + " " * 72 + "\r") sys.stdout.flush() + clean_text = _build_frontmatter(meta) + clean_text reduction = 100 * (1 - len(clean_text) / len(raw_text)) if raw_text else 0 print(f" ✅ Encoding") print(f" Simboli PUA corretti: {t['n_simboli_pua_corretti']}")