ebd2a43f84
Porta da main la riscrittura completa di conversione/_pipeline/ (9 stadi PyMuPDF) e la suite tests/ senza modificare chunks/, step-8/, rag.py, ollama/, retrieve.py, config.py. requirements.txt: aggiunge PyMuPDF>=1.24.0 e pytest>=8.0, mantiene chromadb, rimuove opendataloader-pdf e pymupdf4llm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
45 lines
1.3 KiB
Python
45 lines
1.3 KiB
Python
"""Strutture dati intermedie della pipeline: Block, Section, FontProfile."""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class Block:
    """Intermediate pipeline record for one block of text and its font traits.

    The four read-only properties ``x0``, ``y0``, ``x1`` and ``y1`` expose the
    individual coordinates stored in ``bbox``.
    """

    text: str
    page: int  # NOTE(review): 0- or 1-based is not visible here; confirm
    bbox: tuple[float, float, float, float]  # x0, y0, x1, y1
    font_size: float
    font_name: str
    is_bold: bool
    # One of: paragraph | header_candidate | list_item | table | ignore
    block_type: str = "paragraph"
    # NOTE(review): presumably the gap above this block; units not shown here
    space_before: float = 0.0
    level: int = 0  # assigned by stage5 (0 = not a header)
    # Fresh list per instance; left out of repr().
    # NOTE(review): presumably the raw spans the block was built from; confirm
    origin_spans: list[dict] = field(default_factory=list, repr=False)

    @property
    def x0(self) -> float:
        """Return the first ``bbox`` coordinate (x0)."""
        return self.bbox[0]

    @property
    def y0(self) -> float:
        """Return the second ``bbox`` coordinate (y0)."""
        return self.bbox[1]

    @property
    def x1(self) -> float:
        """Return the third ``bbox`` coordinate (x1)."""
        return self.bbox[2]

    @property
    def y1(self) -> float:
        """Return the fourth ``bbox`` coordinate (y1)."""
        return self.bbox[3]
|
|
|
|
|
|
@dataclass
class Section:
    """Node of a section tree: a title, a level, content blocks and children.

    ``content`` and ``children`` each default to a fresh empty list per
    instance, so sections never share those containers.
    """

    title: str
    level: int  # 1, 2, 3
    content: list[Block] = field(default_factory=list)
    children: list[Section] = field(default_factory=list)
    page_start: int = 0
    # Left out of repr().
    # NOTE(review): presumably the header Block this section came from; confirm
    source_block: Block | None = field(default=None, repr=False)
|
|
|
|
|
|
@dataclass
class FontProfile:
    """Font-size summary: body size, size-to-level map and header sizes.

    All three fields are required; none has a default.
    """

    body_size: float
    cluster_map: dict[float, int]  # rounded font_size -> level (1/2/3)
    header_sizes: list[float]  # candidate header sizes, sorted descending
|