# rag-from-scratch/conversione/_pipeline/models.py

"""Strutture dati intermedie della pipeline: Block, Section, FontProfile."""
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class Block:
    """A piece of page text together with its position and font attributes.

    ``bbox`` stores the coordinates as ``(x0, y0, x1, y1)``; the ``x0``,
    ``y0``, ``x1`` and ``y1`` properties expose each component by name.
    """

    text: str
    page: int
    # Coordinates in the order x0, y0, x1, y1.
    bbox: tuple[float, float, float, float]
    font_size: float
    font_name: str
    is_bold: bool
    # One of: paragraph | header_candidate | list_item | table | ignore.
    block_type: str = "paragraph"
    space_before: float = 0.0
    # Assigned by stage5 (0 = not a header).
    level: int = 0
    # Left out of the generated repr.
    origin_spans: list[dict] = field(repr=False, default_factory=list)

    @property
    def x0(self) -> float:
        """Return the first component of ``bbox``."""
        return self.bbox[0]

    @property
    def y0(self) -> float:
        """Return the second component of ``bbox``."""
        return self.bbox[1]

    @property
    def x1(self) -> float:
        """Return the third component of ``bbox``."""
        return self.bbox[2]

    @property
    def y1(self) -> float:
        """Return the fourth component of ``bbox``."""
        return self.bbox[3]


@dataclass
class Section:
    """A titled section holding its own blocks and any nested child sections."""

    title: str
    # Level: 1, 2, 3.
    level: int
    content: list[Block] = field(default_factory=list)
    children: list[Section] = field(default_factory=list)
    page_start: int = 0
    # Optional Block reference; left out of the generated repr.
    source_block: Block | None = field(repr=False, default=None)


@dataclass
class FontProfile:
    """Font-size summary: a body size, a size-to-level map and header sizes."""

    body_size: float
    # Rounded font_size -> level (1/2/3).
    cluster_map: dict[float, int]
    # Candidate header sizes, sorted in descending order.
    header_sizes: list[float]