45 lines
1.3 KiB
Python
45 lines
1.3 KiB
Python
|
|
"""Strutture dati intermedie della pipeline: Block, Section, FontProfile."""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class Block:
    """A block of text together with its page, bounding box and font attributes.

    The positional field order (text, page, bbox, font_size, font_name,
    is_bold) is part of the constructor interface and must not change.
    """

    text: str
    page: int
    bbox: tuple[float, float, float, float]  # x0, y0, x1, y1
    font_size: float
    font_name: str
    is_bold: bool
    block_type: str = "paragraph"  # paragraph|header_candidate|list_item|table|ignore
    space_before: float = 0.0
    level: int = 0  # assigned by stage5 (0 = not a header)
    # Fresh list per instance; kept out of repr().
    origin_spans: list[dict] = field(default_factory=list, repr=False)

    @property
    def x0(self) -> float:
        """First bbox component (x0)."""
        return self.bbox[0]

    @property
    def y0(self) -> float:
        """Second bbox component (y0)."""
        return self.bbox[1]

    @property
    def x1(self) -> float:
        """Third bbox component (x1)."""
        return self.bbox[2]

    @property
    def y1(self) -> float:
        """Fourth bbox component (y1)."""
        return self.bbox[3]
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class Section:
    """A titled section at a given level, holding content blocks and child sections.

    ``content`` and ``children`` default to a fresh empty list per instance.
    """

    title: str
    level: int  # 1, 2, 3
    content: list[Block] = field(default_factory=list)
    children: list[Section] = field(default_factory=list)
    page_start: int = 0
    # Optional Block reference, None by default; kept out of repr().
    source_block: Block | None = field(default=None, repr=False)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class FontProfile:
    """Font-size profile: body size, size-to-level map and candidate header sizes."""

    body_size: float
    cluster_map: dict[float, int]  # rounded font_size -> level (1/2/3)
    header_sizes: list[float]  # candidate header sizes, sorted descending
|