| | """ |
| | Base IO Classes for Document Intelligence |
| | |
| | Abstract interfaces for document loading and page rendering. |
| | """ |
| |
|
| | from abc import ABC, abstractmethod |
| | from dataclasses import dataclass, field |
| | from enum import Enum |
| | from pathlib import Path |
| | from typing import Any, Dict, Iterator, List, Optional, Tuple, Union |
| |
|
| | import numpy as np |
| | from PIL import Image |
| |
|
| |
|
class DocumentFormat(str, Enum):
    """Document container formats distinguished by the IO layer.

    The class mixes in ``str``, so each member compares equal to its value
    (for example ``DocumentFormat.PDF == "pdf"``).
    """

    PDF = "pdf"
    IMAGE = "image"
    TIFF_MULTIPAGE = "tiff_multipage"
    UNKNOWN = "unknown"

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "DocumentFormat":
        """Classify a file purely by its extension.

        Only the suffix of ``path`` is inspected (case-insensitively); the
        file itself is never opened.

        Args:
            path: Location of the document, as a string or ``Path``.

        Returns:
            The matching member, or ``UNKNOWN`` when the suffix is missing
            or not recognised.
        """
        extension = Path(path).suffix.lower()

        raster_suffixes = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")
        tiff_suffixes = (".tif", ".tiff")

        by_suffix = {".pdf": cls.PDF}
        by_suffix.update(dict.fromkeys(raster_suffixes, cls.IMAGE))
        # Every TIFF is reported as multi-page: the suffix alone cannot tell
        # how many pages the file actually holds.
        by_suffix.update(dict.fromkeys(tiff_suffixes, cls.TIFF_MULTIPAGE))

        return by_suffix.get(extension, cls.UNKNOWN)
| |
|
| |
|
@dataclass
class PageInfo:
    """Geometry and content flags for one page of a document.

    Only the page number and the pixel dimensions are required; every other
    field carries a default.
    """

    # Required identification and raster size.
    # NOTE(review): page_number is presumably 1-indexed -- confirm.
    page_number: int
    width_pixels: int
    height_pixels: int

    # Page size in points; None when not supplied.
    width_points: Optional[float] = None
    height_points: Optional[float] = None

    # NOTE(review): presumably the resolution the pixel size refers to, and
    # the rotation in degrees -- confirm against the concrete loaders.
    dpi: int = 72
    rotation: int = 0

    # Content flags; both default to False.
    has_text: bool = False
    has_images: bool = False
| |
|
| |
|
@dataclass
class DocumentInfo:
    """Description of a loaded document: location, format, pages and flags.

    ``path``, ``format`` and ``num_pages`` are required; everything else has
    a default.
    """

    path: Path
    format: DocumentFormat
    num_pages: int
    # Per-page details; each instance gets its own fresh list.
    pages: List[PageInfo] = field(default_factory=list)

    # Descriptive metadata; each entry is None unless supplied.
    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    creator: Optional[str] = None
    creation_date: Optional[str] = None
    modification_date: Optional[str] = None

    # File-level properties.
    file_size_bytes: int = 0
    is_encrypted: bool = False
    is_digitally_signed: bool = False

    # Content characteristics; all default to False.
    has_text_layer: bool = False
    is_scanned: bool = False
    has_forms: bool = False
    has_annotations: bool = False

    @property
    def doc_id(self) -> str:
        """Short identifier derived from file name, byte size and page count.

        Only the final path component takes part, so the same file name with
        the same size and page count gives the same ID in any directory.

        Returns:
            The first 16 hexadecimal characters of the SHA-256 digest of
            ``"<name>_<file_size_bytes>_<num_pages>"``.
        """
        import hashlib

        fingerprint = f"{self.path.name}_{self.file_size_bytes}_{self.num_pages}"
        digest = hashlib.sha256(fingerprint.encode())
        return digest.hexdigest()[:16]
| |
|
| |
|
@dataclass
class RenderOptions:
    """Settings passed to a renderer when turning a page into an image.

    Every field has a default, so ``RenderOptions()`` is a valid
    configuration.
    """

    # Output resolution; defaults to 200.
    dpi: int = 200
    # Colour mode name; defaults to "RGB".
    color_mode: str = "RGB"
    # Background colour as a tuple of ints; defaults to (255, 255, 255).
    background_color: Tuple[int, ...] = (255, 255, 255)
    # Rendering switches; all enabled by default.
    antialias: bool = True
    include_annotations: bool = True
    include_forms: bool = True
| |
|
| |
|
class DocumentLoader(ABC):
    """
    Interface for objects that open documents and report their metadata.

    Concrete loaders implement ``load``, ``close``, ``is_loaded`` and the
    ``info`` property. The base class supplies context-manager support that
    calls ``close`` when the ``with`` block ends.
    """

    @abstractmethod
    def load(self, path: Union[str, Path]) -> DocumentInfo:
        """
        Open the document at ``path`` and gather its metadata.

        Args:
            path: Path to the document file

        Returns:
            DocumentInfo describing the document
        """
        ...

    @abstractmethod
    def close(self) -> None:
        """Close the document and free whatever resources it holds."""
        ...

    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a document is currently open."""
        ...

    @property
    @abstractmethod
    def info(self) -> Optional[DocumentInfo]:
        """Metadata for the loaded document."""
        ...

    def __enter__(self) -> "DocumentLoader":
        """Enter the ``with`` block, handing back the loader itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Close the loader on leaving the ``with`` block.

        Returns:
            Always False, so an exception raised inside the block is not
            suppressed.
        """
        self.close()
        return False
| |
|
| |
|
class PageRenderer(ABC):
    """
    Interface for turning document pages into image arrays.

    Subclasses implement ``render_page``. ``render_pages`` and
    ``render_region`` are provided here on top of it.
    """

    @abstractmethod
    def render_page(
        self,
        page_number: int,
        options: Optional[RenderOptions] = None
    ) -> np.ndarray:
        """
        Produce the image for one page.

        Args:
            page_number: 1-indexed page number
            options: Rendering options

        Returns:
            Page image as numpy array (H, W, C)
        """
        ...

    def render_pages(
        self,
        page_numbers: Optional[List[int]] = None,
        options: Optional[RenderOptions] = None
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """
        Lazily render a sequence of pages, one per iteration step.

        Args:
            page_numbers: List of 1-indexed page numbers (None = all pages)
            options: Rendering options

        Yields:
            Tuples of (page_number, image_array)

        Raises:
            NotImplementedError: If ``page_numbers`` is None. The base class
                cannot enumerate pages, and because this is a generator the
                error appears when iteration starts, not at call time.
        """
        if page_numbers is None:
            # The page count is unknown here; a subclass has to supply it.
            raise NotImplementedError("Subclass must provide page iteration")

        for number in page_numbers:
            image = self.render_page(number, options)
            yield number, image

    def render_region(
        self,
        page_number: int,
        region: Tuple[float, float, float, float],
        options: Optional[RenderOptions] = None,
        normalized: bool = True
    ) -> np.ndarray:
        """
        Render a page and cut out a rectangular part of it.

        Args:
            page_number: 1-indexed page number
            region: (x_min, y_min, x_max, y_max) coordinates
            options: Rendering options
            normalized: Whether coordinates are normalized (0-1); when True
                they are scaled by the rendered page's width and height

        Returns:
            Region image as numpy array, obtained by slicing the rendered
            page. It is empty along an axis whose clamped bounds do not
            satisfy min < max.
        """
        # Default strategy: render the whole page, then crop.
        page_image = self.render_page(page_number, options)
        height, width = page_image.shape[:2]

        left, top, right, bottom = region
        if normalized:
            # Convert fractions of the page into pixel positions.
            left, right = left * width, right * width
            top, bottom = top * height, bottom * height

        def to_pixel(value, upper):
            # Truncate to an int and keep the bound inside [0, upper].
            return max(0, min(int(value), upper))

        left, right = to_pixel(left, width), to_pixel(right, width)
        top, bottom = to_pixel(top, height), to_pixel(bottom, height)

        return page_image[top:bottom, left:right]
| |
|
| |
|
class DocumentProcessor(ABC):
    """
    Pairs a document loader with a page renderer.

    The constructor only stores the two collaborators; how they are combined
    is left to the subclass's ``process`` implementation.
    """

    def __init__(self, loader: DocumentLoader, renderer: PageRenderer):
        """Keep references to the loader and renderer for later use.

        Args:
            loader: Loader stored as ``self.loader``
            renderer: Renderer stored as ``self.renderer``
        """
        self.loader, self.renderer = loader, renderer

    @abstractmethod
    def process(
        self,
        path: Union[str, Path],
        options: Optional[RenderOptions] = None,
        page_range: Optional[Tuple[int, int]] = None
    ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]:
        """
        Load a document and render its pages.

        Args:
            path: Document path
            options: Rendering options
            page_range: Optional (start, end) page range (1-indexed, inclusive)

        Yields:
            Tuples of (page_number, image, page_info)
        """
        ...
| |
|