Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / src /document_intelligence /io /base.py

MHamdan

Initial commit: SPARKNET framework

d520909 about 1 month ago

raw

history blame contribute delete

7.04 kB

	"""
	Base IO Classes for Document Intelligence

	Abstract interfaces for document loading and page rendering.
	"""

	from abc import ABC, abstractmethod
	from dataclasses import dataclass, field
	from enum import Enum
	from pathlib import Path
	from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

	import numpy as np
	from PIL import Image


	class DocumentFormat(str, Enum):
	    """Kinds of document container recognised by the IO layer.

	    Members are also ``str`` instances, so each one compares equal to its
	    plain string value.
	    """

	    PDF = "pdf"
	    IMAGE = "image"  # raster files: JPEG, PNG, BMP, GIF, WebP
	    TIFF_MULTIPAGE = "tiff_multipage"
	    UNKNOWN = "unknown"

	    @classmethod
	    def from_path(cls, path: Union[str, Path]) -> "DocumentFormat":
	        """Guess the format from the file extension alone.

	        Only the (case-insensitive) suffix of ``path`` is inspected; the
	        file is never opened.

	        Args:
	            path: File name or path, as a string or ``Path``.

	        Returns:
	            The matching member, or ``UNKNOWN`` when the suffix is not
	            one of the recognised extensions.
	        """
	        extension = Path(path).suffix.lower()

	        if extension == ".pdf":
	            return cls.PDF
	        if extension in (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"):
	            return cls.IMAGE
	        if extension in (".tif", ".tiff"):
	            # A TIFF may hold one frame or many; the suffix cannot tell
	            # which, so every TIFF is reported as the multipage kind.
	            return cls.TIFF_MULTIPAGE
	        return cls.UNKNOWN


	@dataclass
	class PageInfo:
	    """Geometry and content flags describing one page of a document.

	    The page number and pixel dimensions are required; every other
	    field has a default.
	    """

	    page_number: int  # counted from 1
	    width_pixels: int
	    height_pixels: int
	    width_points: Optional[float] = None  # in PDF points (1/72 inch)
	    height_points: Optional[float] = None
	    dpi: int = 72
	    rotation: int = 0  # in degrees: 0, 90, 180 or 270
	    has_text: bool = False
	    has_images: bool = False


	@dataclass
	class DocumentInfo:
	    """Descriptive record for a document: location, format, pages, flags.

	    Only ``path``, ``format`` and ``num_pages`` are required. ``pages``
	    defaults to a fresh empty list for each instance.
	    """

	    path: Path
	    format: DocumentFormat
	    num_pages: int
	    pages: List[PageInfo] = field(default_factory=list)

	    # --- descriptive metadata (all optional) ---
	    title: Optional[str] = None
	    author: Optional[str] = None
	    subject: Optional[str] = None
	    creator: Optional[str] = None
	    creation_date: Optional[str] = None
	    modification_date: Optional[str] = None

	    # --- file-level facts ---
	    file_size_bytes: int = 0
	    is_encrypted: bool = False
	    is_digitally_signed: bool = False

	    # --- content flags ---
	    has_text_layer: bool = False
	    is_scanned: bool = False
	    has_forms: bool = False
	    has_annotations: bool = False

	    @property
	    def doc_id(self) -> str:
	        """Short identifier derived from file name, size and page count.

	        The three values are joined with underscores and hashed with
	        SHA-256; the first 16 hex characters of the digest are returned.
	        Two documents that share all three values get the same ID, and
	        the directory part of ``path`` does not influence it.
	        """
	        import hashlib

	        seed = f"{self.path.name}_{self.file_size_bytes}_{self.num_pages}".encode()
	        digest = hashlib.sha256(seed).hexdigest()
	        return digest[:16]


	@dataclass
	class RenderOptions:
	    """Settings passed to a renderer when turning a page into an image.

	    Every field has a default, so ``RenderOptions()`` is a valid
	    configuration. No value is validated here.
	    """

	    dpi: int = 200
	    color_mode: str = "RGB"  # e.g. "RGB", "L" (grayscale), "RGBA"
	    background_color: Tuple[int, ...] = (255, 255, 255)  # white
	    antialias: bool = True
	    include_annotations: bool = True
	    include_forms: bool = True


	class DocumentLoader(ABC):
	    """Interface for objects that open a document and report its metadata.

	    A loader can be used as a context manager: entering yields the
	    loader itself and leaving the ``with`` block calls :meth:`close`.
	    """

	    @abstractmethod
	    def load(self, path: Union[str, Path]) -> DocumentInfo:
	        """Open the document at ``path`` and collect its metadata.

	        Args:
	            path: Location of the document file.

	        Returns:
	            A ``DocumentInfo`` describing the document.
	        """

	    @abstractmethod
	    def close(self) -> None:
	        """Close the document and free the resources held for it."""

	    @abstractmethod
	    def is_loaded(self) -> bool:
	        """Report whether a document is currently open."""

	    @property
	    @abstractmethod
	    def info(self) -> Optional[DocumentInfo]:
	        """Metadata of the loaded document.

	        NOTE(review): presumably ``None`` when nothing is loaded;
	        confirm against the concrete loaders.
	        """

	    def __enter__(self):
	        return self

	    def __exit__(self, exc_type, exc_val, exc_tb):
	        self.close()
	        # A falsy result lets any exception raised in the block propagate.
	        return False


	class PageRenderer(ABC):
	    """Interface for turning document pages into image arrays.

	    Subclasses implement :meth:`render_page`. The multi-page and region
	    helpers defined here are built on top of it.
	    """

	    @abstractmethod
	    def render_page(
	        self,
	        page_number: int,
	        options: Optional[RenderOptions] = None
	    ) -> np.ndarray:
	        """Render one page to an image.

	        Args:
	            page_number: 1-indexed page number.
	            options: Rendering options.

	        Returns:
	            Page image as a numpy array (H, W, C).
	        """

	    def render_pages(
	        self,
	        page_numbers: Optional[List[int]] = None,
	        options: Optional[RenderOptions] = None
	    ) -> Iterator[Tuple[int, np.ndarray]]:
	        """Render several pages lazily, one per iteration step.

	        Args:
	            page_numbers: 1-indexed page numbers to render. ``None``
	                means "all pages", which this base class cannot resolve
	                because it does not know the page count.
	            options: Rendering options forwarded to :meth:`render_page`.

	        Returns:
	            A generator yielding ``(page_number, image_array)`` tuples in
	            the order given by ``page_numbers``.

	        Raises:
	            NotImplementedError: If ``page_numbers`` is ``None``. The
	                error is raised by the call itself, not deferred until
	                the result is first iterated.
	        """
	        if page_numbers is None:
	            # Subclasses should override to provide total pages
	            raise NotImplementedError("Subclass must provide page iteration")

	        # Returning a generator expression instead of using ``yield`` keeps
	        # this a plain function, so the check above runs at call time
	        # while the rendering itself stays lazy.
	        return (
	            (page_num, self.render_page(page_num, options))
	            for page_num in page_numbers
	        )

	    def render_region(
	        self,
	        page_number: int,
	        region: Tuple[float, float, float, float],
	        options: Optional[RenderOptions] = None,
	        normalized: bool = True
	    ) -> np.ndarray:
	        """Render a rectangular part of a page.

	        The default implementation renders the full page and crops it.

	        Args:
	            page_number: 1-indexed page number.
	            region: ``(x_min, y_min, x_max, y_max)`` coordinates.
	            options: Rendering options forwarded to :meth:`render_page`.
	            normalized: When true, coordinates are fractions (0-1) of the
	                rendered width and height; otherwise they are pixels.

	        Returns:
	            The cropped image, obtained by slicing the rendered page.
	            The crop is empty along an axis whose clipped minimum is not
	            below its clipped maximum.
	        """
	        page_image = self.render_page(page_number, options)
	        height, width = page_image.shape[:2]

	        left, top, right, bottom = region
	        if normalized:
	            # Scale fractions of the page size up to pixel offsets.
	            left, right = left * width, right * width
	            top, bottom = top * height, bottom * height

	        # int() truncates toward zero; the clamp then keeps every offset
	        # inside [0, size] so the slice never reaches outside the page.
	        left, right = (max(0, min(int(v), width)) for v in (left, right))
	        top, bottom = (max(0, min(int(v), height)) for v in (top, bottom))

	        return page_image[top:bottom, left:right]


	class DocumentProcessor(ABC):
	    """Pairs a document loader with a page renderer behind one interface.

	    The constructor only stores the two collaborators; subclasses supply
	    the actual pipeline in :meth:`process`.
	    """

	    def __init__(self, loader: DocumentLoader, renderer: PageRenderer):
	        self.loader, self.renderer = loader, renderer

	    @abstractmethod
	    def process(
	        self,
	        path: Union[str, Path],
	        options: Optional[RenderOptions] = None,
	        page_range: Optional[Tuple[int, int]] = None
	    ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]:
	        """Load a document and render its pages.

	        Args:
	            path: Document path.
	            options: Rendering options.
	            page_range: Optional ``(start, end)`` page range, 1-indexed
	                and inclusive at both ends.

	        Yields:
	            Tuples of ``(page_number, image, page_info)``.
	        """