Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / src /document /ocr /tesseract_ocr.py

MHamdan

Initial commit: SPARKNET framework

d520909 about 1 month ago

raw

history blame contribute delete

9.07 kB

	"""
	Tesseract OCR Engine

	Fallback OCR engine using Tesseract.
	Provides broad language support and is widely available.
	"""

	import time
	from typing import List, Optional, Dict, Any
	import numpy as np
	from loguru import logger

	from .base import OCREngine, OCRConfig, OCRResult
	from ..schemas.core import BoundingBox, OCRRegion

	# Optional dependency guard: pytesseract (and Pillow, used to hand images
	# to it) may be missing. HAS_TESSERACT records the outcome so importing
	# this module never fails; TesseractOCREngine.initialize() checks the flag
	# and raises only when the engine is actually used.
	try:
	    import pytesseract
	    from PIL import Image
	    HAS_TESSERACT = True
	except ImportError:
	    HAS_TESSERACT = False
	    # Two separate sentences: the pip package and the system binary are
	    # installed independently.
	    logger.warning(
	        "pytesseract not installed. Install with: pip install pytesseract. "
	        "Also install Tesseract: apt-get install tesseract-ocr"
	    )


	class TesseractOCREngine(OCREngine):
	    """
	    OCR engine using Tesseract.

	    Features:
	    - Broad language support (100+ languages)
	    - Mature and well-tested
	    - No GPU required
	    - Page segmentation modes for different layouts
	    """

	    # Maps the language codes accepted in ``config.languages`` to the
	    # Tesseract language codes passed via ``lang=`` (subset of common ones)
	    LANGUAGE_MAP = {
	        "en": "eng",
	        "ch": "chi_sim",
	        "chinese_cht": "chi_tra",
	        "fr": "fra",
	        "german": "deu",
	        "es": "spa",
	        "it": "ita",
	        "pt": "por",
	        "ru": "rus",
	        "japan": "jpn",
	        "korean": "kor",
	        "ar": "ara",
	        "hi": "hin",
	        "latin": "lat",
	    }

	    # Page segmentation modes
	    PSM_AUTO = 3  # Fully automatic page segmentation
	    PSM_SINGLE_BLOCK = 6  # Assume single uniform block of text
	    PSM_SINGLE_LINE = 7  # Treat image as single line
	    PSM_SPARSE = 11  # Sparse text with no particular order

	    def __init__(self, config: Optional[OCRConfig] = None):
	        """Initialize Tesseract OCR engine."""
	        super().__init__(config)
	        self._tesseract_cmd: Optional[str] = None

	    def initialize(self):
	        """
	        Verify that Tesseract is usable and mark the engine as initialized.

	        Does nothing beyond the dependency check if the engine is already
	        initialized.

	        Raises:
	            RuntimeError: If pytesseract could not be imported, or if the
	                Tesseract version cannot be queried (the original error is
	                attached as the cause).
	        """
	        if not HAS_TESSERACT:
	            raise RuntimeError(
	                "pytesseract not installed. Install with: pip install pytesseract. "
	                "Also install Tesseract: apt-get install tesseract-ocr"
	            )

	        if self._initialized:
	            return

	        logger.info("Initializing Tesseract OCR engine...")

	        # Test Tesseract installation
	        try:
	            version = pytesseract.get_tesseract_version()
	        except Exception as e:
	            logger.error(f"Tesseract not properly installed: {e}")
	            raise RuntimeError(
	                f"Tesseract not properly installed: {e}. "
	                "Install with: apt-get install tesseract-ocr"
	            ) from e

	        logger.info(f"Tesseract version: {version}")
	        self._initialized = True

	    def recognize(
	        self,
	        image: np.ndarray,
	        page_number: int = 0,
	    ) -> OCRResult:
	        """
	        Perform OCR on an image using Tesseract.

	        Tesseract is run twice: once for word-level data (boxes and
	        confidences) and once for the plain-text rendering used as
	        ``full_text``.

	        Args:
	            image: Image as numpy array (RGB, HWC format)
	            page_number: Page number for multi-page documents

	        Returns:
	            OCRResult with recognized text and regions. Errors raised during
	            recognition are not propagated: they are logged and reported as
	            a result with ``success=False`` and ``error`` set.

	        Raises:
	            RuntimeError: If the engine is not yet initialized and
	                initialization fails.
	        """
	        if not self._initialized:
	            self.initialize()

	        # perf_counter is monotonic, so the elapsed time cannot go negative
	        # if the wall clock is adjusted mid-run.
	        start_time = time.perf_counter()

	        try:
	            # Convert numpy array to PIL Image
	            pil_image = Image.fromarray(image)

	            # Build language string
	            lang = self._get_tesseract_lang()

	            # Configure Tesseract
	            custom_config = self._build_config()

	            # Get detailed data with bounding boxes
	            data = pytesseract.image_to_data(
	                pil_image,
	                lang=lang,
	                config=custom_config,
	                output_type=pytesseract.Output.DICT,
	            )

	            height, width = image.shape[:2]
	            regions, total_confidence = self._collect_regions(
	                data, page_number, width, height
	            )

	            # Also get full text for better formatting
	            full_text = pytesseract.image_to_string(
	                pil_image,
	                lang=lang,
	                config=custom_config,
	            )

	            processing_time = (time.perf_counter() - start_time) * 1000

	            return OCRResult(
	                regions=regions,
	                full_text=full_text.strip(),
	                confidence_avg=total_confidence / len(regions) if regions else 0.0,
	                processing_time_ms=processing_time,
	                engine="tesseract",
	                success=True,
	            )

	        except Exception as e:
	            logger.error(f"Tesseract recognition failed: {e}")
	            return OCRResult(
	                regions=[],
	                full_text="",
	                confidence_avg=0.0,
	                processing_time_ms=(time.perf_counter() - start_time) * 1000,
	                engine="tesseract",
	                success=False,
	                error=str(e),
	            )

	    def _collect_regions(
	        self,
	        data: Dict[str, Any],
	        page_number: int,
	        width: int,
	        height: int,
	    ) -> tuple:
	        """
	        Turn pytesseract's column-oriented word data into OCR regions.

	        Args:
	            data: Result of ``pytesseract.image_to_data`` with
	                ``Output.DICT`` (one list per TSV column)
	            page_number: Page number stored on every region
	            width: Page width in pixels, stored on every bounding box
	            height: Page height in pixels, stored on every bounding box

	        Returns:
	            Tuple of (list of OCRRegion, sum of their confidences)
	        """
	        regions = []
	        total_confidence = 0.0

	        # Group words into lines
	        current_line_id = -1
	        word_id = 0

	        for i, raw_text in enumerate(data.get('text', [])):
	            text = raw_text.strip()

	            # Depending on the pytesseract/Tesseract versions the confidence
	            # arrives as an int, a float, or a string such as '96.06';
	            # int() would raise on the latter.
	            try:
	                conf = float(data['conf'][i])
	            except (TypeError, ValueError):
	                continue

	            # Skip empty or low confidence (Tesseract reports -1 for
	            # rows that are not words)
	            if not text or conf < 0:
	                continue

	            confidence = conf / 100.0
	            if confidence < self.config.min_confidence:
	                continue

	            # Track line changes. Tesseract restarts line_num in every
	            # paragraph, so par_num must be part of the id; otherwise lines
	            # of different paragraphs in one block would share an id.
	            block_num = data['block_num'][i]
	            par_num = data['par_num'][i]
	            line_num = data['line_num'][i]
	            line_id = (block_num * 1000 + par_num) * 1000 + line_num

	            if line_id != current_line_id:
	                current_line_id = line_id
	                word_id = 0
	            else:
	                word_id += 1

	            # Get bounding box (pixel coordinates, not normalized)
	            x = data['left'][i]
	            y = data['top'][i]
	            w = data['width'][i]
	            h = data['height'][i]

	            bbox = BoundingBox(
	                x_min=float(x),
	                y_min=float(y),
	                x_max=float(x + w),
	                y_max=float(y + h),
	                normalized=False,
	                page_width=width,
	                page_height=height,
	            )

	            regions.append(
	                OCRRegion(
	                    text=text,
	                    confidence=confidence,
	                    bbox=bbox,
	                    page=page_number,
	                    line_id=line_id,
	                    word_id=word_id,
	                    engine="tesseract",
	                )
	            )
	            total_confidence += confidence

	        return regions, total_confidence

	    def _get_tesseract_lang(self) -> str:
	        """
	        Get Tesseract language string from config.

	        Codes found in LANGUAGE_MAP are translated; codes that already are
	        one of the mapped Tesseract codes (e.g. "deu") are passed through;
	        anything else falls back to "eng" with a warning.

	        Returns:
	            "+"-joined Tesseract language codes without duplicates, or
	            "eng" when no language is configured.
	        """
	        tesseract_codes = set(self.LANGUAGE_MAP.values())
	        langs = []
	        for lang in self.config.languages or []:
	            if lang in self.LANGUAGE_MAP:
	                tess_lang = self.LANGUAGE_MAP[lang]
	            elif lang in tesseract_codes:
	                # Already a Tesseract code; do not downgrade it to English.
	                tess_lang = lang
	            else:
	                logger.warning(
	                    f"Unsupported OCR language '{lang}', falling back to 'eng'"
	                )
	                tess_lang = "eng"
	            if tess_lang not in langs:
	                langs.append(tess_lang)
	        return "+".join(langs) if langs else "eng"

	    def _build_config(self) -> str:
	        """Build the Tesseract command-line config string."""
	        config_parts = [
	            f"--psm {self.PSM_AUTO}",  # Page segmentation mode
	            "--oem 3",  # Default engine mode, based on what is available
	        ]

	        # Add more options as needed
	        if self.config.return_word_boxes:
	            config_parts.append("-c preserve_interword_spaces=1")

	        return " ".join(config_parts)

	    def get_supported_languages(self) -> List[str]:
	        """Return the language codes that LANGUAGE_MAP can translate."""
	        return list(self.LANGUAGE_MAP)

	    def get_installed_languages(self) -> List[str]:
	        """
	        Get list of languages installed in Tesseract.

	        Returns:
	            Language codes reported by pytesseract, or ["eng"] when they
	            cannot be queried (a warning is logged in that case).
	        """
	        if not self._initialized:
	            self.initialize()

	        try:
	            return pytesseract.get_languages()
	        except Exception as e:
	            logger.warning(f"Could not get installed languages: {e}")
	            return ["eng"]

	    def recognize_with_hocr(
	        self,
	        image: np.ndarray,
	        page_number: int = 0,
	    ) -> tuple:
	        """
	        Perform OCR and return hOCR format for detailed layout.

	        Args:
	            image: Image as numpy array
	            page_number: Page number

	        Returns:
	            Tuple of (OCRResult, hOCR string). The hOCR element is None
	            when hOCR generation fails; the failure is logged as a warning.
	        """
	        if not self._initialized:
	            self.initialize()

	        # Get standard result
	        ocr_result = self.recognize(image, page_number)

	        # Get hOCR for layout analysis. The image conversion is inside the
	        # try so an unusable image degrades to (result, None), matching how
	        # recognize() reports failures instead of raising.
	        try:
	            hocr = pytesseract.image_to_pdf_or_hocr(
	                Image.fromarray(image),
	                lang=self._get_tesseract_lang(),
	                config=self._build_config(),
	                extension='hocr',
	            )
	            return ocr_result, hocr.decode('utf-8')
	        except Exception as e:
	            logger.warning(f"Failed to generate hOCR: {e}")
	            return ocr_result, None