| | """ |
| | Tesseract OCR Engine |
| | |
| | Fallback OCR engine using Tesseract. |
| | Provides broad language support and is widely available. |
| | """ |
| |
|
| | import time |
| | from typing import List, Optional, Dict, Any |
| | import numpy as np |
| | from loguru import logger |
| |
|
| | from .base import OCREngine, OCRConfig, OCRResult |
| | from ..schemas.core import BoundingBox, OCRRegion |
| |
|
| | |
# Optional dependency guard: pytesseract (and PIL, which it is used with here)
# may be missing. The module must still import so that callers can detect the
# situation through HAS_TESSERACT instead of hitting an ImportError.
try:
    import pytesseract
    from PIL import Image
except ImportError:
    HAS_TESSERACT = False
    logger.warning(
        "pytesseract not installed. Install with: pip install pytesseract "
        "Also install Tesseract: apt-get install tesseract-ocr"
    )
else:
    HAS_TESSERACT = True
| |
|
| |
|
class TesseractOCREngine(OCREngine):
    """
    OCR engine using Tesseract.

    Features:
    - Broad language support (100+ languages)
    - Mature and well-tested
    - No GPU required
    - Page segmentation modes for different layouts
    """

    # Maps the language codes accepted in ``config.languages`` to the
    # language names handed to Tesseract via ``lang=``.
    LANGUAGE_MAP = {
        "en": "eng",
        "ch": "chi_sim",
        "chinese_cht": "chi_tra",
        "fr": "fra",
        "german": "deu",
        "es": "spa",
        "it": "ita",
        "pt": "por",
        "ru": "rus",
        "japan": "jpn",
        "korean": "kor",
        "ar": "ara",
        "hi": "hin",
        "latin": "lat",
    }

    # Tesseract page segmentation modes (values for the ``--psm`` option).
    # Only PSM_AUTO is used by this class; the others are exposed as constants.
    PSM_AUTO = 3
    PSM_SINGLE_BLOCK = 6
    PSM_SINGLE_LINE = 7
    PSM_SPARSE = 11

    def __init__(self, config: Optional[OCRConfig] = None):
        """
        Create the engine without touching Tesseract.

        The Tesseract installation is only probed later, in ``initialize``.

        Args:
            config: OCR configuration forwarded to the base class.
        """
        super().__init__(config)
        # Not consulted by any method in this class; presumably a hook for a
        # custom tesseract binary path (TODO confirm).
        self._tesseract_cmd: Optional[str] = None

    def initialize(self):
        """
        Verify that Tesseract is usable and mark the engine as initialized.

        Calling this on an already initialized engine is a no-op.

        Raises:
            RuntimeError: If pytesseract could not be imported, or if querying
                the Tesseract version fails (typically because the binary is
                missing). The original exception is chained as the cause.
        """
        if not HAS_TESSERACT:
            raise RuntimeError(
                "pytesseract not installed. Install with: pip install pytesseract. "
                "Also install Tesseract: apt-get install tesseract-ocr"
            )

        if self._initialized:
            return

        logger.info("Initializing Tesseract OCR engine...")

        # Asking for the version is the cheapest call that actually runs the
        # tesseract binary, so it doubles as an installation check.
        try:
            version = pytesseract.get_tesseract_version()
        except Exception as e:
            logger.error(f"Tesseract not properly installed: {e}")
            raise RuntimeError(
                f"Tesseract not properly installed: {e}. "
                "Install with: apt-get install tesseract-ocr"
            ) from e

        logger.info(f"Tesseract version: {version}")
        self._initialized = True

    def recognize(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> OCRResult:
        """
        Perform OCR on an image using Tesseract.

        Word-level regions come from ``image_to_data``; the page text comes
        from a separate ``image_to_string`` pass so that Tesseract's own
        layout (line breaks, spacing) is kept in ``full_text``.

        Args:
            image: Image as numpy array (RGB, HWC format)
            page_number: Page number for multi-page documents

        Returns:
            OCRResult with recognized text and regions. Errors raised while
            running OCR are logged and reported as a result with
            ``success=False`` and ``error`` set; they are not re-raised.

        Raises:
            RuntimeError: If the engine is not yet initialized and
                ``initialize`` fails.
        """
        if not self._initialized:
            self.initialize()

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()

        try:
            pil_image = Image.fromarray(image)
            lang = self._get_tesseract_lang()
            custom_config = self._build_config()

            data = pytesseract.image_to_data(
                pil_image,
                lang=lang,
                config=custom_config,
                output_type=pytesseract.Output.DICT,
            )

            height, width = image.shape[:2]
            regions, confidences = self._build_regions(
                data, width, height, page_number
            )

            full_text = pytesseract.image_to_string(
                pil_image,
                lang=lang,
                config=custom_config,
            )

            confidence_avg = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OCRResult(
                regions=regions,
                full_text=full_text.strip(),
                confidence_avg=confidence_avg,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
                engine="tesseract",
                success=True,
            )

        except Exception as e:
            # Deliberate best-effort boundary: a failing page yields a failed
            # result instead of aborting the caller's pipeline.
            logger.error(f"Tesseract recognition failed: {e}")
            return OCRResult(
                regions=[],
                full_text="",
                confidence_avg=0.0,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
                engine="tesseract",
                success=False,
                error=str(e),
            )

    @staticmethod
    def _parse_confidence(raw: Any) -> Optional[float]:
        """
        Convert a raw Tesseract ``conf`` cell into a confidence in [0, 1].

        Depending on the pytesseract / Tesseract versions the cell may be an
        int, a float, or a string such as ``"96.06"`` or ``"-1"``.

        Returns:
            ``raw / 100`` as a float, or None when the value is negative
            (Tesseract's marker for non-word rows), NaN, or not numeric.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return None
        # ``not value >= 0`` also rejects NaN, which ``value < 0`` would not.
        if not value >= 0:
            return None
        return value / 100.0

    def _build_regions(
        self,
        data: Dict[str, Any],
        page_width: int,
        page_height: int,
        page_number: int,
    ) -> tuple:
        """
        Turn the ``image_to_data`` dictionary into word-level OCR regions.

        Rows with empty text, an invalid/negative confidence, or a confidence
        below ``config.min_confidence`` are dropped.

        Args:
            data: Column-oriented dict as returned by ``image_to_data`` with
                ``Output.DICT``.
            page_width: Image width in pixels, stored on each bounding box.
            page_height: Image height in pixels, stored on each bounding box.
            page_number: Page number stored on each region.

        Returns:
            Tuple ``(regions, confidences)``: the kept ``OCRRegion`` objects
            and their confidences, in the same order.
        """
        regions: List[OCRRegion] = []
        confidences: List[float] = []

        texts = data['text']
        # par_num is part of Tesseract's TSV output; fall back to zeros only
        # so a dict without that column does not break recognition.
        par_nums = data.get('par_num') or [0] * len(texts)

        current_line_id = None
        word_id = 0

        for i, raw_text in enumerate(texts):
            # str() guards against numeric words that were coerced to int.
            text = str(raw_text).strip()
            confidence = self._parse_confidence(data['conf'][i])

            if not text or confidence is None:
                continue
            if confidence < self.config.min_confidence:
                continue

            # Tesseract restarts line_num inside every paragraph, so the
            # paragraph number has to be part of the id; otherwise lines of
            # different paragraphs in one block share an id.
            line_id = (
                (data['block_num'][i] * 1000 + par_nums[i]) * 1000
                + data['line_num'][i]
            )

            # word_id counts only the words kept on the current line.
            if line_id != current_line_id:
                current_line_id = line_id
                word_id = 0
            else:
                word_id += 1

            left = data['left'][i]
            top = data['top'][i]

            bbox = BoundingBox(
                x_min=float(left),
                y_min=float(top),
                x_max=float(left + data['width'][i]),
                y_max=float(top + data['height'][i]),
                normalized=False,
                page_width=page_width,
                page_height=page_height,
            )

            regions.append(
                OCRRegion(
                    text=text,
                    confidence=confidence,
                    bbox=bbox,
                    page=page_number,
                    line_id=line_id,
                    word_id=word_id,
                    engine="tesseract",
                )
            )
            confidences.append(confidence)

        return regions, confidences

    def _get_tesseract_lang(self) -> str:
        """
        Get Tesseract language string from config.

        Codes listed in ``LANGUAGE_MAP`` are translated; codes that already
        are one of the mapped Tesseract names (e.g. ``"deu"``) pass through
        unchanged; anything else falls back to ``"eng"`` with a warning.

        Returns:
            Unique Tesseract language names joined with ``+`` in config
            order, or ``"eng"`` when no language is configured.
        """
        tesseract_codes = set(self.LANGUAGE_MAP.values())
        langs = []

        for lang in self.config.languages or []:
            if lang in self.LANGUAGE_MAP:
                tess_lang = self.LANGUAGE_MAP[lang]
            elif lang in tesseract_codes:
                tess_lang = lang
            else:
                logger.warning(
                    f"Unsupported OCR language '{lang}', falling back to 'eng'"
                )
                tess_lang = "eng"

            if tess_lang not in langs:
                langs.append(tess_lang)

        return "+".join(langs) if langs else "eng"

    def _build_config(self) -> str:
        """
        Build Tesseract config string.

        Always selects ``--psm PSM_AUTO`` and ``--oem 3``; adds
        ``preserve_interword_spaces=1`` when ``config.return_word_boxes``
        is set.
        """
        config_parts = [
            f"--psm {self.PSM_AUTO}",
            "--oem 3",
        ]

        if self.config.return_word_boxes:
            config_parts.append("-c preserve_interword_spaces=1")

        return " ".join(config_parts)

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this engine can map (``LANGUAGE_MAP`` keys)."""
        return list(self.LANGUAGE_MAP)

    def get_installed_languages(self) -> List[str]:
        """
        Get list of languages installed in Tesseract.

        Returns:
            The languages reported by ``pytesseract.get_languages()``, or
            ``["eng"]`` if that query fails (the failure is logged).

        Raises:
            RuntimeError: If the engine is not yet initialized and
                ``initialize`` fails.
        """
        if not self._initialized:
            self.initialize()

        try:
            return pytesseract.get_languages()
        except Exception as e:
            # Best effort: fall back to the language assumed everywhere else.
            logger.warning(f"Could not get installed languages: {e}")
            return ["eng"]

    def recognize_with_hocr(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> tuple:
        """
        Perform OCR and return hOCR format for detailed layout.

        Runs the regular ``recognize`` pass first and then a separate
        Tesseract pass that produces hOCR.

        Args:
            image: Image as numpy array
            page_number: Page number

        Returns:
            Tuple of (OCRResult, hOCR string). The second element is None
            when hOCR generation fails; the failure is logged as a warning.

        Raises:
            RuntimeError: If the engine is not yet initialized and
                ``initialize`` fails.
        """
        if not self._initialized:
            self.initialize()

        ocr_result = self.recognize(image, page_number)

        # Image conversion and config building sit inside the try so that a
        # bad input degrades to (failed result, None), matching how
        # ``recognize`` reports errors, instead of raising from here.
        try:
            hocr = pytesseract.image_to_pdf_or_hocr(
                Image.fromarray(image),
                lang=self._get_tesseract_lang(),
                config=self._build_config(),
                extension='hocr',
            )
            return ocr_result, hocr.decode('utf-8')
        except Exception as e:
            logger.warning(f"Failed to generate hOCR: {e}")
            return ocr_result, None
| |
|