| | """ |
| | VisionOCRAgent for SPARKNET |
| | |
| | Handles OCR and document vision tasks using Ollama's llava model. |
| | Extracts text from images, PDFs, diagrams, and complex documents. |
| | """ |
| |
|
| | import base64 |
| | from pathlib import Path |
| | from typing import Optional, Dict, Any |
| | from loguru import logger |
| | from langchain_ollama import ChatOllama |
| | from langchain_core.messages import HumanMessage |
| |
|
class VisionOCRAgent:
    """
    Specialized agent for vision-based OCR tasks.

    Wraps an Ollama vision-language model (llava by default) and exposes
    task-specific coroutines for text extraction, diagram analysis, table
    extraction, patent-page analysis, and handwriting transcription.
    Every task method accepts a filesystem path to an image and forwards
    it to the model as a base64 ``data:`` URL.
    """

    def __init__(self, model_name: str = "llava:7b", base_url: str = "http://localhost:11434"):
        """
        Initialize VisionOCRAgent.

        Args:
            model_name: Ollama vision model to use (default: llava:7b)
            base_url: Ollama service URL
        """
        self.model_name = model_name
        self.base_url = base_url

        # Low temperature: OCR/transcription should be as deterministic
        # as the model allows.
        self.vision_llm = ChatOllama(
            model=model_name,
            base_url=base_url,
            temperature=0.1,
        )

        logger.info(f"Initialized VisionOCRAgent with model: {model_name}")

    def _encode_image(self, image_path: str) -> str:
        """
        Encode image to base64 for the vision model.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _image_data_url(self, image_path: str) -> str:
        """
        Build a base64 ``data:`` URL for the image at *image_path*.

        The MIME type is guessed from the file extension instead of being
        hard-coded to JPEG (previously PNG/GIF/etc. were mislabeled as
        ``image/jpeg``); unknown or non-image extensions fall back to
        ``image/jpeg`` to preserve the original behavior.
        """
        import mimetypes  # local import: keeps the file-level import block untouched

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        return f"data:{mime_type};base64,{self._encode_image(image_path)}"

    async def _invoke_vision(self, prompt: str, image_path: str) -> str:
        """
        Send *prompt* plus the image at *image_path* to the vision model.

        Shared scaffolding for every public task method (previously
        duplicated five times). Returns the model's text reply.
        """
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": self._image_data_url(image_path),
                },
            ]
        )
        response = await self.vision_llm.ainvoke([message])
        return response.content

    async def extract_text_from_image(
        self,
        image_path: str,
        preserve_formatting: bool = True
    ) -> str:
        """
        Extract text from an image using the vision model.

        Args:
            image_path: Path to image file
            preserve_formatting: Whether to preserve document structure

        Returns:
            Extracted text content

        Raises:
            Exception: Re-raises any failure from file I/O or the model call.
        """
        logger.info(f"📷 Extracting text from: {image_path}")

        try:
            if preserve_formatting:
                prompt = """Extract all text from this image, preserving the original formatting and structure.

Maintain:
- Paragraph breaks and line spacing
- Bullet points and numbered lists
- Section headings and hierarchy
- Table structures if present

Return only the extracted text, formatted as closely as possible to the original."""
            else:
                prompt = "Extract all text from this image. Return only the text content without any additional commentary."

            extracted_text = await self._invoke_vision(prompt, image_path)

            logger.success(f"✅ Extracted {len(extracted_text)} characters from {Path(image_path).name}")
            return extracted_text

        except Exception as e:
            logger.error(f"Failed to extract text from {image_path}: {e}")
            raise

    async def analyze_diagram(self, image_path: str) -> Dict[str, Any]:
        """
        Analyze technical diagrams, flowcharts, and schematics.

        Args:
            image_path: Path to diagram image

        Returns:
            Dictionary with keys ``diagram_type``, ``analysis``, and ``source``.

        Raises:
            Exception: Re-raises any failure from file I/O or the model call.
        """
        logger.info(f"📊 Analyzing diagram: {image_path}")

        try:
            prompt = """Analyze this technical diagram in detail. Provide:

1. Type of diagram (flowchart, circuit, organizational chart, etc.)
2. Main components and elements
3. All text labels and annotations
4. Connections and relationships between elements
5. Overall purpose and meaning

Format your response as structured text."""

            analysis = await self._invoke_vision(prompt, image_path)

            logger.success(f"✅ Analyzed diagram: {Path(image_path).name}")

            return {
                "diagram_type": "technical_diagram",
                "analysis": analysis,
                "source": image_path
            }

        except Exception as e:
            logger.error(f"Failed to analyze diagram {image_path}: {e}")
            raise

    async def extract_table_data(self, image_path: str) -> str:
        """
        Extract data from tables in images.

        Args:
            image_path: Path to image containing table

        Returns:
            Table data in markdown format

        Raises:
            Exception: Re-raises any failure from file I/O or the model call.
        """
        logger.info(f"📋 Extracting table from: {image_path}")

        try:
            prompt = """Extract the table data from this image.

Format the output as a Markdown table with proper alignment:
- Use | for column separators
- Use | --- | for header separator
- Maintain proper column alignment
- Include all rows and columns

Example format:
| Header 1 | Header 2 | Header 3 |
| --- | --- | --- |
| Data 1 | Data 2 | Data 3 |

Return ONLY the table, no additional text."""

            table_markdown = await self._invoke_vision(prompt, image_path)

            logger.success(f"✅ Extracted table from {Path(image_path).name}")
            return table_markdown

        except Exception as e:
            logger.error(f"Failed to extract table from {image_path}: {e}")
            raise

    async def analyze_patent_page(self, image_path: str) -> Dict[str, Any]:
        """
        Specialized analysis for patent document pages.

        Args:
            image_path: Path to patent page image

        Returns:
            Dictionary with keys ``page_content``, ``source``, and ``type``.

        Raises:
            Exception: Re-raises any failure from file I/O or the model call.
        """
        logger.info(f"📄 Analyzing patent page: {image_path}")

        try:
            prompt = """Analyze this patent document page. Extract:

1. Patent number or application number (if visible)
2. Title or heading
3. All body text (claims, descriptions, specifications)
4. Figure numbers and captions
5. Any diagrams or technical drawings descriptions
6. Inventor names and assignee information (if visible)
7. Dates (filing date, publication date, etc.)

Preserve the structure and formatting. Return comprehensive extracted content."""

            analysis = await self._invoke_vision(prompt, image_path)

            logger.success(f"✅ Analyzed patent page: {Path(image_path).name}")

            return {
                "page_content": analysis,
                "source": image_path,
                "type": "patent_page"
            }

        except Exception as e:
            logger.error(f"Failed to analyze patent page {image_path}: {e}")
            raise

    async def identify_handwriting(self, image_path: str) -> str:
        """
        Extract handwritten text from images.

        Args:
            image_path: Path to image with handwritten content

        Returns:
            Extracted handwritten text

        Raises:
            Exception: Re-raises any failure from file I/O or the model call.
        """
        logger.info(f"✍️ Extracting handwriting from: {image_path}")

        try:
            prompt = """This image contains handwritten text. Please:

1. Carefully read all handwritten content
2. Transcribe the text exactly as written
3. Indicate [unclear] for illegible portions
4. Preserve line breaks and spacing
5. Note any annotations or margin notes

Return only the transcribed text."""

            handwriting = await self._invoke_vision(prompt, image_path)

            logger.success(f"✅ Extracted handwriting from {Path(image_path).name}")
            return handwriting

        except Exception as e:
            logger.error(f"Failed to extract handwriting from {image_path}: {e}")
            raise

    def is_available(self) -> bool:
        """
        Check if the configured vision model is available on the Ollama server.

        Queries the server's /api/tags endpoint and looks for the model
        name as a substring of any installed model tag.

        Returns:
            True if model is available, False otherwise (including any
            network or parsing failure, which is logged as a warning).
        """
        try:
            import requests
            # Bounded timeout: previously this call could hang indefinitely
            # if the Ollama server was unreachable.
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get("models", [])
                return any(self.model_name in model.get("name", "") for model in models)
            return False
        except Exception as e:
            logger.warning(f"Could not check model availability: {e}")
            return False
| |
|