"""
DocumentAnalysisAgent for Patent Wake-Up Scenario

Analyzes patent documents to extract key information for valorization:
- Patent structure (title, abstract, claims, description)
- Technical assessment (TRL, innovations, domains)
- Commercialization potential
"""

import json
import re
from typing import Any, Dict, Optional, Tuple

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from loguru import logger

from ..base_agent import BaseAgent, Task
from ...llm.langchain_ollama_client import LangChainOllamaClient
from ...workflow.langgraph_state import PatentAnalysis, Claim


class DocumentAnalysisAgent(BaseAgent):
    """
    Specialized agent for patent document analysis.

    Extracts patent structure (title, abstract, claims, metadata) and a
    technology/commercialization assessment, and packs both into a
    ``PatentAnalysis``. Two modes exist (see ``analyze_patent``): a fast
    heuristic mode that makes no LLM calls, and an LLM mode that runs two
    JSON-producing chains.
    """

    # Character budgets applied to text that is interpolated into LLM prompts.
    MAX_PROMPT_TEXT_CHARS = 8000      # raw patent text sent for structure extraction
    MAX_PROMPT_ABSTRACT_CHARS = 1000  # abstract sent for assessment
    MAX_PROMPT_CLAIM_CHARS = 200      # per-claim text sent for assessment
    MAX_PROMPT_CLAIMS = 3             # number of independent claims sent for assessment

    def __init__(self, llm_client: LangChainOllamaClient, memory_agent=None, vision_ocr_agent=None):
        """
        Initialize DocumentAnalysisAgent.

        Args:
            llm_client: LangChain Ollama client
            memory_agent: Optional memory agent for context retrieval
            vision_ocr_agent: Optional VisionOCRAgent for enhanced text extraction
        """
        # NOTE(review): BaseAgent.__init__ is not invoked here; name and
        # description are assigned directly. Confirm BaseAgent requires no
        # further initialization.
        self.name = "DocumentAnalysisAgent"
        self.description = "Patent document analysis and assessment"

        self.llm_client = llm_client
        self.memory_agent = memory_agent
        self.vision_ocr_agent = vision_ocr_agent

        # Single LLM handle shared by both chains.
        self.llm = llm_client.get_llm('standard')

        # Chains are built once and reused for every analysis.
        self.structure_chain = self._create_structure_chain()
        self.assessment_chain = self._create_assessment_chain()

        if vision_ocr_agent:
            logger.info("Initialized DocumentAnalysisAgent with VisionOCR support")
        else:
            logger.info("Initialized DocumentAnalysisAgent")

    def _create_structure_chain(self):
        """
        Create chain for extracting patent structure.

        Returns:
            A ``prompt | llm | JsonOutputParser`` runnable expecting the inputs
            ``patent_text`` and ``format_instructions``.
        """
        parser = JsonOutputParser()

        # Doubled braces ({{ / }}) are literal braces in the prompt template.
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert patent analyst. Extract structured information from patent text.

        CRITICAL: You MUST respond with ONLY valid JSON. Do NOT include any explanatory text, notes, or comments.
        Do NOT say "Based on the provided text..." or "Note that..." or any other prose.
        Your response must start with {{ and end with }}.
        If information is missing, use null or empty arrays []."""),
            ("human", """
        Analyze this patent text and extract the following information:

        1. Patent ID/Number (if mentioned)
        2. Title
        3. Abstract
        4. All independent claims (claims that don't depend on other claims)
        5. All dependent claims (claims that reference other claims)
        6. Inventors
        7. Assignees
        8. Filing and publication dates (if mentioned)
        9. IPC classification codes (if mentioned)

        Patent Text:
        {patent_text}

        {format_instructions}

        IMPORTANT: Respond with ONLY the JSON object. No additional text before or after the JSON.
        """)
        ])

        return prompt | self.llm | parser

    def _create_assessment_chain(self):
        """
        Create chain for technology and commercialization assessment.

        Returns:
            A ``prompt | llm | JsonOutputParser`` runnable expecting the inputs
            ``title``, ``abstract``, ``key_claims`` and ``format_instructions``.
        """
        parser = JsonOutputParser()

        # Doubled braces ({{ / }}) are literal braces in the prompt template.
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert in technology commercialization and TRL assessment.

        CRITICAL: You MUST respond with ONLY valid JSON. Do NOT include any explanatory text, notes, or comments.
        Do NOT say "I'll provide an assessment..." or "Please note that..." or any other prose.
        Your response must start with {{ and end with }}.
        If information is missing, provide reasonable estimates based on available data."""),
            ("human", """
        Assess this patent for commercialization potential:

        Title: {title}
        Abstract: {abstract}
        Key Claims: {key_claims}

        {format_instructions}

        TRL Guidelines:
        - TRL 1-3: Basic research, proof of concept
        - TRL 4-6: Technology development, prototype testing
        - TRL 7-9: System demonstration, operational

        Provide assessment as JSON with:
        1. technical_domains: 3-5 technical domains (array of strings)
        2. key_innovations: 3-5 key innovations (array of strings)
        3. novelty_assessment: Brief assessment of what makes this novel (string)
        4. trl_level: Technology readiness level 1-9 (integer)
        5. trl_justification: Reasoning for TRL level (string)
        6. commercialization_potential: High/Medium/Low (string)
        7. potential_applications: 3-5 potential applications (array of strings)
        8. confidence_score: 0.0-1.0 (float)

        IMPORTANT: Respond with ONLY the JSON object. No additional text before or after the JSON.
        """)
        ])

        return prompt | self.llm | parser

    async def analyze_patent(self, patent_path: str, fast_mode: bool = True) -> PatentAnalysis:
        """
        Analyze a patent document and return structured analysis.

        Args:
            patent_path: Path to patent PDF or text file
            fast_mode: Use fast heuristic extraction (default True for speed).
                When False, structure and assessment come from the LLM chains.

        Returns:
            PatentAnalysis object with all extracted information
        """
        logger.info(f"📄 Analyzing patent: {patent_path}")

        patent_text = await self._extract_patent_text(patent_path)

        if fast_mode:
            logger.info("Using fast heuristic extraction mode")
            structure, assessment = self._heuristic_analysis(patent_text)
        else:
            logger.info("Using LLM-based extraction (slower but more accurate)")
            structure, assessment = await self._llm_analysis(patent_text, patent_path)

        analysis = self._build_patent_analysis(structure, assessment, patent_text)

        logger.success(f"✅ Patent analysis complete: TRL {analysis.trl_level}, "
                       f"{len(analysis.key_innovations)} innovations identified")

        return analysis

    def _heuristic_analysis(self, patent_text: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Build structure/assessment dicts without any LLM call (fast mode)."""
        title, abstract = self._extract_fallback_title_abstract(patent_text)

        # Only title and abstract are derived from the document; claims and
        # metadata are left empty in this mode.
        structure = {
            'title': title,
            'abstract': abstract,
            'independent_claims': [],
            'dependent_claims': [],
            'inventors': [],
            'assignees': [],
            'patent_id': None,
            'ipc_classification': [],
        }

        # Static placeholder assessment: these values are NOT computed from
        # the document content.
        assessment = {
            'technical_domains': ['Technology Transfer', 'Innovation'],
            'key_innovations': ['Patent document analysis'],
            'novelty_assessment': 'Preliminary assessment based on document content',
            'trl_level': 6,
            'trl_justification': 'Estimated based on document type',
            'commercialization_potential': 'Medium',
            'potential_applications': ['Technology licensing', 'Research collaboration'],
            'confidence_score': 0.7,
        }

        return structure, assessment

    async def _llm_analysis(self, patent_text: str, patent_path: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Run the structure and assessment chains and return their JSON results as dicts."""
        # Best-effort memory lookup; a failure must not abort the analysis.
        # NOTE(review): the retrieved context is only logged and is not yet
        # injected into either prompt.
        if self.memory_agent:
            try:
                context = await self.memory_agent.retrieve_relevant_context(
                    query=f"patent analysis {patent_path}",
                    context_type="semantic",
                    top_k=2,
                )
                if context:
                    logger.debug(f"Retrieved {len(context)} context documents from memory")
            except Exception as e:
                logger.warning(f"Memory retrieval failed: {e}")

        # Format instructions are identical for both chains; compute them once.
        format_instructions = JsonOutputParser().get_format_instructions()

        logger.info("Extracting patent structure...")
        structure = await self.structure_chain.ainvoke({
            "patent_text": patent_text[:self.MAX_PROMPT_TEXT_CHARS],
            "format_instructions": format_instructions,
        })
        # The parser returns whatever JSON the model produced; anything that is
        # not an object is unusable downstream, so fall back to an empty dict.
        if not isinstance(structure, dict):
            logger.warning("Structure extraction did not return a JSON object; using empty structure")
            structure = {}

        logger.info("Assessing technology and commercialization potential...")
        assessment = await self.assessment_chain.ainvoke({
            # The prompt lets the model answer null for missing fields, so
            # `.get(key, default)` is not enough: the key exists with None.
            "title": structure.get('title') or 'Unknown',
            "abstract": str(structure.get('abstract') or '')[:self.MAX_PROMPT_ABSTRACT_CHARS],
            "key_claims": self._format_key_claims(structure.get('independent_claims')),
            "format_instructions": format_instructions,
        })
        if not isinstance(assessment, dict):
            logger.warning("Assessment did not return a JSON object; using default assessment")
            assessment = {}

        return structure, assessment

    def _format_key_claims(self, claims) -> str:
        """Render the first few independent claims as prompt text, one claim per line."""
        valid_claims = [c for c in (claims or []) if isinstance(c, dict)]
        if not valid_claims:
            return "No claims available"

        rendered = []
        for claim in valid_claims[:self.MAX_PROMPT_CLAIMS]:
            number = claim.get('claim_number', 'N/A')
            # claim_text may be present but null in the model output.
            text = str(claim.get('claim_text') or '')[:self.MAX_PROMPT_CLAIM_CHARS]
            rendered.append(f"Claim {number}: {text}...")
        return "\n".join(rendered)

    async def _extract_patent_text(self, patent_path: str) -> str:
        """
        Extract text from patent PDF or text file.

        Paths ending in ``.pdf`` (case-insensitive) are read with PyMuPDF;
        every other path is read as UTF-8 text. On any failure the error is
        logged and the built-in mock patent text is returned instead.

        Args:
            patent_path: Path to patent file

        Returns:
            Extracted text content, or the mock patent text if extraction failed
        """
        try:
            if patent_path.lower().endswith('.pdf'):
                # PyMuPDF; imported lazily so the module loads without it
                # when only text files are processed.
                import fitz

                doc = fitz.open(patent_path)
                try:
                    num_pages = len(doc)
                    text_parts = [page.get_text() for page in doc]
                finally:
                    # Release the file handle even if a page fails to parse.
                    doc.close()

                result = "\n\n".join(text_parts)
                logger.info(f"Extracted {num_pages} pages from PDF")
            else:
                with open(patent_path, 'r', encoding='utf-8') as f:
                    result = f.read()

            if len(result) < 100:
                logger.warning(f"Document very short ({len(result)} chars)")

            return result

        except Exception as e:
            # Deliberate best-effort: keep the pipeline running on demo data,
            # but make the substitution visible in the logs.
            logger.error(f"Failed to extract text from {patent_path}: {e}")
            logger.warning("Falling back to built-in mock patent text")
            return self._get_mock_patent_text()

    async def _extract_with_ocr(self, patent_path: str) -> Optional[str]:
        """
        Extract text using VisionOCRAgent (for image-based PDFs or enhanced extraction).

        Note: This requires converting PDF pages to images first.
        For the demo, this is a foundation for future enhancement; the
        current implementation performs no OCR and always returns None.

        Args:
            patent_path: Path to patent PDF

        Returns:
            OCR-extracted text or None if OCR not available
        """
        if not self.vision_ocr_agent or not self.vision_ocr_agent.is_available():
            return None

        try:
            logger.info("Enhanced OCR extraction available (foundation for future use)")
            # TODO: render PDF pages to images and pass them to the OCR agent.
            return None

        except Exception as e:
            logger.warning(f"OCR extraction failed: {e}")
            return None

    def _get_mock_patent_text(self) -> str:
        """Get mock patent text for demonstration purposes"""
        return """
        PATENT NUMBER: US20210123456

        TITLE: AI-Powered Drug Discovery Platform Using Machine Learning

        ABSTRACT:
        A novel method and system for accelerating drug discovery using artificial intelligence
        and machine learning techniques. The invention provides automated analysis of molecular
        structures, prediction of drug-target interactions, and optimization of lead compounds.
        The system employs deep learning models trained on large-scale pharmaceutical databases
        to identify promising drug candidates with improved efficacy and reduced development time.

        CLAIMS:

        1. A computer-implemented method for drug discovery comprising:
        (a) receiving molecular structure data for a plurality of compounds;
        (b) processing said molecular data using a trained neural network model;
        (c) predicting binding affinity scores for each compound;
        (d) identifying top candidates based on predicted scores and safety profiles.

        2. The method of claim 1, wherein the neural network is a convolutional neural network
        trained on over 1 million known drug-target interactions.

        3. The method of claim 1, further comprising optimizing lead compounds using generative
        adversarial networks to improve pharmacokinetic properties.

        4. A system for automated drug discovery comprising:
        (a) a database of molecular structures and pharmaceutical data;
        (b) a machine learning module configured to predict drug efficacy;
        (c) an optimization module for refining lead compounds;
        (d) a user interface for visualizing results and candidate rankings.

        5. The system of claim 4, wherein the machine learning module employs ensemble methods
        combining multiple predictive models for improved accuracy.

        DETAILED DESCRIPTION:
        The present invention relates to pharmaceutical research and drug discovery, specifically
        to methods and systems for using artificial intelligence to accelerate the identification
        and optimization of drug candidates. Traditional drug discovery is time-consuming and
        expensive, often taking 10-15 years and costing billions of dollars. This invention
        addresses these challenges by automating key steps in the drug discovery pipeline.

        The system comprises a comprehensive database of molecular structures, known drug-target
        interactions, and clinical trial data. Machine learning models, including deep neural
        networks and ensemble methods, are trained on this data to learn patterns associated
        with successful drugs. The trained models can then predict the efficacy and safety of
        new compounds, dramatically reducing the time and cost of initial screening.

        Key innovations include:
        1. Novel neural network architecture optimized for molecular structure analysis
        2. Automated lead optimization using generative AI
        3. Integration of multi-omic data for comprehensive drug profiling
        4. Real-time candidate ranking and visualization tools

        The technology has been validated through retrospective analysis of FDA-approved drugs
        and prospective testing on novel compounds. Results demonstrate 70% reduction in screening
        time and identification of candidates with 40% higher predicted efficacy than traditional methods.

        INVENTORS: Dr. Sarah Chen, Dr. Michael Rodriguez, Dr. Yuki Tanaka
        ASSIGNEE: BioAI Pharmaceuticals Inc.
        FILING DATE: January 15, 2021
        PUBLICATION DATE: June 24, 2021
        IPC: G16C 20/30, G16H 20/10, G06N 3/08
        """

    def _extract_fallback_title_abstract(self, patent_text: str) -> Tuple[str, str]:
        """
        Extract title and abstract using simple heuristics when LLM extraction fails.
        Useful for non-standard patent formats or press releases.

        Title: the first of the first 15 non-empty lines that is 16-149
        characters long, does not start with ``-`` or ``=``, and is not made
        up solely of ``=-_*`` characters; otherwise ``"Document Analysis"``.

        Abstract: lines longer than 50 characters that follow the title line,
        joined with spaces and capped at 500 characters (ellipsis included).
        If nothing follows the title, long lines from the first 30 lines are
        used; if there are none, ``"No summary available"``.

        Args:
            patent_text: Raw text from PDF (None is treated as empty)

        Returns:
            Tuple of (title, abstract)
        """
        lines = [line.strip() for line in (patent_text or "").split('\n') if line.strip()]

        title = "Document Analysis"
        for line in lines[:15]:
            if (15 < len(line) < 150 and
                    not line.startswith('-') and
                    not line.startswith('=') and
                    not all(c in '=-_*' for c in line)):
                title = line
                break

        abstract_parts = []
        found_title = False
        skip_count = 0

        for line in lines:
            # Ignore everything up to and including the title line.
            if not found_title:
                if line == title:
                    found_title = True
                    skip_count = 0
                continue

            # The first two lines after the title are dropped when short
            # (typically metadata such as dates or identifiers).
            if skip_count < 2:
                skip_count += 1
                if len(line) < 50:
                    continue

            # Only substantial lines contribute to the abstract.
            if len(line) > 50:
                abstract_parts.append(line)

                # Stop as soon as enough text has been collected.
                joined = ' '.join(abstract_parts)
                if len(joined) > 400:
                    abstract = joined[:497] + "..."
                    break
        else:
            # Loop ended without collecting enough text (no `break`).
            if not abstract_parts:
                # Nothing usable after the title: scan the document head.
                for line in lines[:30]:
                    if len(line) > 50:
                        abstract_parts.append(line)
                        if len(' '.join(abstract_parts)) > 300:
                            break

            abstract = ' '.join(abstract_parts) if abstract_parts else "No summary available"

        # Enforce the 500-character cap on the no-break path as well.
        if len(abstract) > 500 and not abstract.endswith("..."):
            abstract = abstract[:497] + "..."

        logger.info(f"Fallback extraction: title='{title[:60]}', abstract={len(abstract)} chars")
        return title, abstract

    @staticmethod
    def _parse_claims(raw_claims, kind: str) -> list:
        """Convert raw claim dicts into Claim objects, skipping entries that cannot be built."""
        claims = []
        for raw in raw_claims or []:
            if not isinstance(raw, dict):
                continue
            try:
                claims.append(Claim(**raw))
            except (TypeError, ValueError) as e:
                # NOTE(review): assumes Claim signals bad/missing fields with
                # TypeError or ValueError (pydantic's ValidationError is a
                # ValueError) - confirm against Claim's definition.
                logger.warning(f"Skipping malformed {kind} claim: {e}")
        return claims

    def _build_patent_analysis(self, structure: Dict[str, Any], assessment: Dict[str, Any],
                               patent_text: str = "") -> PatentAnalysis:
        """
        Build PatentAnalysis object from structure and assessment data.

        Missing or null fields are replaced with defaults. Title and abstract
        fall back to heuristic extraction from ``patent_text`` when absent or
        equal to the placeholder values 'Patent Analysis' /
        'Abstract not available'.

        Args:
            structure: Extracted patent structure
            assessment: Technology assessment
            patent_text: Original patent text (for fallback extraction)

        Returns:
            Complete PatentAnalysis object
        """
        # One malformed claim from the model must not discard the whole analysis.
        independent_claims = self._parse_claims(structure.get('independent_claims'), 'independent')
        dependent_claims = self._parse_claims(structure.get('dependent_claims'), 'dependent')

        title = structure.get('title')
        abstract = structure.get('abstract')

        title_missing = not title or title == 'Patent Analysis'
        abstract_missing = not abstract or abstract == 'Abstract not available'

        if title_missing or abstract_missing:
            logger.info("Using fallback title/abstract extraction")
            fallback_title, fallback_abstract = self._extract_fallback_title_abstract(patent_text)

            if title_missing:
                title = fallback_title
            if abstract_missing:
                abstract = fallback_abstract

        # Last-resort defaults.
        if not title:
            title = 'Document Analysis'
        if not abstract:
            abstract = 'No description available'

        # A reported confidence of 0.0 is meaningful, so default only on None
        # rather than on any falsy value.
        confidence_score = assessment.get('confidence_score')
        if confidence_score is None:
            confidence_score = 0.8

        return PatentAnalysis(
            patent_id=structure.get('patent_id') or 'UNKNOWN',
            title=title,
            abstract=abstract,

            # Claims
            independent_claims=independent_claims,
            dependent_claims=dependent_claims,
            total_claims=len(independent_claims) + len(dependent_claims),

            # Technical details
            ipc_classification=structure.get('ipc_classification') or [],
            technical_domains=assessment.get('technical_domains') or ['Technology'],
            key_innovations=assessment.get('key_innovations') or [],
            novelty_assessment=assessment.get('novelty_assessment') or 'Novel approach',

            # Commercialization
            trl_level=assessment.get('trl_level') or 5,
            trl_justification=assessment.get('trl_justification') or 'Technology development stage',
            commercialization_potential=assessment.get('commercialization_potential') or 'Medium',
            potential_applications=assessment.get('potential_applications') or [],

            # Metadata
            inventors=structure.get('inventors') or [],
            assignees=structure.get('assignees') or [],
            filing_date=structure.get('filing_date'),
            publication_date=structure.get('publication_date'),

            # Quality indicators
            confidence_score=confidence_score,
            extraction_completeness=0.9 if independent_claims else 0.6,
        )

    async def process_task(self, task: Task) -> Task:
        """
        Process task using agent interface.

        Sets ``task.status`` to "in_progress", then to "completed" (with the
        dumped analysis in ``task.result``) or "failed" (with the error text
        in ``task.error``). Exceptions are captured on the task, not raised.

        Args:
            task: Task with patent_path in metadata

        Returns:
            Task with PatentAnalysis result
        """
        task.status = "in_progress"

        try:
            patent_path = task.metadata.get('patent_path')
            if not patent_path:
                raise ValueError("patent_path required in task metadata")

            analysis = await self.analyze_patent(patent_path)

            task.result = analysis.model_dump()
            task.status = "completed"

        except Exception as e:
            # Task boundary: report the failure on the task instead of raising.
            logger.error(f"Document analysis failed: {e}")
            task.status = "failed"
            task.error = str(e)

        return task