| | """ |
| | Text processing utilities for the efficient-context library. |
| | """ |
| |
|
| | import re |
| | from typing import List, Dict, Any |
| | import logging |
| |
|
| | |
| | logging.basicConfig(level=logging.INFO) |
| | logger = logging.getLogger(__name__) |
| |
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Prefers NLTK's tokenizer when the package is installed; otherwise falls
    back to a lightweight regex-based splitter.

    Args:
        text: Text to split

    Returns:
        sentences: List of sentences
    """
    # Flatten newlines first so line breaks are not mistaken for boundaries.
    flattened = text.replace('\n', ' ')

    try:
        import nltk
    except ImportError:
        logger.warning("NLTK not available, using fallback sentence splitter")
        return _simple_sentence_split(flattened)

    try:
        return nltk.sent_tokenize(flattened)
    except Exception as e:
        # e.g. missing 'punkt' model data — degrade gracefully.
        logger.warning(f"NLTK sentence tokenizer error: {e}. Using fallback.")
        return _simple_sentence_split(flattened)
| |
|
| | def _simple_sentence_split(text: str) -> List[str]: |
| | """Fallback sentence splitter without dependencies.""" |
| | |
| | |
| | for abbr in ['Mr.', 'Mrs.', 'Dr.', 'vs.', 'e.g.', 'i.e.', 'etc.']: |
| | text = text.replace(abbr, abbr.replace('.', '<POINT>')) |
| | |
| | |
| | sentences = re.split(r'(?<=[.!?])\s+', text) |
| | |
| | |
| | sentences = [s.replace('<POINT>', '.') for s in sentences] |
| | |
| | |
| | return [s for s in sentences if s.strip()] |
| |
|
def get_sentence_importance(sentences: List[str]) -> List[float]:
    """
    Calculate importance scores for sentences based on heuristics.

    Combines three signals: sentence length (saturating at ~20 words),
    presence of importance-signalling keywords, and presence of digits.

    Args:
        sentences: List of sentences to score

    Returns:
        importances: List of importance scores (0.0 to 1.0)
    """
    # Keywords hinting at important content. Hoisted out of the loop and
    # stored as a set for O(1) membership tests.
    keywords = {'important', 'significant', 'key', 'critical', 'crucial',
                'essential', 'main', 'major', 'primary', 'central',
                'result', 'conclusion', 'finding', 'discovered', 'shows'}

    importances = []

    for sentence in sentences:
        words = sentence.split()

        # Length signal: longer sentences score higher, capped at 20 words.
        length_score = min(len(words) / 20, 1.0)

        # Keyword signal: 0.2 per keyword hit, capped at 0.6.
        # Strip surrounding punctuation so tokens like "finding." or
        # "result," still match (the previous raw comparison missed them).
        keyword_score = 0.0
        for word in words:
            if word.lower().strip('.,;:!?()[]"\'') in keywords:
                keyword_score += 0.2
        keyword_score = min(keyword_score, 0.6)

        # Numeric signal: sentences containing digits often carry data.
        number_score = 0.2 if re.search(r'\d', sentence) else 0.0

        # Weighted combination, clamped to [0, 1].
        score = 0.5 * length_score + 0.3 * keyword_score + 0.2 * number_score
        importances.append(min(score, 1.0))

    return importances
| |
|
def calculate_text_overlap(text1: str, text2: str) -> float:
    """
    Calculate simple text overlap between two strings.

    The score is the number of shared lowercase tokens divided by the
    token count of the smaller text (an overlap coefficient).

    Args:
        text1: First text
        text2: Second text

    Returns:
        overlap_ratio: Ratio of shared tokens (0.0 to 1.0)
    """
    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    # If either text has no tokens there is nothing to compare.
    if not (vocab_a and vocab_b):
        return 0.0

    shared = vocab_a & vocab_b
    smaller = min(len(vocab_a), len(vocab_b))
    return len(shared) / smaller
| |
|