Spaces:
Running
Running
| import re | |
# Sentence boundary: whitespace that directly follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

# Sentences of this many characters or fewer are not reported as clauses.
_MIN_CLAUSE_CHARS = 30


def _iter_sentences_with_lines(raw_text):
    """Yield ``(sentence, line)`` for each non-empty sentence of *raw_text*, in order.

    ``line`` is the 1-based line (counted by ``'\\n'``) on which the stripped
    sentence starts inside *raw_text*.
    """
    cursor = 0   # offset just past the previously located sentence
    line_no = 1  # line number at ``cursor``
    for piece in _SENTENCE_BOUNDARY.split(raw_text):
        sentence = piece.strip()
        if not sentence:
            continue
        # Search forward from the previous sentence only: a global search
        # would match an earlier occurrence of the same text (e.g. inside a
        # longer preceding sentence) and report the wrong line.
        start = raw_text.find(sentence, cursor)
        if start == -1:
            # Defensive only: split pieces are always found in order.
            start = cursor
        line_no += raw_text.count("\n", cursor, start)
        yield sentence, line_no
        # Account for newlines inside the sentence before moving on.
        line_no += sentence.count("\n")
        cursor = start + len(sentence)


def extract_clauses(text_data):
    """
    Extract unique clauses from text chunks together with location data.

    Each chunk's text is split into sentences at whitespace following '.',
    '!' or '?'. Sentences longer than 30 characters (after stripping) are
    kept as clauses; a clause whose exact text was already emitted, in this
    or an earlier chunk, is skipped.

    Args:
        text_data: Iterable of dicts with 'text' and 'page' keys. A missing
            or ``None`` 'text' is treated as empty; a missing 'page'
            defaults to 1. ``None`` or an empty iterable yields ``[]``.

    Returns:
        List of dicts ``{'id', 'text', 'page', 'line'}`` where 'id' is a
        0-based running index over the returned clauses and 'line' is the
        1-based line, within the chunk's text, on which the clause starts.
    """
    if not text_data:
        return []

    unique_clauses = []
    seen = set()

    for chunk in text_data:
        # ``or ""`` also covers an explicit ``None`` value for 'text'.
        raw_text = chunk.get("text") or ""
        page_num = chunk.get("page", 1)

        for sentence, line in _iter_sentences_with_lines(raw_text):
            if len(sentence) <= _MIN_CLAUSE_CHARS or sentence in seen:
                continue
            seen.add(sentence)
            unique_clauses.append({
                "id": len(unique_clauses),  # ids are consecutive from 0
                "text": sentence,
                "page": page_num,
                "line": line,
            })

    return unique_clauses