| import logging |
| from src.nlp.spelling.araspell_rules import AraSpellPostProcessor |
|
|
| |
# Stand-alone repro of the single-character token guard.
# `original` is the input text and `result` is the candidate correction.
# The loop below reverts `result` to `original` when the correction
# introduces a new one-character token that cannot be explained as a
# leading character split off from one of the original words.
original = "الولاد يلعبون بالشاروع"
result = "الأولاد يلعبون ب الشارع"

orig_words = original.split()
res_words_list = result.split()
# One-character words already present in the input are never flagged.
orig_standalone = {tok for tok in orig_words if len(tok) == 1}

print(f"Original: {orig_words}")
print(f"Result: {res_words_list}")

for pos, token in enumerate(res_words_list):
    # Skip anything that is not a newly introduced one-character token.
    if len(token) != 1 or token in orig_standalone:
        continue
    # Only this fixed set of letters is subject to the guard.
    if token not in 'واتيبلفك':
        continue

    detached_prefix = False
    has_follower = pos + 1 < len(res_words_list)
    if has_follower and token in 'وفبلك':
        # `combined` is only reported in the trace line; the match itself
        # looks at the original words, not at the re-joined token.
        combined = token + res_words_list[pos + 1]
        for source_word in orig_words:
            if len(source_word) > 2 and source_word.startswith(token):
                print(f" Match! ow={source_word}, w={token}, combined={combined}")
                detached_prefix = True
                break

    if detached_prefix:
        continue

    # Unexplained one-character token: discard the correction entirely.
    print(f"[SPELLING] Blocked destructive tokenization '{token}'")
    result = original
    break

print(f"Final result: {result}")
|
|