bayan-api / debug_pc002.py
youssefreda9's picture
update benchmark logs
aea385b
Raw
History Blame Contribute Delete
1.22 kB
import logging
from src.nlp.spelling.araspell_rules import AraSpellPostProcessor
# Mock original and result
original = "الولاد يلعبون بالشاروع"
result = "الأولاد يلعبون ب الشارع"
orig_standalone = set(w for w in original.split() if len(w) == 1)
orig_words = original.split()
res_words_list = result.split()
print(f"Original: {orig_words}")
print(f"Result: {res_words_list}")
for idx, w in enumerate(res_words_list):
if len(w) == 1 and w not in orig_standalone:
if w in 'واتيبلفك':
is_prefix_separation = False
if w in 'وفبلك' and idx + 1 < len(res_words_list):
next_word = res_words_list[idx + 1]
combined = w + next_word
for ow in orig_words:
if ow.startswith(w) and len(ow) > 2:
print(f" Match! ow={ow}, w={w}, combined={combined}")
is_prefix_separation = True
break
if not is_prefix_separation:
print(f"[SPELLING] Blocked destructive tokenization '{w}'")
result = original
break
print(f"Final result: {result}")