Spaces:

bayan10
/

bayan-api

Running

bayan-api / debug_pc002.py

update benchmark logs

aea385b 3 days ago

1.22 kB

	import logging
	from src.nlp.spelling.araspell_rules import AraSpellPostProcessor

	# Mock original and result
	original = "الولاد يلعبون بالشاروع"
	result = "الأولاد يلعبون ب الشارع"

	# A one-letter token that appears in the result without having been a
	# standalone token in the original is only inspected when it is one of
	# these letters; any other one-letter token is ignored by the guard.
	_SUSPECT_LETTERS = 'واتيبلفك'
	# Subset of the letters above that can be accepted as a prefix that was
	# split off from the word following it.
	_PREFIX_LETTERS = 'وفبلك'


	def guard_tokenization(source_text: str, candidate_text: str) -> str:
	    """Return *candidate_text* unless it contains a rejected one-letter token.

	    Both texts are split on whitespace. Every one-letter token of the
	    candidate that was not already a one-letter token of the source and
	    that belongs to ``_SUSPECT_LETTERS`` is checked:

	    * It is accepted as a prefix separation when it belongs to
	      ``_PREFIX_LETTERS``, is not the last token, and some source word
	      longer than two characters starts with that letter.
	    * Otherwise the candidate is rejected and *source_text* is returned
	      unchanged; checking stops at the first rejected token.

	    Diagnostic lines are printed for every accepted match and for the
	    rejection, exactly as the flat script did.

	    Args:
	        source_text: The text before correction.
	        candidate_text: The corrected text to validate.

	    Returns:
	        *candidate_text* if no token was rejected, else *source_text*.
	    """
	    source_words = source_text.split()
	    candidate_words = candidate_text.split()
	    source_standalone = {word for word in source_words if len(word) == 1}

	    for idx, token in enumerate(candidate_words):
	        # Only newly introduced one-letter tokens from the suspect set matter.
	        if len(token) != 1 or token in source_standalone:
	            continue
	        if token not in _SUSPECT_LETTERS:
	            continue

	        if token in _PREFIX_LETTERS and idx + 1 < len(candidate_words):
	            # NOTE(review): `combined` is only reported, never compared with
	            # the matching source word -- confirm that this is intended.
	            combined = token + candidate_words[idx + 1]
	            matching_word = next(
	                (
	                    word
	                    for word in source_words
	                    if word.startswith(token) and len(word) > 2
	                ),
	                None,
	            )
	            if matching_word is not None:
	                print(f" Match! ow={matching_word}, w={token}, combined={combined}")
	                continue

	        print(f"[SPELLING] Blocked destructive tokenization '{token}'")
	        return source_text

	    return candidate_text


	orig_words = original.split()
	res_words_list = result.split()

	print(f"Original: {orig_words}")
	print(f"Result: {res_words_list}")

	result = guard_tokenization(original, result)

	print(f"Final result: {result}")