import difflib
import re
|
|
def get_word_positions(text: str) -> list:
    """Return each whitespace-delimited word in *text* with its character span.

    Args:
        text: The string to tokenise.

    Returns:
        A list of ``(word, start, end)`` tuples in order of appearance, where
        ``text[start:end] == word``. The list is empty when *text* contains
        no non-whitespace characters.
    """
    # r'\S+' matches maximal runs of non-whitespace, so offsets index the
    # original string directly (no normalisation or stripping is applied).
    return [(m.group(), m.start(), m.end()) for m in re.finditer(r'\S+', text)]
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
# Hard-coded sample pair whose word-level differences are printed below.
grammar_input = "القصة طويل ومملل"
grammar_output = "القصة طويلة ومملة"


print(f"Grammar Input: '{grammar_input}' (len={len(grammar_input)})")
print(f"Grammar Output: '{grammar_output}' (len={len(grammar_output)})")


# Tokenise both strings into (word, start, end) triples.
orig_words = get_word_positions(grammar_input)
corr_words = get_word_positions(grammar_output)


print("\nInput words:")
for word, start, end in orig_words:
    print(f" '{word}' [{start}:{end}]")


print("\nOutput words:")
for word, start, end in corr_words:
    print(f" '{word}' [{start}:{end}]")


# Align the two word sequences; opcode indices are word positions, which are
# mapped back to character offsets in grammar_input below.
s = difflib.SequenceMatcher(
    None,
    [w[0] for w in orig_words],
    [w[0] for w in corr_words],
)
print("\nOpcodes:")
for tag, i1, i2, j1, j2 in s.get_opcodes():
    print(f" {tag}: orig_words[{i1}:{i2}] vs corr_words[{j1}:{j2}]")
    if tag != 'equal':
        if i1 < i2:
            # 'replace' / 'delete': span runs from the first affected word's
            # start to the last affected word's end.
            start_char = orig_words[i1][1]
            end_char = orig_words[i2 - 1][2]
        else:
            # 'insert' has i1 == i2, i.e. no original words are covered.
            # Indexing orig_words[i1] would raise IndexError for an insertion
            # at the end, and orig_words[i2 - 1] would select the wrong word,
            # so report a zero-width span at the insertion point instead.
            if i1 < len(orig_words):
                start_char = orig_words[i1][1]
            else:
                start_char = len(grammar_input)
            end_char = start_char
        orig_slice = grammar_input[start_char:end_char]
        corr_text = " ".join(w[0] for w in corr_words[j1:j2])
        print(f" [{start_char}:{end_char}] '{orig_slice}' → '{corr_text}'")
|
|
| |
| |
| |
| |
| |
|
|