You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1018 B
Python
44 lines
1018 B
Python
5 years ago
|
from .edit_distance import *
|
||
|
|
||
|
|
||
|
def align(t1, t2):
    """Align two texts grapheme cluster by grapheme cluster.

    Each input is NFC-normalized and split into a list of grapheme
    clusters; the two lists are then handed to ``seq_align``, whose
    result is returned unchanged.
    """
    # Same pipeline for both inputs, evaluated for t1 first and then t2.
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize('NFC', text)))
        for text in (t1, t2)
    )
    return seq_align(clusters1, clusters2)
|
||
|
|
||
|
|
||
|
def seq_align(s1, s2):
    """Align general sequences.

    Both inputs are materialized as lists, the edit operations between
    them are obtained from ``seq_editops``, and the two lists are then
    walked in lockstep.

    Yields 2-tuples ``(left, right)``:

    * ``(None, s2[j])`` for an ``'insert'`` op,
    * ``(s1[i], None)`` for a ``'delete'`` op,
    * ``(s1[i], s2[j])`` for a ``'replace'`` op and for every position
      that has no edit op attached.

    NOTE(review): assumes ``seq_editops`` returns an indexable, sized
    sequence of ``(kind, i, j)`` entries ordered by position - confirm
    against the edit_distance module.
    """
    s1 = list(s1)
    s2 = list(s2)
    ops = seq_editops(s1, s2)

    len1 = len(s1)
    len2 = len(s2)
    op_count = len(ops)

    # Cursor into ``ops``. Advancing an index instead of re-slicing the
    # list (``ops = ops[1:]``) avoids copying the remaining ops each time
    # one is consumed.
    next_op = 0
    i = 0
    j = 0

    while i < len1 or j < len2:
        # Consume the pending op only if it applies to the current position.
        op = None
        if next_op < op_count:
            candidate = ops[next_op]
            if candidate[1] == i and candidate[2] == j:
                op = candidate
                next_op += 1

        if op is None:
            # No edit at this position: the elements are paired as they are.
            yield (s1[i], s2[j])
            i += 1
            j += 1
            continue

        kind = op[0]
        if kind == 'insert':
            yield (None, s2[j])
            j += 1
        elif kind == 'delete':
            yield (s1[i], None)
            i += 1
        elif kind == 'replace':
            yield (s1[i], s2[j])
            i += 1
            j += 1
        # An op of any other kind is consumed without yielding or advancing.
|