mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-03 07:29:59 +02:00
🚧 dinglehopper: Guarantee NFC + rename from_text → from_str
This commit is contained in:
parent
7843824eaf
commit
a17ee2afec
5 changed files with 29 additions and 13 deletions
|
@@ -77,14 +77,16 @@ def distance(s1, s2):
|
|||
clusters. This should be the correct way to compare two Unicode strings.
|
||||
"""
|
||||
|
||||
if isinstance(s1, ExtractedText):
|
||||
s1 = s1.text
|
||||
if isinstance(s2, ExtractedText):
|
||||
s2 = s2.text
|
||||
# XXX Implicit normalization
|
||||
if isinstance(s1, str):
|
||||
s1 = ExtractedText.from_str(s1)
|
||||
if isinstance(s2, str):
|
||||
s2 = ExtractedText.from_str(s2)
|
||||
# s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
|
||||
|
||||
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
||||
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
||||
return levenshtein(s1, s2)
|
||||
seq1 = list(grapheme_clusters(s1.text))
|
||||
seq2 = list(grapheme_clusters(s2.text))
|
||||
return levenshtein(seq1, seq2)
|
||||
|
||||
|
||||
def seq_editops(seq1, seq2):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue