1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-09 10:29:56 +02:00
dinglehopper/qurator/dinglehopper/edit_distance.py

48 lines
1.6 KiB
Python
Raw Normal View History

import unicodedata
from multimethod import multimethod
2022-08-29 22:08:25 +02:00
from uniseg2.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein
2020-10-08 13:33:19 +02:00
from .extracted_text import ExtractedText
@multimethod
def distance(seq1: list[str], seq2: list[str]):
    """Compute the Levenshtein edit distance between two lists of strings.

    The list elements are compared exactly as given: this overload performs
    no Unicode normalization and no grapheme cluster segmentation itself, it
    simply hands both sequences to ``Levenshtein.distance``.

    Use the ``distance(str, str)`` overload to compare two raw Unicode
    strings; it normalizes and segments its inputs before comparing.
    """
    return Levenshtein.distance(seq1, seq2)
2022-08-29 01:50:19 +02:00
@multimethod
def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Each input is normalized to NFC and then segmented into grapheme
    clusters; the distance is computed over the resulting cluster sequences
    rather than over the raw characters of the strings.
    """
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Return the Levenshtein edit distance between two ExtractedText objects.

    The comparison runs over each object's ``grapheme_clusters`` attribute.
    """
    clusters1 = s1.grapheme_clusters
    clusters2 = s2.grapheme_clusters
    return Levenshtein.distance(clusters1, clusters2)
def editops(word1: str, word2: str):
    """Return the sequence of edit operations transforming word1 into word2.

    Both strings are normalized to NFC and split into grapheme clusters
    before they are compared.

    Note that the positions in the result therefore refer to the _grapheme
    clusters_ of the normalized strings, not to characters!

    The return value is whatever ``Levenshtein.editops(...).as_list()``
    produces for the two cluster sequences.
    """
    clusters1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    clusters2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
    return Levenshtein.editops(clusters1, clusters2).as_list()