You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.6 KiB
Python
49 lines
1.6 KiB
Python
import unicodedata
|
|
from typing import List
|
|
|
|
from multimethod import multimethod
|
|
from rapidfuzz.distance import Levenshtein
|
|
from uniseg.graphemecluster import grapheme_clusters
|
|
|
|
from .extracted_text import ExtractedText
|
|
|
|
|
|
@multimethod
def distance(seq1: List[str], seq2: List[str]):
    """Return the Levenshtein edit distance between two grapheme cluster lists.

    The lists are compared exactly as given: no Unicode normalization is
    applied here, so the clusters are expected to be normalized already.

    To compare two raw Unicode strings, use distance(str, str) instead.
    """
    edit_distance = Levenshtein.distance(seq1, seq2)
    return edit_distance
|
|
|
|
|
|
@multimethod
def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Each string is first normalized to NFC and then split into grapheme
    clusters; the distance is computed on the resulting cluster lists. Because
    it is aware of Unicode normalization and grapheme clusters, this should be
    the correct way to compare two Unicode strings.
    """
    # Same preparation for both inputs, evaluated in order: s1 first, then s2.
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)
|
|
|
|
|
|
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Return the Levenshtein edit distance between two ExtractedText objects.

    The comparison is performed on the ``grapheme_clusters`` attribute of each
    argument.
    """
    clusters1 = s1.grapheme_clusters
    clusters2 = s2.grapheme_clusters
    return Levenshtein.distance(clusters1, clusters2)
|
|
|
|
|
|
def editops(word1, word2):
    """
    Return the sequence of edit operations transforming word1 into word2.

    Both inputs are normalized to NFC and split into grapheme clusters before
    the edit operations are computed.

    Note that the result contains indices to the _grapheme clusters_, not characters!
    """
    source_clusters = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    target_clusters = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
    operations = Levenshtein.editops(source_clusters, target_clusters)
    return operations.as_list()
|