import unicodedata
from typing import List

from multimethod import multimethod
from rapidfuzz.distance import Levenshtein
from uniseg.graphemecluster import grapheme_clusters

from .extracted_text import ExtractedText


@multimethod
def distance(seq1: List[str], seq2: List[str]):
    """Compute the Levenshtein edit distance between two lists of grapheme clusters.

    This assumes that the grapheme clusters are already normalized.

    Use distance(str, str) instead if you need to compare two Unicode strings.
    """
    return Levenshtein.distance(seq1, seq2)
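
# Usage sketch (illustrative only, not part of the module API): both arguments are
# expected to be grapheme-cluster lists, e.g. produced the same way as in the
# str/str overload below:
#
#   clusters = list(grapheme_clusters(unicodedata.normalize("NFC", "Schlym̃")))
#   distance(clusters, ["S", "c", "h", "l", "y", "m"])  # one substitution ("m̃" vs "m") -> 1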


@multimethod
def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings.

    Note that this is different from levenshtein() as this function knows about Unicode
    normalization and grapheme clusters. This should be the correct way to compare two
    Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    return Levenshtein.distance(seq1, seq2)
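
# Usage sketch (illustrative only): NFC normalization and grapheme clustering make
# canonically equivalent strings compare equal and count a combining sequence as a
# single edit:
#
#   distance("\u00e9", "e\u0301")       # precomposed vs. decomposed "é" -> 0
#   distance("Schlym\u0303", "Schlyn")  # "m̃" vs. "n" is one substitution -> 1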


@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Compute the Levenshtein edit distance between two ExtractedText objects.

    This uses their precomputed grapheme clusters.
    """
    return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)


def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.

    Note that this returns indices to the _grapheme clusters_, not characters!
    """
    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
    return Levenshtein.editops(word1, word2).as_list()
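
# Usage sketch (illustrative only): the result is a list of (tag, src_pos, dest_pos)
# tuples whose positions index grapheme clusters, not code points:
#
#   editops("Schly\u00f1", "Schlym\u0303")  # "ñ" -> "m̃": [("replace", 5, 5)]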