You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1.4 KiB
Python
44 lines
1.4 KiB
Python
5 years ago
|
from __future__ import division, print_function
|
||
|
|
||
|
import unicodedata
|
||
|
from functools import partial, lru_cache
|
||
|
from typing import Sequence, Tuple
|
||
|
|
||
|
import numpy as np
|
||
4 years ago
|
from multimethod import multimethod
|
||
5 years ago
|
from uniseg.graphemecluster import grapheme_clusters
|
||
4 years ago
|
from tqdm import tqdm
|
||
2 years ago
|
from rapidfuzz.distance import Levenshtein
|
||
5 years ago
|
|
||
4 years ago
|
from .extracted_text import ExtractedText
|
||
4 years ago
|
from .config import Config
|
||
4 years ago
|
|
||
5 years ago
|
|
||
4 years ago
|
@multimethod
def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Both strings are normalized to NFC and split into grapheme clusters before
    they are compared, so the distance is counted in grapheme clusters rather
    than in individual code points.
    """
    # Normalize and segment both inputs the same way (s1 first, then s2).
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)
|
||
4 years ago
|
|
||
4 years ago
|
|
||
4 years ago
|
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Return the edit distance between the texts of two ExtractedText objects.

    Reads the ``text`` attribute of each argument and dispatches ``distance``
    again on those values.
    """
    # NOTE(review): presumably ``.text`` is a str, so this lands in the
    # str/str overload -- confirm against ExtractedText.
    text1, text2 = s1.text, s2.text
    return distance(text1, text2)
|
||
5 years ago
|
|
||
|
|
||
|
def editops(word1, word2):
    """
    Return the edit operations that transform ``word1`` into ``word2``.

    Both words are NFC-normalized and split into grapheme clusters first, so
    the indices in the result refer to _grapheme clusters_, not characters!
    """

    def as_clusters(text):
        # Same preprocessing for both words: NFC, then grapheme segmentation.
        return list(grapheme_clusters(unicodedata.normalize("NFC", text)))

    source_clusters = as_clusters(word1)
    target_clusters = as_clusters(word2)
    operations = Levenshtein.editops(source_clusters, target_clusters)
    return operations.as_list()
|