Switch from custom Levenshtein to python-Levenshtein
As the distance and editops calculations are a performance bottleneck in this application, we replaced the custom Levenshtein implementation with the C implementation from the python-Levenshtein package. We now also provide separate entrypoints for texts with and without Unicode normalization, because normalization can be done more efficiently once, during preprocessing.
parent 0e263cfac2
commit e371da899e
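
Before the diff itself, a minimal usage sketch of the split entrypoints described above. The import path is an assumption about the repository layout, and the snippet is illustrative rather than part of the commit:

# Minimal sketch; the module path below is an assumption, not part of
# the commit.
from qurator.dinglehopper.edit_distance import distance, distance_unicode

gt = "Schly\u00f1"    # "ñ" as the precomposed code point U+00F1
ocr = "Schlym\u0303"  # "m" + U+0303 COMBINING TILDE: one grapheme cluster

# Fast entrypoint: assumes preprocessing already normalized the input and
# compares code points as-is, so the combining tilde counts separately.
print(distance(gt, ocr))          # 2 (replace "ñ" -> "m", insert U+0303)

# Unicode-aware entrypoint: NFC-normalizes and segments into grapheme
# clusters first, so "m̃" is compared as a single symbol.
print(distance_unicode(gt, ocr))  # 1 (one cluster replaced)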
@@ -1,183 +1,136 @@
 from __future__ import division, print_function

 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple

-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm

 from .extracted_text import ExtractedText
-from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is an LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
-@multimethod
-def distance(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
-
-    Note that this is different from levenshtein() as this function knows about Unicode
-    normalization and grapheme clusters.
-
-    This should be the correct way to compare two Unicode strings.
-    """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
-
-
-@multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
-
-
-@multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
-
-
-def distance_fast(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
-
-    Also see `distance()`.
-
-    The difference is that this implementation does not care about grapheme clusters or
-    unicode normalization, assuming that this already has been done in preprocessing.
-    """
-    return c_distance(s1, s2)
-
-
-@multimethod
-def editops(seq1: List, seq2: List):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator

-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
-@multimethod
-def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
-
-    Note that this returns indices to the _grapheme clusters_, not characters!
-    """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
-
-
-def editops_fast(s1: str, s2: str):
-    """Return sequence of edit operations transforming one string to another.
-
-    Also see `editops()`.
-
-    The difference is that this implementation does not care about grapheme clusters or
-    unicode normalization, assuming that this already has been done in preprocessing.
-    """
-    return c_editops(s1, s2)
+@multimethod
+def distance_unicode(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)
+
+
+@multimethod
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)
+
+
+@multimethod
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return distance(s1.text, s2.text)
+
+
+@multimethod
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
+    """
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)
+
+
+@multimethod
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)
+
+
+@multimethod
+def editops(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
+
+
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+
+
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str], List[str]]]:
+    """Transform two text sequences to unicode representation.
+
+    Normalize to unicode and decide whether we have wide chars
+    that need to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
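
The transform_lists() helper is what lets the C implementation, which only accepts strings, diff arbitrary sequences such as word lists: every unique element is mapped to a one-character stand-in. A self-contained sketch of the same trick; the word_editops wrapper is a hypothetical name, not part of the commit:

from itertools import chain
from Levenshtein import distance as c_distance, editops as c_editops

def word_editops(words1, words2):
    # Hypothetical wrapper illustrating the chr() mapping from the diff:
    # every unique element becomes a single stand-in character, limited to
    # 1,114,111 distinct elements (the number of Unicode code points).
    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(words1, words2)))}
    s1 = "".join(mapping[el] for el in words1)
    s2 = "".join(mapping[el] for el in words2)
    # Returned indices refer to positions in the word lists, not characters.
    return c_distance(s1, s2), c_editops(s1, s2)

dist, ops = word_editops("the quick fox".split(), "the quick brown fox".split())
print(dist)  # 1
print(ops)   # [('insert', 2, 2)]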
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
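
The other preprocessing half, transform_unicode(), decides between the two representations. A sketch of its two return shapes, mirroring the logic in the diff (assumes the uniseg package is installed):

import unicodedata
from itertools import chain
from uniseg.graphemecluster import grapheme_clusters

def transform_unicode(s1, s2):
    # Same logic as in the diff above, repeated here to be self-contained.
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    if all(len(s) < 2 for s in chain(s1, s2)):
        s1, s2 = "".join(s1), "".join(s2)
    return s1, s2

# "a" + U+0308 fuses into the precomposed "ä" under NFC: every cluster is a
# single code point, so plain strings come back and the C fast path applies.
print(transform_unicode("Ka\u0308se", "Kase"))  # ('Käse', 'Kase')

# "m" + U+0303 has no precomposed form: that cluster stays two code points
# wide, so lists are kept and comparison happens per grapheme cluster.
print(transform_unicode("m\u0303", "m"))        # (['m̃'], ['m'])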