Switch from custom Levenshtein to python-Levenshtein

As the distance and editops calculations are a performance bottleneck in
this application, we replaced the custom Levenshtein implementation with
the C implementation from the python-Levenshtein package.

We now also provide separate entry points for texts with and without
Unicode normalization, because normalization can be done more efficiently
once during preprocessing.
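
For illustration, a minimal sketch of the idea behind the split, assuming
the python-Levenshtein package is installed; the example strings are made
up, and the real entry points are the ones introduced in the diff below:

    import unicodedata

    # python-Levenshtein ships C implementations of both operations.
    from Levenshtein import distance as c_distance, editops as c_editops

    # Raw entry point: assumes normalization already happened in preprocessing.
    assert c_distance("kitten", "sitting") == 3
    ops = c_editops("kitten", "sitting")  # list of (op, src_pos, dst_pos) tuples

    # Unicode-aware entry point: normalize once up front, then use the fast path.
    s1 = unicodedata.normalize("NFC", "Schlyñ")        # precomposed n with tilde
    s2 = unicodedata.normalize("NFC", "Schlyn\u0303")  # n + combining tilde
    assert c_distance(s1, s2) == 0                     # equal after normalization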
pull/48/head
Benjamin Rosemann 4 years ago
parent 0e263cfac2
commit e371da899e

@@ -6,7 +6,7 @@ from typing import Tuple
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters

-from .edit_distance import distance
+from .edit_distance import distance_unicode
 from .extracted_text import ExtractedText
@@ -18,7 +18,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
-    d = distance(reference, compared)
+    d = distance_unicode(reference, compared)
     n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
     if d == 0:
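
For context, the hunk above computes the character error rate as the edit
distance divided by the reference length measured in grapheme clusters. A
hedged sketch of that formula (the helper name cer_sketch is made up, and it
ignores the multi-codepoint cluster handling of the real code):

    import unicodedata

    from Levenshtein import distance as c_distance
    from uniseg.graphemecluster import grapheme_clusters

    def cer_sketch(reference: str, compared: str) -> float:
        # NFC-normalize first, then count grapheme clusters, not code points.
        ref = unicodedata.normalize("NFC", reference)
        cmp_ = unicodedata.normalize("NFC", compared)
        d = c_distance(ref, cmp_)
        if d == 0:
            return 0.0
        n = len(list(grapheme_clusters(ref)))
        return d / n  # assumes a non-empty reference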

@@ -1,183 +1,136 @@
 from __future__ import division, print_function

 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple

-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm

 from .extracted_text import ExtractedText
-from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
 @multimethod
-def distance(s1: str, s2: str):
+def distance_unicode(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings

-    Note that this is different from levenshtein() as this function knows about Unicode
+    Note that this is different from distance() as this function knows about Unicode
     normalization and grapheme clusters.

     This should be the correct way to compare two Unicode strings.
     """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)


 @multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)


 @multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)


-def distance_fast(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
+@multimethod
+def distance(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two strings.

-    Also see `distance()`.
+    Also see `distance_unicode()`.

     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
     """
     return c_distance(s1, s2)


 @multimethod
-def editops(seq1: List, seq2: List):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-        return result
-
-    b = backtrace(m, n)
-    return b
+def distance(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return distance(s1.text, s2.text)


 @multimethod
-def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.

     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)


-def editops_fast(s1: str, s2: str):
+@multimethod
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)
+
+
+@multimethod
+def editops(s1: str, s2: str):
     """Return sequence of edit operations transforming one string to another.

-    Also see `editops()`.
+    Also see `editops_unicode()`.

     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
     """
     return c_editops(s1, s2)
+
+
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+
+
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str]]]:
+    """Transform two text sequences to unicode representation.
+
+    Normalize to unicode and decides whether we have wide chars
+    that needs to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
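
To make the transform_lists() trick above concrete: every distinct element
gets a one-character codename via chr(), so the C functions (which only
accept strings) can compare arbitrary sequences such as word lists. A hedged
usage sketch with made-up data:

    from itertools import chain
    from typing import List, Tuple

    from Levenshtein import distance as c_distance

    def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
        # Same mapping idea as in the diff above.
        mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
        return "".join(mapping[el] for el in l1), "".join(mapping[el] for el in l2)

    words_a = ["the", "quick", "brown", "fox"]  # made-up example data
    words_b = ["the", "quick", "red", "fox"]
    s1, s2 = transform_lists(words_a, words_b)
    assert c_distance(s1, s2) == 1  # exactly one word was substituted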

File diff suppressed because it is too large.

@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string

 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear

 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                 local_filename=report_prefix + report_suffix,
             )

-            # Clear cache between files
-            levenshtein_matrix_cache_clear()

 if __name__ == "__main__":
     ocrd_dinglehopper()

@@ -2,7 +2,7 @@ import unicodedata
 import pytest

-from .. import distance, distance_fast
+from .. import distance, distance_unicode

 TEST_PARAMS = "s1,s2,expected_dist"
@@ -42,25 +42,13 @@ def test_distance_sequences(s1, s2, expected_dist):
     assert dist == expected_dist


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_strings(s1, s2, expected_dist):
-    dist = distance(s1, s2)
-    assert dist == expected_dist
-
-
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_fast(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
-    assert dist == expected_dist
-
-
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
+def test_distance_with_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist != expected_dist


 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_distance_unicode(s1, s2, expected_dist):
-    dist = distance(s1, s2)
+    dist = distance_unicode(s1, s2)
     assert dist == expected_dist

@@ -2,7 +2,7 @@ import unicodedata
 import pytest

-from .. import editops, editops_fast
+from .. import editops, editops_unicode

 TEST_PARAMS = "s1,s2,expected_ops"
@@ -51,36 +51,22 @@ TEST_UNICODE = [
 ]


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_strings(s1, s2, expected_ops):
-    ops = editops(s1, s2)
-    assert ops == expected_ops
-
-
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_editops_sequences(s1, s2, expected_ops):
+def test_editops(s1, s2, expected_ops):
     ops = editops(s1, s2)
     assert ops == expected_ops


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_fast(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
-    assert ops == expected_ops
-
-
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
+def test_editops_with_unicode(s1, s2, expected_ops):
+    ops = editops(s1, s2)
     assert ops != expected_ops


 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_editops_unicode(s1, s2, expected_ops):
-    """Test editops() in cases where dealing with grapheme clusters matters"""
     if not expected_ops:
         assert s1 != s2
         assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
-    ops = editops(s1, s2)
+    ops = editops_unicode(s1, s2)
     assert ops == expected_ops
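
For readers unfamiliar with python-Levenshtein's editops(), which the
renamed tests above now exercise directly: each operation is a
(tag, source_position, destination_position) tuple, and editops_unicode()
returns the same shape with indices pointing at grapheme clusters. A small
hedged check (the example strings are made up):

    from Levenshtein import editops as c_editops

    ops = c_editops("spam", "park")
    # e.g. operations such as ('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)
    assert len(ops) == 3  # the number of editops equals the edit distance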

@@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3 # latest version to officially support Python 3.5
 tqdm
+python-levenshtein
