Switch from custom Levenshtein to python-Levenshtein

As the distance and editops calculations are a performance bottleneck in
this application, we replaced the custom Levenshtein implementation with
the C implementation from the python-Levenshtein package.

We now also have separate entry points for texts with and without Unicode
normalization, because the normalization can be done more efficiently once
during preprocessing.
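
For illustration, a minimal sketch of the resulting call pattern (the import path and example strings are assumptions for this sketch, not part of the commit):

    from qurator.dinglehopper.edit_distance import distance, distance_unicode

    # Unicode-aware entry point: normalizes to NFC and compares grapheme
    # clusters, so a precomposed "ñ" equals "n" + combining tilde.
    assert distance_unicode("Schlyñ", "Schlyn\u0303") == 0

    # Plain entry point: assumes preprocessing already happened and lets the
    # C implementation compare the inputs as-is (here: one substitution plus
    # one insertion).
    assert distance("Schlyñ", "Schlyn\u0303") == 2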
pull/48/head
Benjamin Rosemann 4 years ago
parent 0e263cfac2
commit e371da899e

@@ -6,7 +6,7 @@ from typing import Tuple
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from .edit_distance import distance
+from .edit_distance import distance_unicode
 from .extracted_text import ExtractedText
@@ -18,7 +18,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
-    d = distance(reference, compared)
+    d = distance_unicode(reference, compared)
     n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
     if d == 0:
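
As a quick illustration of what the hunk above computes, a hedged sketch with hypothetical example strings (the character error rate is the edit distance divided by the reference length in grapheme clusters):

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    reference, compared = "Schlyñ", "Schlym"  # one substituted character

    d = 1  # what distance_unicode(reference, compared) would return here
    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))  # 6
    cer = d / n  # 1/6 ≈ 0.167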

@@ -1,183 +1,136 @@
-from __future__ import division, print_function
 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple
-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm
 from .extracted_text import ExtractedText
-from .config import Config
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
+@multimethod
+def distance_unicode(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
+    This should be the correct way to compare two Unicode strings.
     """
-    m = len(seq1)
-    n = len(seq2)
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-    return D
+@multimethod
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two Unicode strings
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
+@multimethod
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
+    Also see `distance_unicode()`.
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
     """
-    _levenshtein_matrix.cache_clear()
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)
 @multimethod
 def distance(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
+    """Compute the Levenshtein edit distance between two strings.
-    Note that this is different from levenshtein() as this function knows about Unicode
-    normalization and grapheme clusters.
+    Also see `distance_unicode()`.
-    This should be the correct way to compare two Unicode strings.
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
     """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
+    return c_distance(s1, s2)
 @multimethod
 def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
-@multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
-def distance_fast(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
+    """Compute the Levenshtein edit distance between two strings.
-    Also see `distance()`.
+    Also see `distance_unicode()`.
     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
     """
-    return c_distance(s1, s2)
+    return distance(s1.text, s2.text)
 @multimethod
-def editops(seq1: List, seq2: List):
-    """
-    Return sequence of edit operations transforming one sequence to another.
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
+    Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-        return result
-    b = backtrace(m, n)
-    return b
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)
 @multimethod
-def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.
-    Note that this returns indices to the _grapheme clusters_, not characters!
+    Also see `editops_unicode()`.
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
     """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)
-def editops_fast(s1: str, s2: str):
+@multimethod
+def editops(s1: str, s2: str):
     """Return sequence of edit operations transforming one string to another.
-    Also see `editops()`.
+    Also see `editops_unicode()`.
     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
     """
     return c_editops(s1, s2)
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str]]]:
+    """Transform two text sequences to unicode representation.
+    Normalize to unicode and decides whether we have wide chars
+    that needs to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
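
The `transform_lists()` helper added above makes the C string functions usable for lists of words or grapheme clusters by encoding each distinct element as a single character. A small sketch of the effect (the word lists here are hypothetical examples):

    from itertools import chain
    from Levenshtein import editops as c_editops

    l1 = ["the", "quick", "fox"]
    l2 = ["the", "lazy", "fox"]

    # One private character per distinct element; chr() limits this to
    # 1,114,111 distinct elements (the size of the Unicode code space).
    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
    s1 = "".join(mapping[el] for el in l1)
    s2 = "".join(mapping[el] for el in l2)

    # The returned indices refer to positions in the original lists.
    assert c_editops(s1, s2) == [("replace", 1, 1)]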

(File diff suppressed because it is too large.)

@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string
 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear
 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                 local_filename=report_prefix + report_suffix,
             )
-            # Clear cache between files
-            levenshtein_matrix_cache_clear()
 if __name__ == "__main__":
     ocrd_dinglehopper()

@@ -2,7 +2,7 @@ import unicodedata
 import pytest
-from .. import distance, distance_fast
+from .. import distance, distance_unicode
 TEST_PARAMS = "s1,s2,expected_dist"
@@ -42,25 +42,13 @@ def test_distance_sequences(s1, s2, expected_dist):
     assert dist == expected_dist
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_strings(s1, s2, expected_dist):
-    dist = distance(s1, s2)
-    assert dist == expected_dist
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_fast(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
-    assert dist == expected_dist
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
+def test_distance_with_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist != expected_dist
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_distance_unicode(s1, s2, expected_dist):
-    dist = distance(s1, s2)
+    dist = distance_unicode(s1, s2)
     assert dist == expected_dist

@@ -2,7 +2,7 @@ import unicodedata
 import pytest
-from .. import editops, editops_fast
+from .. import editops, editops_unicode
 TEST_PARAMS = "s1,s2,expected_ops"
@@ -51,36 +51,22 @@ TEST_UNICODE = [
 ]
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_strings(s1, s2, expected_ops):
-    ops = editops(s1, s2)
-    assert ops == expected_ops
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_editops_sequences(s1, s2, expected_ops):
+def test_editops(s1, s2, expected_ops):
     ops = editops(s1, s2)
     assert ops == expected_ops
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_fast(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
-    assert ops == expected_ops
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
+def test_editops_with_unicode(s1, s2, expected_ops):
+    ops = editops(s1, s2)
     assert ops != expected_ops
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
     if not expected_ops:
         assert s1 != s2
         assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
-    ops = editops(s1, s2)
+    ops = editops_unicode(s1, s2)
     assert ops == expected_ops

@@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3 # latest version to officially support Python 3.5
 tqdm
+python-levenshtein
