Mirror of https://github.com/qurator-spk/dinglehopper.git
Switch from custom Levenshtein to python-Levenshtein
Because the distance and editops calculations are a performance bottleneck in this application, we replaced the custom Levenshtein implementation with the C implementation from the python-Levenshtein package. There are now also separate entry points for texts with and without Unicode normalization, since normalization can be done more efficiently once, during preprocessing.
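The practical difference between the two kinds of entry points, sketched as standalone code (illustration only, not part of the changed code; normalized_distance is a made-up name, while the real entry points in the diff below are distance_unicode()/editops_unicode() and distance()/editops()):

import unicodedata

from Levenshtein import distance as c_distance  # the C implementation

def normalized_distance(s1: str, s2: str) -> int:
    # What the *_unicode entry points pay for on every call: NFC normalization
    # (and, in the real code, grapheme-cluster handling) before computing the
    # distance. The plain entry points skip this and assume it was already
    # done once during preprocessing.
    return c_distance(unicodedata.normalize("NFC", s1),
                      unicodedata.normalize("NFC", s2))

precomposed = "Löve"        # ö as a single code point (U+00F6)
decomposed = "Lo\u0308ve"   # o followed by a combining diaeresis

print(normalized_distance(precomposed, decomposed))  # 0: same text after NFC
print(c_distance(precomposed, decomposed))           # 2: raw code points differ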
parent 0e263cfac2
commit e371da899e
7 changed files with 98 additions and 1210 deletions
@@ -6,7 +6,7 @@ from typing import Tuple
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 
-from .edit_distance import distance
+from .edit_distance import distance_unicode
 from .extracted_text import ExtractedText
 
 
@@ -18,7 +18,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
 
-    d = distance(reference, compared)
+    d = distance_unicode(reference, compared)
     n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
 
     if d == 0:
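For orientation, the character error rate computed here is the Unicode-aware edit distance divided by the number of grapheme clusters in the reference. A standalone sketch of that relationship (illustration only; cer_sketch is a made-up name, and the real character_error_rate_n() returns the rate together with the reference length and handles the d == 0 case explicitly, as in the hunk above):

import unicodedata

from Levenshtein import distance as c_distance
from uniseg.graphemecluster import grapheme_clusters

def cer_sketch(reference: str, compared: str) -> float:
    # Simplified stand-in for distance_unicode(): normalize once, then use the
    # C distance. The real code also deals with multi-code-point grapheme clusters.
    d = c_distance(unicodedata.normalize("NFC", reference),
                   unicodedata.normalize("NFC", compared))
    # The reference length is counted in grapheme clusters, not code points.
    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
    return 0.0 if d == 0 else d / n

print(cer_sketch("dinglehopper", "dinglehoppers"))  # 1/12 ≈ 0.083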
@@ -1,113 +1,59 @@
 from __future__ import division, print_function
 
 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple
 
-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm
 
 from .extracted_text import ExtractedText
-from .config import Config
 
 
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
 @multimethod
-def distance(s1: str, s2: str):
+def distance_unicode(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode
+    Note that this is different from distance() as this function knows about Unicode
     normalization and grapheme clusters.
 
     This should be the correct way to compare two Unicode strings.
     """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)
 
 
 @multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
-
-
-@multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
-
-
-def distance_fast(s1: str, s2: str):
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Also see `distance()`.
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)
+
+
+@multimethod
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
 
     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
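The @multimethod decorator used above dispatches a single name on the runtime argument types, which is how distance() and editops() can each have a string, list, and ExtractedText variant in the code above and below. A minimal, self-contained illustration of that dispatch (the function name describe is made up):

from typing import List

from multimethod import multimethod

@multimethod
def describe(a: str, b: str):
    return "comparing two strings"

@multimethod
def describe(a: List, b: List):
    return "comparing two lists"

print(describe("foo", "bar"))            # comparing two strings
print(describe(["f", "o"], ["b", "a"]))  # comparing two lists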
@@ -116,68 +62,75 @@ def distance_fast(s1: str, s2: str):
 
 
 @multimethod
-def editops(seq1: List, seq2: List):
+def distance(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
     """
-    Return sequence of edit operations transforming one sequence to another.
+    return distance(s1.text, s2.text)
 
-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
+
+@multimethod
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)
+
+
+@multimethod
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)
 
 
 @multimethod
 def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
-
-    Note that this returns indices to the _grapheme clusters_, not characters!
-    """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
-
-
-def editops_fast(s1: str, s2: str):
     """Return sequence of edit operations transforming one string to another.
 
-    Also see `editops()`.
+    Also see `editops_unicode()`.
 
     The difference is that this implementation does not care about grapheme clusters or
     unicode normalization, assuming that this already has been done in preprocessing.
     """
     return c_editops(s1, s2)
+
+
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+
+
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str]]]:
+    """Transform two text sequences to unicode representation.
+
+    Normalize to unicode and decides whether we have wide chars
+    that needs to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
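The key trick that makes the C implementation usable for word lists and grapheme-cluster lists is transform_lists() above: every distinct element is mapped to one unique code point, turning a sequence comparison into a string comparison. A standalone sketch of the same idea (illustration only; as_strings is a made-up name):

from itertools import chain
from typing import List, Tuple

from Levenshtein import distance as c_distance

def as_strings(l1: List, l2: List) -> Tuple[str, str]:
    # Assign each distinct element one code point; chr() caps this at
    # 1,114,111 distinct elements, as noted in the docstring above.
    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
    return "".join(mapping[el] for el in l1), "".join(mapping[el] for el in l2)

# Word-level edit distance, for example:
s1, s2 = as_strings("the quick brown fox".split(), "the quick black fox".split())
print(c_distance(s1, s2))  # 1: one word substituted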
File diff suppressed because it is too large
@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string
 
 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear
 
 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
 
@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                     local_filename=report_prefix + report_suffix,
                 )
 
-            # Clear cache between files
-            levenshtein_matrix_cache_clear()
 
 if __name__ == "__main__":
     ocrd_dinglehopper()
@@ -2,7 +2,7 @@ import unicodedata
 
 import pytest
 
-from .. import distance, distance_fast
+from .. import distance, distance_unicode
 
 
 TEST_PARAMS = "s1,s2,expected_dist"
@@ -42,25 +42,13 @@ def test_distance_sequences(s1, s2, expected_dist):
     assert dist == expected_dist
 
 
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_strings(s1, s2, expected_dist):
-    dist = distance(s1, s2)
-    assert dist == expected_dist
-
-
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_fast(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
-    assert dist == expected_dist
-
-
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
+def test_distance_with_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist != expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_distance_unicode(s1, s2, expected_dist):
-    dist = distance(s1, s2)
+    dist = distance_unicode(s1, s2)
     assert dist == expected_dist
@@ -2,7 +2,7 @@ import unicodedata
 
 import pytest
 
-from .. import editops, editops_fast
+from .. import editops, editops_unicode
 
 TEST_PARAMS = "s1,s2,expected_ops"
 
@@ -51,36 +51,22 @@ TEST_UNICODE = [
 ]
 
 
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_strings(s1, s2, expected_ops):
-    ops = editops(s1, s2)
-    assert ops == expected_ops
-
-
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_editops_sequences(s1, s2, expected_ops):
+def test_editops(s1, s2, expected_ops):
     ops = editops(s1, s2)
     assert ops == expected_ops
 
 
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_fast(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
-    assert ops == expected_ops
-
-
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
+def test_editops_with_unicode(s1, s2, expected_ops):
+    ops = editops(s1, s2)
     assert ops != expected_ops
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_editops_unicode(s1, s2, expected_ops):
-    """Test editops() in cases where dealing with grapheme clusters matters"""
-
     if not expected_ops:
         assert s1 != s2
         assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
-    ops = editops(s1, s2)
+    ops = editops_unicode(s1, s2)
     assert ops == expected_ops
@@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3 # latest version to officially support Python 3.5
 tqdm
+python-levenshtein
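The new dependency provides the Levenshtein.distance and Levenshtein.editops C functions that the code above calls as c_distance and c_editops; a quick sanity check of their behavior (illustration only, the printed editops output is an example):

import Levenshtein

assert Levenshtein.distance("flaw", "lawn") == 2
print(Levenshtein.editops("flaw", "lawn"))
# e.g. [('delete', 0, 0), ('insert', 4, 3)], i.e. (operation, source position, destination position)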