Mirror of https://github.com/qurator-spk/dinglehopper.git
⚡ dinglehopper: Use rapidfuzz for editops
parent 249787686f
commit af8da1d716

5 changed files with 28 additions and 127 deletions
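In short, this commit drops dinglehopper's hand-rolled Wagner-Fischer matrix and backtrace (levenshtein_matrix, levenshtein, seq_editops, and the cache plumbing around them) and delegates to rapidfuzz, which produces edit operations in the same format. A minimal sketch of the replacement call; the import path is the one the diff adds (newer rapidfuzz releases moved this to rapidfuzz.distance.Levenshtein.editops), and the expected values are the ones the updated tests assert:

    # Minimal sketch of the call that replaces seq_editops() below.
    from rapidfuzz.string_metric import levenshtein_editops

    # Ops are (opcode, source_position, destination_position) tuples,
    # the same format the updated tests assert on:
    assert levenshtein_editops("bc", "abc") == [("insert", 0, 0)]
    assert levenshtein_editops("Foolish", "Foo") == [
        ("delete", 3, 3),
        ("delete", 4, 3),
        ("delete", 5, 3),
    ]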
qurator/dinglehopper/align.py

@@ -1,4 +1,5 @@
 from .edit_distance import *
+from rapidfuzz.string_metric import levenshtein_editops


 def align(t1, t2):
@@ -12,7 +13,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
     i = 0
     j = 0
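For context, seq_align consumes these ops with the i/j cursors visible in the hunk, yielding pairs of aligned elements. A simplified sketch of that consumption loop; only the ops/i/j setup is verbatim from the hunk, the loop body is an illustrative reconstruction rather than dinglehopper's exact code:

    from rapidfuzz.string_metric import levenshtein_editops

    def seq_align_sketch(s1, s2):
        """Yield aligned (left, right) pairs; None marks a gap."""
        s1, s2 = list(s1), list(s2)
        ops = levenshtein_editops(s1, s2)
        i = 0
        j = 0
        while i < len(s1) or j < len(s2):
            # Take the pending op if it applies at the current cursor position.
            o = ops.pop(0) if ops and ops[0][1] == i and ops[0][2] == j else None
            if o and o[0] == "insert":
                yield None, s2[j]   # element only present on the right
                j += 1
            elif o and o[0] == "delete":
                yield s1[i], None   # element only present on the left
                i += 1
            else:
                yield s1[i], s2[j]  # replacement or untouched match
                i += 1
                j += 1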
qurator/dinglehopper/edit_distance.py

@@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm
+from rapidfuzz.string_metric import levenshtein, levenshtein_editops

 from .extracted_text import ExtractedText
-from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
 @multimethod
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings

-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
     return distance(s1.text, s2.text)


-def seq_editops(seq1, seq2):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
 def editops(word1, word2):
     """
     Return sequence of edit operations transforming one string to another.
@@ -141,4 +40,4 @@ def editops(word1, word2):
     """
     word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
     word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    return levenshtein_editops(word1, word2)
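Note what survives the rewrite: distance() still compares NFC-normalized grapheme clusters rather than raw code points, which is what makes it correct for Unicode text. A quick self-contained illustration of why that matters (the example string is hypothetical, not from the repo):

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    # "o" + combining diaeresis renders as one user-perceived character.
    decomposed = "scho\u0308n"   # "schön" with the umlaut as a combining mark

    print(len(decomposed))                           # 6 code points
    print(len(list(grapheme_clusters(decomposed))))  # 5 grapheme clusters
    print(unicodedata.normalize("NFC", decomposed))  # "schön", 5 code points

    # So distance("schon", "scho\u0308n") counts one replacement, not an
    # insertion of a stray combining character.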
qurator/dinglehopper/ocrd_cli.py

@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string

 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear

 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                 local_filename=report_prefix + report_suffix,
             )

-            # Clear cache between files
-            levenshtein_matrix_cache_clear()

 if __name__ == "__main__":
     ocrd_dinglehopper()
qurator/dinglehopper/tests/test_editops.py

@@ -1,35 +1,38 @@
 import unicodedata

-from .. import seq_editops, editops
+from rapidfuzz.string_metric import levenshtein_editops
+from .. import editops
+
+# TODO: Remove rapidfuzz.string_metric.levenshtein_editops tests eventually


 def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+    assert levenshtein_editops("abc", "abc") == []
+    assert levenshtein_editops("", "") == []


 def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+    assert levenshtein_editops("bc", "abc") == [("insert", 0, 0)]
+    assert levenshtein_editops("ac", "abc") == [("insert", 1, 1)]
+    assert levenshtein_editops("ab", "abc") == [("insert", 2, 2)]
+    assert levenshtein_editops("", "a") == [("insert", 0, 0)]


 def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+    assert levenshtein_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]


 def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
+    assert levenshtein_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+    assert levenshtein_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+    assert levenshtein_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+    assert levenshtein_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+    assert levenshtein_editops("Foo", "") == [
         ("delete", 0, 0),
         ("delete", 1, 0),
         ("delete", 2, 0),
     ]
-    assert seq_editops("Foolish", "Foo") == [
+    assert levenshtein_editops("Foolish", "Foo") == [
         ("delete", 3, 3),
         ("delete", 4, 3),
         ("delete", 5, 3),
@@ -38,10 +41,10 @@ def test_delete():
     ]


 def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
+    assert levenshtein_editops("bcd", "abcef") == [
         ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
+        ("insert", 2, 3),
+        ("replace", 2, 4),
     ]
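The test_ambiguous update is the only behavioral change: when several minimal edit scripts exist, rapidfuzz may pick a different one than the old backtrace did. Both scripts are valid, which a small checker confirms; apply_editops below is a hypothetical helper written for this illustration, not part of dinglehopper:

    def apply_editops(src, dst, ops):
        # Hypothetical helper: replay (op, src_pos, dst_pos) tuples against src.
        out = []
        i = 0
        for op, spos, dpos in ops:
            out.extend(src[i:spos])  # copy elements untouched so far
            i = spos
            if op == "insert":
                out.append(dst[dpos])
            elif op == "replace":
                out.append(dst[dpos])
                i += 1
            elif op == "delete":
                i += 1
        out.extend(src[i:])
        return "".join(out)

    # Old backtrace result and new rapidfuzz result for "bcd" -> "abcef":
    old_ops = [("insert", 0, 0), ("replace", 2, 3), ("insert", 3, 4)]
    new_ops = [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]
    assert apply_editops("bcd", "abcef", old_ops) == "abcef"
    assert apply_editops("bcd", "abcef", new_ops) == "abcef"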