diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index c7e7733..45c4835 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,4 +1,5 @@
 from .edit_distance import *
+from rapidfuzz.string_metric import levenshtein_editops
 
 
 def align(t1, t2):
@@ -12,7 +13,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
     i = 0
     j = 0
 
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 0b9c8f4..7fa4ae1 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
+from rapidfuzz.string_metric import levenshtein, levenshtein_editops
 
 from .extracted_text import ExtractedText
 from .config import Config
 
 
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
 @multimethod
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
""" seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) @@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText): return distance(s1.text, s2.text) -def seq_editops(seq1, seq2): - """ - Return sequence of edit operations transforming one sequence to another. - - This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary - sequences. - """ - seq1 = list(seq1) - seq2 = list(seq2) - m = len(seq1) - n = len(seq2) - D = levenshtein_matrix(seq1, seq2) - - def _tail_backtrace(i, j, accumulator): - if i > 0 and D[i - 1, j] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator - ) - if j > 0 and D[i, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: - return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP - return accumulator - - def backtrace(i, j): - result = partial(_tail_backtrace, i, j, []) - while isinstance(result, partial): - result = result() - - return result - - b = backtrace(m, n) - return b - - def editops(word1, word2): """ Return sequence of edit operations transforming one string to another. @@ -141,4 +40,4 @@ def editops(word1, word2): """ word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1))) word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2))) - return seq_editops(word1, word2) + return levenshtein_editops(word1, word2) diff --git a/qurator/dinglehopper/notebooks/Levenshtein.ipynb b/qurator/dinglehopper/notebooks/Levenshtein.ipynb index f56d0d7..8761994 100644 --- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb +++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb @@ -18,62 +18,20 @@ "# Levenshtein edit distance" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "dinglehopper uses to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz." + ] + }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def levenshtein_matrix(seq1, seq2):\n", - " \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n", - "\n", - " This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n", - " edit distance.\n", - "\n", - " This algorithm is implemented here because we need an implementation that can work with sequences other than\n", - " strings, e.g. 
lists of grapheme clusters or lists of word strings.\n", - " \"\"\"\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - "\n", - " def from_to(start, stop):\n", - " return range(start, stop + 1, 1)\n", - "\n", - " D = np.zeros((m + 1, n + 1), np.int)\n", - " D[0, 0] = 0\n", - " for i in from_to(1, m):\n", - " D[i, 0] = i\n", - " for j in from_to(1, n):\n", - " D[0, j] = j\n", - " for i in from_to(1, m):\n", - " for j in from_to(1, n):\n", - " D[i, j] = min(\n", - " D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution\n", - " D[i, j - 1] + 1, # Insertion\n", - " D[i - 1, j] + 1 # Deletion\n", - " )\n", - "\n", - " return D\n", - "\n", - "def levenshtein(seq1, seq2):\n", - " \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - "\n", - " D = levenshtein_matrix(seq1, seq2)\n", - " return D[m, n]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "from edit_distance import levenshtein_matrix, levenshtein\n", - "\n", - "print(inspect.getsource(levenshtein_matrix))\n", - "print(inspect.getsource(levenshtein))" + "from rapidfuzz.string_metric import levenshtein" ] }, { @@ -170,21 +128,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "def distance(s1, s2):\n", + "@multimethod\n", + "def distance(s1: str, s2: str):\n", " \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n", "\n", - " Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n", - " clusters. This should be the correct way to compare two Unicode strings.\n", + " Note that this is different from levenshtein() as this function knows about Unicode\n", + " normalization and grapheme clusters. This should be the correct way to compare two\n", + " Unicode strings.\n", " \"\"\"\n", - " s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n", - " s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n", - " return levenshtein(s1, s2)\n", + " seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n", + " seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n", + " return levenshtein(seq1, seq2)\n", "\n" ] } ], "source": [ - "from edit_distance import distance\n", + "from qurator.dinglehopper.edit_distance import distance\n", "print(inspect.getsource(distance))" ] }, @@ -247,8 +207,7 @@ "source": [ "# Edit operations\n", "\n", - "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n", - "\n" + "python-Levenshtein + RapidFuzz supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:" ] }, { @@ -257,32 +216,20 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('insert', 5, 5), ('replace', 5, 6)]\n" - ] + "data": { + "text/plain": [ + "[('replace', 2, 2)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import Levenshtein\n", - "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", - "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", - "print(Levenshtein.editops(word1, word2))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Defining our own `editops()`. 
(This looks a bit wild due to our own tail recursion handling.)" + "from rapidfuzz.string_metric import levenshtein_editops as editops\n", + "\n", + "editops('Foo', 'Fon')" ] }, { @@ -294,47 +241,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "def seq_editops(seq1, seq2):\n", - " seq1 = list(seq1)\n", - " seq2 = list(seq2)\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - " D = levenshtein_matrix(seq1, seq2)\n", - "\n", - " def _tail_backtrace(i, j, accumulator):\n", - " if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n", - " if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n", - " if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n", - " if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP\n", - " return accumulator\n", - "\n", - " def backtrace(i, j):\n", - " result = partial(_tail_backtrace, i, j, [])\n", - " while isinstance(result, partial):\n", - " result = result()\n", - "\n", - " return result\n", - "\n", - " b = backtrace(m, n)\n", - " return b\n", - "\n", - "def editops(word1, word2):\n", - " # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n", - " word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n", - " word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n", - " return seq_editops(word1, word2)\n", - "\n" + "[('insert', 4, 4)]\n" ] } ], "source": [ - "from edit_distance import seq_editops, editops\n", - "print(inspect.getsource(seq_editops))\n", - "print(inspect.getsource(editops))" + "print(editops('Käptn', 'Käpt\\'n'))" ] }, { @@ -343,18 +255,15 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[('replace', 2, 2)]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[('delete', 6, 6)]\n" + ] } ], "source": [ - "editops('Foo', 'Fon')" + "print(editops('Delete something', 'Deletesomething'))" ] }, { @@ -366,66 +275,76 @@ "name": "stdout", "output_type": "stream", "text": [ - "[('insert', 4, 4)]\n", - "[('insert', 4, 4)]\n" + "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n" ] } ], "source": [ - "print(editops('Käptn', 'Käpt\\'n'))\n", - "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))" + "print(editops('A more difficult example', 'Amore difficült exampl'))" ] }, { - "cell_type": "code", - "execution_count": 13, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('delete', 6, 6)]\n", - "[('delete', 6, 6)]\n" - ] - } - ], "source": [ - "print(editops('Delete something', 'Deletesomething'))\n", - "print(Levenshtein.editops('Delete something', 'Deletesomething'))" + "Let's try it with a difficult example that needs grapheme cluster handling:" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n", - "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n" - ] + "data": { + "text/plain": [ + "[('insert', 5, 5), ('replace', 5, 6)]" + ] + }, + 
"execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(editops('A more difficult example', 'Amore difficült exampl'))\n", - "print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))" + "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", + "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", + "\n", + "editops(word1, word2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "XXX Note that our implementation returns different positions here for the 'insert'. " + "That doesn't look right, let's redefine it with grapheme cluster support:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 14, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def editops(word1, word2):\n", + " \"\"\"\n", + " Return sequence of edit operations transforming one string to another.\n", + "\n", + " Note that this returns indices to the _grapheme clusters_, not characters!\n", + " \"\"\"\n", + " word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n", + " word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n", + " return levenshtein_editops(word1, word2)\n", + "\n" + ] + } + ], "source": [ - "Let's try it with a difficult example that needs grapheme cluster handling:" + "from qurator.dinglehopper.edit_distance import editops\n", + "print(inspect.getsource(editops))" ] }, { @@ -455,7 +374,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "🎉" + "🎉\n", + "\n", + "Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!" ] }, { @@ -489,22 +410,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "def character_error_rate(reference, compared):\n", - " d = distance(reference, compared)\n", - " if d == 0:\n", - " return 0\n", - "\n", - " n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n", - " if n == 0:\n", - " return float('inf')\n", + "def character_error_rate(reference, compared) -> float:\n", + " \"\"\"\n", + " Compute character error rate.\n", "\n", - " return d/n\n", + " :return: character error rate\n", + " \"\"\"\n", + " cer, _ = character_error_rate_n(reference, compared)\n", + " return cer\n", "\n" ] } ], "source": [ - "from character_error_rate import character_error_rate\n", + "from qurator.dinglehopper.character_error_rate import character_error_rate\n", "print(inspect.getsource(character_error_rate))" ] }, @@ -732,16 +651,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "def words(s):\n", + "@multimethod\n", + "def words(s: str):\n", + " \"\"\"Extract words from a string\"\"\"\n", + "\n", " # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n", " # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n", " old_word_break = uniseg.wordbreak.word_break\n", "\n", " def new_word_break(c, index=0):\n", " if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n", - " return 'ALetter'\n", + " return \"ALetter\"\n", " else:\n", " return old_word_break(c, index)\n", + "\n", " uniseg.wordbreak.word_break = new_word_break\n", "\n", " # Check if c is an unwanted character, i.e. 
whitespace, punctuation, or similar\n", @@ -749,8 +672,8 @@ "\n", " # See https://www.fileformat.info/info/unicode/category/index.htm\n", " # and https://unicodebook.readthedocs.io/unicode.html#categories\n", - " unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n", - " unwanted_subcategories = 'Cc', 'Cf'\n", + " unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n", + " unwanted_subcategories = \"Cc\", \"Cf\"\n", "\n", " subcat = unicodedata.category(c)\n", " cat = subcat[0]\n", @@ -778,7 +701,7 @@ } ], "source": [ - "from word_error_rate import words\n", + "from qurator.dinglehopper.word_error_rate import words\n", "print(inspect.getsource(words))\n", "\n", "list(words(example_text))" @@ -905,29 +828,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "def word_error_rate(reference, compared):\n", - " if isinstance(reference, str):\n", - " reference_seq = list(words_normalized(reference))\n", - " compared_seq = list(words_normalized(compared))\n", - " else:\n", - " reference_seq = list(reference)\n", - " compared_seq = list(compared)\n", - "\n", - " d = levenshtein(reference_seq, compared_seq)\n", - " if d == 0:\n", - " return 0\n", - "\n", - " n = len(reference_seq)\n", - " if n == 0:\n", - " return float('inf')\n", - "\n", - " return d / n\n", + "def word_error_rate(reference, compared) -> float:\n", + " wer, _ = word_error_rate_n(reference, compared)\n", + " return wer\n", "\n" ] } ], "source": [ - "from word_error_rate import word_error_rate\n", + "from qurator.dinglehopper.word_error_rate import word_error_rate\n", "print(inspect.getsource(word_error_rate))" ] }, @@ -1002,9 +911,9 @@ "metadata": { "hide_input": false, "kernelspec": { - "display_name": "Python 3", + "display_name": "dinglehopper-github", "language": "python", - "name": "python3" + "name": "dinglehopper-github" }, "language_info": { "codemirror_mode": { @@ -1016,7 +925,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.12" }, "toc": { "base_numbering": 1, diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index adfbbab..7c513e6 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality from pkg_resources import resource_string from .cli import process as cli_process -from .edit_distance import levenshtein_matrix_cache_clear OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) @@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor): local_filename=report_prefix + report_suffix, ) - # Clear cache between files - levenshtein_matrix_cache_clear() if __name__ == "__main__": ocrd_dinglehopper() diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py index dc1f202..be427a8 100644 --- a/qurator/dinglehopper/tests/test_edit_distance.py +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -2,27 +2,7 @@ from __future__ import division, print_function import unicodedata -from .. 
import levenshtein, distance - - -def test_levenshtein(): - assert levenshtein("a", "a") == 0 - assert levenshtein("a", "b") == 1 - assert levenshtein("Foo", "Bar") == 3 - - assert levenshtein("", "") == 0 - assert levenshtein("Foo", "") == 3 - assert levenshtein("", "Foo") == 3 - - assert levenshtein("Foo", "Food") == 1 - assert levenshtein("Fnord", "Food") == 2 - assert levenshtein("Müll", "Mull") == 1 - assert levenshtein("Abstand", "Sand") == 4 - - -def test_levenshtein_other_sequences(): - assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1 - assert levenshtein(["a", "ab"], ["a", "c"]) == 1 +from .. import distance def test_distance(): diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py index 06afbfc..7233cf4 100644 --- a/qurator/dinglehopper/tests/test_editops.py +++ b/qurator/dinglehopper/tests/test_editops.py @@ -1,48 +1,6 @@ import unicodedata -from .. import seq_editops, editops - - -def test_trivial(): - assert seq_editops("abc", "abc") == [] - assert seq_editops("", "") == [] - - -def test_insert(): - assert seq_editops("bc", "abc") == [("insert", 0, 0)] - assert seq_editops("ac", "abc") == [("insert", 1, 1)] - assert seq_editops("ab", "abc") == [("insert", 2, 2)] - assert seq_editops("", "a") == [("insert", 0, 0)] - - -def test_multiple(): - assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)] - - -def test_delete(): - assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)] - assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)] - assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)] - assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)] - assert seq_editops("Foo", "") == [ - ("delete", 0, 0), - ("delete", 1, 0), - ("delete", 2, 0), - ] - assert seq_editops("Foolish", "Foo") == [ - ("delete", 3, 3), - ("delete", 4, 3), - ("delete", 5, 3), - ("delete", 6, 3), - ] - - -def test_ambiguous(): - assert seq_editops("bcd", "abcef") == [ - ("insert", 0, 0), - ("replace", 2, 3), - ("insert", 3, 4), - ] +from .. import editops def test_editops(): diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index dde57b9..64b40d2 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -6,7 +6,7 @@ from multimethod import multimethod import uniseg.wordbreak -from .edit_distance import levenshtein +from rapidfuzz.string_metric import levenshtein from . import ExtractedText diff --git a/requirements.txt b/requirements.txt index 7bb53ac..02bc99f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm +rapidfuzz >= 1.8.1
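
A minimal sketch of the behavior the new grapheme-cluster-aware editops() preserves, mirroring the Schlyñ/Schlym̃ example from the notebook above. This is illustrative only and not part of the diff; it assumes rapidfuzz >= 1.8.1 (as pinned in requirements.txt) and uniseg are installed, and relies on levenshtein_editops accepting sequences other than plain strings, which the change itself depends on by passing lists of grapheme clusters:

# Illustrative example, not part of this change.
import unicodedata

from rapidfuzz.string_metric import levenshtein_editops
from uniseg.graphemecluster import grapheme_clusters

word1 = "Schlyñ"   # ends with LATIN SMALL LETTER N WITH TILDE
word2 = "Schlym̃"  # ends with LATIN SMALL LETTER M + COMBINING TILDE

# On raw strings, rapidfuzz compares code points, so the combining tilde
# shows up as an extra edit (the notebook shows this output):
print(levenshtein_editops(word1, word2))
# [('insert', 5, 5), ('replace', 5, 6)]

# Segmenting into grapheme clusters first -- what the new editops() in
# edit_distance.py does -- yields the single substitution a reader expects.
# Note the positions are grapheme cluster indexes, not character indexes.
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
print(levenshtein_editops(seq1, seq2))
# expected: a single ('replace', ...) at grapheme position 5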