Merge branch 'rapidfuzz'

2026-07-29 15:02:33 +02:00 · 2021-10-22 18:19:58 +02:00 · 2021-10-22 18:19:58 +02:00 · dea0c53f88
commit dea0c53f88
parent 249787686f 06ea38449c
8 changed files with 116 additions and 371 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -1,4 +1,5 @@
 from .edit_distance import *
+from rapidfuzz.string_metric import levenshtein_editops


 def align(t1, t2):
@ -12,7 +13,7 @@ def seq_align(s1, s2):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
    i = 0
    j = 0

--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
+from rapidfuzz.string_metric import levenshtein, levenshtein_editops

 from .extracted_text import ExtractedText
 from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
@multimethod
 def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
    return distance(s1.text, s2.text)


-def seq_editops(seq1, seq2):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
 def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.
@ -141,4 +40,4 @@ def editops(word1, word2):
    """
    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    return levenshtein_editops(word1, word2)
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
@ -18,62 +18,20 @@
    "# Levenshtein edit distance"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "def levenshtein_matrix(seq1, seq2):\n",
-      "    \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n",
-      "\n",
-      "    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n",
-      "    edit distance.\n",
-      "\n",
-      "    This algorithm is implemented here because we need an implementation that can work with sequences other than\n",
-      "    strings, e.g. lists of grapheme clusters or lists of word strings.\n",
-      "    \"\"\"\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "\n",
-      "    def from_to(start, stop):\n",
-      "        return range(start, stop + 1, 1)\n",
-      "\n",
-      "    D = np.zeros((m + 1, n + 1), np.int)\n",
-      "    D[0, 0] = 0\n",
-      "    for i in from_to(1, m):\n",
-      "        D[i, 0] = i\n",
-      "    for j in from_to(1, n):\n",
-      "        D[0, j] = j\n",
-      "    for i in from_to(1, m):\n",
-      "        for j in from_to(1, n):\n",
-      "            D[i, j] = min(\n",
-      "                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution\n",
-      "                D[i, j - 1] + 1,  # Insertion\n",
-      "                D[i - 1, j] + 1   # Deletion\n",
-      "            )\n",
-      "\n",
-      "    return D\n",
-      "\n",
-      "def levenshtein(seq1, seq2):\n",
-      "    \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "\n",
-      "    D = levenshtein_matrix(seq1, seq2)\n",
-      "    return D[m, n]\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "from edit_distance import levenshtein_matrix, levenshtein\n",
-    "\n",
-    "print(inspect.getsource(levenshtein_matrix))\n",
-    "print(inspect.getsource(levenshtein))"
+    "from rapidfuzz.string_metric import levenshtein"
   ]
  },
  {
@ -170,21 +128,23 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def distance(s1, s2):\n",
+      "@multimethod\n",
+      "def distance(s1: str, s2: str):\n",
      "    \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
      "\n",
-      "    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n",
-      "    clusters. This should be the correct way to compare two Unicode strings.\n",
+      "    Note that this is different from levenshtein() as this function knows about Unicode\n",
+      "    normalization and grapheme clusters. This should be the correct way to compare two\n",
+      "    Unicode strings.\n",
      "    \"\"\"\n",
-      "    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n",
-      "    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n",
-      "    return levenshtein(s1, s2)\n",
+      "    seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n",
+      "    seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n",
+      "    return levenshtein(seq1, seq2)\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from edit_distance import distance\n",
+    "from qurator.dinglehopper.edit_distance import distance\n",
    "print(inspect.getsource(distance))"
   ]
  },
@ -247,8 +207,7 @@
   "source": [
    "# Edit operations\n",
    "\n",
-    "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n",
-    "\n"
+    "Both python-Levenshtein and RapidFuzz support backtracing, i.e. giving a sequence of edit operations that transforms one word into another:"
   ]
  },
  {
@ -257,32 +216,20 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('insert', 5, 5), ('replace', 5, 6)]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "[('replace', 2, 2)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
-    "import Levenshtein\n",
-    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
-    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
-    "print(Levenshtein.editops(word1, word2))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)"
+    "from rapidfuzz.string_metric import levenshtein_editops as editops\n",
+    "\n",
+    "editops('Foo', 'Fon')"
   ]
  },
  {
@ -294,47 +241,12 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def seq_editops(seq1, seq2):\n",
-      "    seq1 = list(seq1)\n",
-      "    seq2 = list(seq2)\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "    D = levenshtein_matrix(seq1, seq2)\n",
-      "\n",
-      "    def _tail_backtrace(i, j, accumulator):\n",
-      "        if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n",
-      "        if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n",
-      "        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n",
-      "        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP\n",
-      "        return accumulator\n",
-      "\n",
-      "    def backtrace(i, j):\n",
-      "        result = partial(_tail_backtrace, i, j, [])\n",
-      "        while isinstance(result, partial):\n",
-      "            result = result()\n",
-      "\n",
-      "        return result\n",
-      "\n",
-      "    b = backtrace(m, n)\n",
-      "    return b\n",
-      "\n",
-      "def editops(word1, word2):\n",
-      "    # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n",
-      "    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n",
-      "    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n",
-      "    return seq_editops(word1, word2)\n",
-      "\n"
+      "[('insert', 4, 4)]\n"
     ]
    }
   ],
   "source": [
-    "from edit_distance import seq_editops, editops\n",
-    "print(inspect.getsource(seq_editops))\n",
-    "print(inspect.getsource(editops))"
+    "print(editops('Käptn', 'Käpt\\'n'))"
   ]
  },
  {
@ -343,18 +255,15 @@
   "metadata": {},
   "outputs": [
    {
-     "data": {
-      "text/plain": [
-       "[('replace', 2, 2)]"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('delete', 6, 6)]\n"
+     ]
    }
   ],
   "source": [
-    "editops('Foo', 'Fon')"
+    "print(editops('Delete something', 'Deletesomething'))"
   ]
  },
  {
@ -366,14 +275,19 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[('insert', 4, 4)]\n",
-      "[('insert', 4, 4)]\n"
+      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
     ]
    }
   ],
   "source": [
-    "print(editops('Käptn', 'Käpt\\'n'))\n",
-    "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))"
+    "print(editops('A more difficult example', 'Amore difficült  exampl'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try it with a difficult example that needs grapheme cluster handling:"
   ]
  },
  {
@ -382,17 +296,28 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('delete', 6, 6)]\n",
-      "[('delete', 6, 6)]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "[('insert', 5, 5), ('replace', 5, 6)]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
-    "print(editops('Delete something', 'Deletesomething'))\n",
-    "print(Levenshtein.editops('Delete something', 'Deletesomething'))"
+    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
+    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
+    "\n",
+    "editops(word1, word2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "That doesn't look right, let's redefine it with grapheme cluster support:"
   ]
  },
  {
@ -404,28 +329,22 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n",
-      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
+      "def editops(word1, word2):\n",
+      "    \"\"\"\n",
+      "    Return sequence of edit operations transforming one string to another.\n",
+      "\n",
+      "    Note that this returns indices to the _grapheme clusters_, not characters!\n",
+      "    \"\"\"\n",
+      "    word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n",
+      "    word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n",
+      "    return levenshtein_editops(word1, word2)\n",
+      "\n"
     ]
    }
   ],
   "source": [
-    "print(editops('A more difficult example', 'Amore difficült  exampl'))\n",
-    "print(Levenshtein.editops('A more difficult example', 'Amore difficült  exampl'))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "XXX Note that our implementation returns different positions here for the 'insert'. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's try it with a difficult example that needs grapheme cluster handling:"
+    "from qurator.dinglehopper.edit_distance import editops\n",
+    "print(inspect.getsource(editops))"
   ]
  },
  {
@ -455,7 +374,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "🎉"
+    "🎉\n",
+    "\n",
+    "Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!"
   ]
  },
  {
@ -489,22 +410,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def character_error_rate(reference, compared):\n",
-      "    d = distance(reference, compared)\n",
-      "    if d == 0:\n",
-      "        return 0\n",
+      "def character_error_rate(reference, compared) -> float:\n",
+      "    \"\"\"\n",
+      "    Compute character error rate.\n",
      "\n",
-      "    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n",
-      "    if n == 0:\n",
-      "        return float('inf')\n",
-      "\n",
-      "    return d/n\n",
+      "    :return: character error rate\n",
+      "    \"\"\"\n",
+      "    cer, _ = character_error_rate_n(reference, compared)\n",
+      "    return cer\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from character_error_rate import character_error_rate\n",
+    "from qurator.dinglehopper.character_error_rate import character_error_rate\n",
    "print(inspect.getsource(character_error_rate))"
   ]
  },
@ -732,16 +651,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def words(s):\n",
+      "@multimethod\n",
+      "def words(s: str):\n",
+      "    \"\"\"Extract words from a string\"\"\"\n",
+      "\n",
      "    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
      "    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
      "    old_word_break = uniseg.wordbreak.word_break\n",
      "\n",
      "    def new_word_break(c, index=0):\n",
      "        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area\n",
-      "            return 'ALetter'\n",
+      "            return \"ALetter\"\n",
      "        else:\n",
      "            return old_word_break(c, index)\n",
+      "\n",
      "    uniseg.wordbreak.word_break = new_word_break\n",
      "\n",
      "    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
@ -749,8 +672,8 @@
      "\n",
      "        # See https://www.fileformat.info/info/unicode/category/index.htm\n",
      "        # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
-      "        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n",
-      "        unwanted_subcategories = 'Cc', 'Cf'\n",
+      "        unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n",
+      "        unwanted_subcategories = \"Cc\", \"Cf\"\n",
      "\n",
      "        subcat = unicodedata.category(c)\n",
      "        cat = subcat[0]\n",
@ -778,7 +701,7 @@
    }
   ],
   "source": [
-    "from word_error_rate import words\n",
+    "from qurator.dinglehopper.word_error_rate import words\n",
    "print(inspect.getsource(words))\n",
    "\n",
    "list(words(example_text))"
@ -905,29 +828,15 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def word_error_rate(reference, compared):\n",
-      "    if isinstance(reference, str):\n",
-      "        reference_seq = list(words_normalized(reference))\n",
-      "        compared_seq = list(words_normalized(compared))\n",
-      "    else:\n",
-      "        reference_seq = list(reference)\n",
-      "        compared_seq = list(compared)\n",
-      "\n",
-      "    d = levenshtein(reference_seq, compared_seq)\n",
-      "    if d == 0:\n",
-      "        return 0\n",
-      "\n",
-      "    n = len(reference_seq)\n",
-      "    if n == 0:\n",
-      "        return float('inf')\n",
-      "\n",
-      "    return d / n\n",
+      "def word_error_rate(reference, compared) -> float:\n",
+      "    wer, _ = word_error_rate_n(reference, compared)\n",
+      "    return wer\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from word_error_rate import word_error_rate\n",
+    "from qurator.dinglehopper.word_error_rate import word_error_rate\n",
    "print(inspect.getsource(word_error_rate))"
   ]
  },
@ -1002,9 +911,9 @@
 "metadata": {
  "hide_input": false,
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "dinglehopper-github",
   "language": "python",
-   "name": "python3"
+   "name": "dinglehopper-github"
  },
  "language_info": {
   "codemirror_mode": {
@ -1016,7 +925,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.12"
  },
  "toc": {
   "base_numbering": 1,
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string

 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear

 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))

@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                    local_filename=report_prefix + report_suffix,
                )

-            # Clear cache between files
-            levenshtein_matrix_cache_clear()

 if __name__ == "__main__":
    ocrd_dinglehopper()
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
@ -2,27 +2,7 @@ from __future__ import division, print_function

 import unicodedata

-from .. import levenshtein, distance
-
-
-def test_levenshtein():
-    assert levenshtein("a", "a") == 0
-    assert levenshtein("a", "b") == 1
-    assert levenshtein("Foo", "Bar") == 3
-
-    assert levenshtein("", "") == 0
-    assert levenshtein("Foo", "") == 3
-    assert levenshtein("", "Foo") == 3
-
-    assert levenshtein("Foo", "Food") == 1
-    assert levenshtein("Fnord", "Food") == 2
-    assert levenshtein("Müll", "Mull") == 1
-    assert levenshtein("Abstand", "Sand") == 4
-
-
-def test_levenshtein_other_sequences():
-    assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
-    assert levenshtein(["a", "ab"], ["a", "c"]) == 1
+from .. import distance


 def test_distance():
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@ -1,48 +1,6 @@
 import unicodedata

-from .. import seq_editops, editops
-
-
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
-
-
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
-
-
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
-
-
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
-
-
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
+from .. import editops


 def test_editops():
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@ -6,7 +6,7 @@ from multimethod import multimethod

 import uniseg.wordbreak

-from .edit_distance import levenshtein
+from rapidfuzz.string_metric import levenshtein
 from . import ExtractedText


--- a/requirements.txt
+++ b/requirements.txt
@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
+rapidfuzz >= 1.8.1