📝 dinglehopper: Update Levenshtein notebook

2025-08-17 05:30:04 +02:00 · 2021-10-22 16:58:40 +02:00 · 2021-10-22 16:58:40 +02:00 · 06ea38449c
commit 06ea38449c
parent 3ee688001a
1 changed files with 105 additions and 196 deletions
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
@ -18,62 +18,20 @@
    "# Levenshtein edit distance"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "def levenshtein_matrix(seq1, seq2):\n",
-      "    \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n",
-      "\n",
-      "    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n",
-      "    edit distance.\n",
-      "\n",
-      "    This algorithm is implemented here because we need an implementation that can work with sequences other than\n",
-      "    strings, e.g. lists of grapheme clusters or lists of word strings.\n",
-      "    \"\"\"\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "\n",
-      "    def from_to(start, stop):\n",
-      "        return range(start, stop + 1, 1)\n",
-      "\n",
-      "    D = np.zeros((m + 1, n + 1), np.int)\n",
-      "    D[0, 0] = 0\n",
-      "    for i in from_to(1, m):\n",
-      "        D[i, 0] = i\n",
-      "    for j in from_to(1, n):\n",
-      "        D[0, j] = j\n",
-      "    for i in from_to(1, m):\n",
-      "        for j in from_to(1, n):\n",
-      "            D[i, j] = min(\n",
-      "                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution\n",
-      "                D[i, j - 1] + 1,  # Insertion\n",
-      "                D[i - 1, j] + 1   # Deletion\n",
-      "            )\n",
-      "\n",
-      "    return D\n",
-      "\n",
-      "def levenshtein(seq1, seq2):\n",
-      "    \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "\n",
-      "    D = levenshtein_matrix(seq1, seq2)\n",
-      "    return D[m, n]\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "from edit_distance import levenshtein_matrix, levenshtein\n",
-    "\n",
-    "print(inspect.getsource(levenshtein_matrix))\n",
-    "print(inspect.getsource(levenshtein))"
+    "from rapidfuzz.string_metric import levenshtein"
   ]
  },
  {
@ -170,21 +128,23 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def distance(s1, s2):\n",
+      "@multimethod\n",
+      "def distance(s1: str, s2: str):\n",
      "    \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
      "\n",
-      "    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n",
-      "    clusters. This should be the correct way to compare two Unicode strings.\n",
+      "    Note that this is different from levenshtein() as this function knows about Unicode\n",
+      "    normalization and grapheme clusters. This should be the correct way to compare two\n",
+      "    Unicode strings.\n",
      "    \"\"\"\n",
-      "    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n",
-      "    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n",
-      "    return levenshtein(s1, s2)\n",
+      "    seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n",
+      "    seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n",
+      "    return levenshtein(seq1, seq2)\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from edit_distance import distance\n",
+    "from qurator.dinglehopper.edit_distance import distance\n",
    "print(inspect.getsource(distance))"
   ]
  },
@ -247,8 +207,7 @@
   "source": [
    "# Edit operations\n",
    "\n",
-    "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n",
-    "\n"
+    "Both python-Levenshtein and RapidFuzz support backtracing, i.e. producing a sequence of edit operations that transforms one word into another:"
   ]
  },
  {
@ -257,32 +216,20 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('insert', 5, 5), ('replace', 5, 6)]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "[('replace', 2, 2)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
-    "import Levenshtein\n",
-    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
-    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
-    "print(Levenshtein.editops(word1, word2))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)"
+    "from rapidfuzz.string_metric import levenshtein_editops as editops\n",
+    "\n",
+    "editops('Foo', 'Fon')"
   ]
  },
  {
@ -294,47 +241,12 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def seq_editops(seq1, seq2):\n",
-      "    seq1 = list(seq1)\n",
-      "    seq2 = list(seq2)\n",
-      "    m = len(seq1)\n",
-      "    n = len(seq2)\n",
-      "    D = levenshtein_matrix(seq1, seq2)\n",
-      "\n",
-      "    def _tail_backtrace(i, j, accumulator):\n",
-      "        if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n",
-      "        if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n",
-      "        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n",
-      "        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n",
-      "            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP\n",
-      "        return accumulator\n",
-      "\n",
-      "    def backtrace(i, j):\n",
-      "        result = partial(_tail_backtrace, i, j, [])\n",
-      "        while isinstance(result, partial):\n",
-      "            result = result()\n",
-      "\n",
-      "        return result\n",
-      "\n",
-      "    b = backtrace(m, n)\n",
-      "    return b\n",
-      "\n",
-      "def editops(word1, word2):\n",
-      "    # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n",
-      "    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n",
-      "    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n",
-      "    return seq_editops(word1, word2)\n",
-      "\n"
+      "[('insert', 4, 4)]\n"
     ]
    }
   ],
   "source": [
-    "from edit_distance import seq_editops, editops\n",
-    "print(inspect.getsource(seq_editops))\n",
-    "print(inspect.getsource(editops))"
+    "print(editops('Käptn', 'Käpt\\'n'))"
   ]
  },
  {
@ -343,18 +255,15 @@
   "metadata": {},
   "outputs": [
    {
-     "data": {
-      "text/plain": [
-       "[('replace', 2, 2)]"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('delete', 6, 6)]\n"
+     ]
    }
   ],
   "source": [
-    "editops('Foo', 'Fon')"
+    "print(editops('Delete something', 'Deletesomething'))"
   ]
  },
  {
@ -366,14 +275,19 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[('insert', 4, 4)]\n",
-      "[('insert', 4, 4)]\n"
+      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
     ]
    }
   ],
   "source": [
-    "print(editops('Käptn', 'Käpt\\'n'))\n",
-    "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))"
+    "print(editops('A more difficult example', 'Amore difficült  exampl'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try it with a difficult example that needs grapheme cluster handling:"
   ]
  },
  {
@ -382,17 +296,28 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('delete', 6, 6)]\n",
-      "[('delete', 6, 6)]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "[('insert', 5, 5), ('replace', 5, 6)]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
-    "print(editops('Delete something', 'Deletesomething'))\n",
-    "print(Levenshtein.editops('Delete something', 'Deletesomething'))"
+    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
+    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
+    "\n",
+    "editops(word1, word2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
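+    "That doesn't look right: RapidFuzz compares individual code points rather than grapheme clusters, so it reports two operations instead of one. Let's redefine it with grapheme cluster support:"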
   ]
  },
  {
@ -404,28 +329,22 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n",
-      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
+      "def editops(word1, word2):\n",
+      "    \"\"\"\n",
+      "    Return sequence of edit operations transforming one string to another.\n",
+      "\n",
+      "    Note that this returns indices to the _grapheme clusters_, not characters!\n",
+      "    \"\"\"\n",
+      "    word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n",
+      "    word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n",
+      "    return levenshtein_editops(word1, word2)\n",
+      "\n"
     ]
    }
   ],
   "source": [
-    "print(editops('A more difficult example', 'Amore difficült  exampl'))\n",
-    "print(Levenshtein.editops('A more difficult example', 'Amore difficült  exampl'))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "XXX Note that our implementation returns different positions here for the 'insert'. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's try it with a difficult example that needs grapheme cluster handling:"
+    "from qurator.dinglehopper.edit_distance import editops\n",
+    "print(inspect.getsource(editops))"
   ]
  },
  {
@ -455,7 +374,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "🎉"
+    "🎉\n",
+    "\n",
+    "Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!"
   ]
  },
  {
@ -489,22 +410,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def character_error_rate(reference, compared):\n",
-      "    d = distance(reference, compared)\n",
-      "    if d == 0:\n",
-      "        return 0\n",
+      "def character_error_rate(reference, compared) -> float:\n",
+      "    \"\"\"\n",
+      "    Compute character error rate.\n",
      "\n",
-      "    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n",
-      "    if n == 0:\n",
-      "        return float('inf')\n",
-      "\n",
-      "    return d/n\n",
+      "    :return: character error rate\n",
+      "    \"\"\"\n",
+      "    cer, _ = character_error_rate_n(reference, compared)\n",
+      "    return cer\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from character_error_rate import character_error_rate\n",
+    "from qurator.dinglehopper.character_error_rate import character_error_rate\n",
    "print(inspect.getsource(character_error_rate))"
   ]
  },
@ -732,16 +651,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def words(s):\n",
+      "@multimethod\n",
+      "def words(s: str):\n",
+      "    \"\"\"Extract words from a string\"\"\"\n",
+      "\n",
      "    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
      "    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
      "    old_word_break = uniseg.wordbreak.word_break\n",
      "\n",
      "    def new_word_break(c, index=0):\n",
      "        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area\n",
-      "            return 'ALetter'\n",
+      "            return \"ALetter\"\n",
      "        else:\n",
      "            return old_word_break(c, index)\n",
+      "\n",
      "    uniseg.wordbreak.word_break = new_word_break\n",
      "\n",
      "    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
@ -749,8 +672,8 @@
      "\n",
      "        # See https://www.fileformat.info/info/unicode/category/index.htm\n",
      "        # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
-      "        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n",
-      "        unwanted_subcategories = 'Cc', 'Cf'\n",
+      "        unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n",
+      "        unwanted_subcategories = \"Cc\", \"Cf\"\n",
      "\n",
      "        subcat = unicodedata.category(c)\n",
      "        cat = subcat[0]\n",
@ -778,7 +701,7 @@
    }
   ],
   "source": [
-    "from word_error_rate import words\n",
+    "from qurator.dinglehopper.word_error_rate import words\n",
    "print(inspect.getsource(words))\n",
    "\n",
    "list(words(example_text))"
@ -905,29 +828,15 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "def word_error_rate(reference, compared):\n",
-      "    if isinstance(reference, str):\n",
-      "        reference_seq = list(words_normalized(reference))\n",
-      "        compared_seq = list(words_normalized(compared))\n",
-      "    else:\n",
-      "        reference_seq = list(reference)\n",
-      "        compared_seq = list(compared)\n",
-      "\n",
-      "    d = levenshtein(reference_seq, compared_seq)\n",
-      "    if d == 0:\n",
-      "        return 0\n",
-      "\n",
-      "    n = len(reference_seq)\n",
-      "    if n == 0:\n",
-      "        return float('inf')\n",
-      "\n",
-      "    return d / n\n",
+      "def word_error_rate(reference, compared) -> float:\n",
+      "    wer, _ = word_error_rate_n(reference, compared)\n",
+      "    return wer\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from word_error_rate import word_error_rate\n",
+    "from qurator.dinglehopper.word_error_rate import word_error_rate\n",
    "print(inspect.getsource(word_error_rate))"
   ]
  },
@ -1002,9 +911,9 @@
 "metadata": {
  "hide_input": false,
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "dinglehopper-github",
   "language": "python",
-   "name": "python3"
+   "name": "dinglehopper-github"
  },
  "language_info": {
   "codemirror_mode": {
@ -1016,7 +925,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.12"
  },
  "toc": {
   "base_numbering": 1,