dinglehopper/qurator/dinglehopper/notebooks/Levenshtein.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import unicodedata\n",
    "import inspect"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Levenshtein edit distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def levenshtein_matrix(seq1, seq2):\n",
      "    \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n",
      "\n",
      "    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n",
      "    edit distance.\n",
      "\n",
      "    This algorithm is implemented here because we need an implementation that can work with sequences other than\n",
      "    strings, e.g. lists of grapheme clusters or lists of word strings.\n",
      "    \"\"\"\n",
      "    m = len(seq1)\n",
      "    n = len(seq2)\n",
      "\n",
      "    def from_to(start, stop):\n",
      "        return range(start, stop + 1, 1)\n",
      "\n",
      "    D = np.zeros((m + 1, n + 1), np.int)\n",
      "    D[0, 0] = 0\n",
      "    for i in from_to(1, m):\n",
      "        D[i, 0] = i\n",
      "    for j in from_to(1, n):\n",
      "        D[0, j] = j\n",
      "    for i in from_to(1, m):\n",
      "        for j in from_to(1, n):\n",
      "            D[i, j] = min(\n",
      "                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution\n",
      "                D[i, j - 1] + 1,  # Insertion\n",
      "                D[i - 1, j] + 1   # Deletion\n",
      "            )\n",
      "\n",
      "    return D\n",
      "\n",
      "def levenshtein(seq1, seq2):\n",
      "    \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n",
      "    m = len(seq1)\n",
      "    n = len(seq2)\n",
      "\n",
      "    D = levenshtein_matrix(seq1, seq2)\n",
      "    return D[m, n]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from edit_distance import levenshtein_matrix, levenshtein\n",
    "\n",
    "print(inspect.getsource(levenshtein_matrix))\n",
    "print(inspect.getsource(levenshtein))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert levenshtein('a', 'a') == 0\n",
    "assert levenshtein('a', 'b') == 1\n",
    "assert levenshtein('Foo', 'Bar') == 3\n",
    "assert levenshtein('', '') == 0\n",
    "assert levenshtein('Foo', '') == 3\n",
    "assert levenshtein('', 'Foo') == 3\n",
    "assert levenshtein('Fnord', 'Food') == 2\n",
    "assert levenshtein('Müll', 'Mull') == 1\n",
    "assert levenshtein('Abstand', 'Sand') == 4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This fails for different representations of the \"same\" canonically equivalent string:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n",
    "word2 = unicodedata.normalize('NFD', 'Schlyñ')  # Different, decomposed!\n",
    "levenshtein(word1, word2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same, but for grapheme clusters\n",
    "from uniseg.graphemecluster import grapheme_clusters\n",
    "\n",
    "word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schlyñ')))\n",
    "word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schlyñ')))\n",
    "levenshtein(word1, word2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Better."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's define a edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def distance(s1, s2):\n",
      "    \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
      "\n",
      "    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n",
      "    clusters. This should be the correct way to compare two Unicode strings.\n",
      "    \"\"\"\n",
      "    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n",
      "    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n",
      "    return levenshtein(s1, s2)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from edit_distance import distance\n",
    "print(inspect.getsource(distance))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n",
    "word2 = unicodedata.normalize('NFD', 'Schlyñ')  # Different, decomposed!\n",
    "\n",
    "distance(word1, word2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word1 = 'Schlyñ'\n",
    "word2 = 'Schlym̃'\n",
    "#print('Lengths, as far as Python is concerned:', len(word1), len(word2))  # → gives 6 and 7!\n",
    "distance(word1, word2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Edit operations\n",
    "\n",
    "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('insert', 5, 5), ('replace', 5, 6)]\n"
     ]
    }
   ],
   "source": [
    "import Levenshtein\n",
    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
    "print(Levenshtein.editops(word1, word2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def seq_editops(seq1, seq2):\n",
      "    seq1 = list(seq1)\n",
      "    seq2 = list(seq2)\n",
      "    m = len(seq1)\n",
      "    n = len(seq2)\n",
      "    D = levenshtein_matrix(seq1, seq2)\n",
      "\n",
      "    def _tail_backtrace(i, j, accumulator):\n",
      "        if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n",
      "            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n",
      "        if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n",
      "            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n",
      "        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n",
      "            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n",
      "        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n",
      "            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP\n",
      "        return accumulator\n",
      "\n",
      "    def backtrace(i, j):\n",
      "        result = partial(_tail_backtrace, i, j, [])\n",
      "        while isinstance(result, partial):\n",
      "            result = result()\n",
      "\n",
      "        return result\n",
      "\n",
      "    b = backtrace(m, n)\n",
      "    return b\n",
      "\n",
      "def editops(word1, word2):\n",
      "    # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n",
      "    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n",
      "    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n",
      "    return seq_editops(word1, word2)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from edit_distance import seq_editops, editops\n",
    "print(inspect.getsource(seq_editops))\n",
    "print(inspect.getsource(editops))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('replace', 2, 2)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "editops('Foo', 'Fon')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('insert', 4, 4)]\n",
      "[('insert', 4, 4)]\n"
     ]
    }
   ],
   "source": [
    "print(editops('Käptn', 'Käpt\\'n'))\n",
    "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('delete', 6, 6)]\n",
      "[('delete', 6, 6)]\n"
     ]
    }
   ],
   "source": [
    "print(editops('Delete something', 'Deletesomething'))\n",
    "print(Levenshtein.editops('Delete something', 'Deletesomething'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n",
      "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
     ]
    }
   ],
   "source": [
    "print(editops('A more difficult example', 'Amore difficült  exampl'))\n",
    "print(Levenshtein.editops('A more difficult example', 'Amore difficült  exampl'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "XXX Note that our implementation returns different positions here for the 'insert'. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's try it with a difficult example that needs grapheme cluster handling:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('replace', 5, 5)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word1 = 'Schlyñ'  # with LATIN SMALL LETTER N WITH TILDE\n",
    "word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE\n",
    "\n",
    "editops(word1, word2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "🎉"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Character error rate\n",
    "\n",
    "[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:\n",
    "\n",
    "$$\n",
    "\\text{CER} = \\frac{i + s + d}{n}\n",
    "$$\n",
    "\n",
    "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because our edit distance is equal to $i + s + d$, we can thus define:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def character_error_rate(reference, compared):\n",
      "    d = distance(reference, compared)\n",
      "    if d == 0:\n",
      "        return 0\n",
      "\n",
      "    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n",
      "    if n == 0:\n",
      "        return float('inf')\n",
      "\n",
      "    return d/n\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from character_error_rate import character_error_rate\n",
    "print(inspect.getsource(character_error_rate))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert character_error_rate('Foo', 'Bär') == 3/3\n",
    "assert character_error_rate('Fnord', 'Food') == 2/5\n",
    "assert character_error_rate('Food', 'Fnord') == 2/4\n",
    "assert character_error_rate('Schlyñ', 'Schlym̃') == 1/6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.\n",
    "gt = \"\"\"115 über die vielen Sorgen wegen deſſelben vergaß Hartkopf, der Frau Amtmännin das ver⸗ ſprochene zu überliefern. — Ein Erpreſſer wurde an ihn abgeſchickt, um ihn ums Him⸗ melswillen zu ſagen, daß er das Verſprochene gleich den Augenblick überbringen möchte, die Frau Amtmännin hätte ſich auf ihn verlaſſen, und nun wüßte ſie nicht, was ſie anfangen ſollte. Den Augenblick ſollte er kommen, ſonſt vergieng ſie in ihrer Angſt. — Die Gäſte wären ſchon angekommen, und es fehlte ihr doch noch an allem. — Hartkopf mußte ſich erſt beſinnen, und endlich nach langem Nachdenken fiel es ihm erſt wieder ein. — Er langte den Zettel aus dem Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das, was da wäre, herbeyſchaffen möchte. Jndeß mangelten doch einige Generalia, die alſo wegfielen. — Hartkopf gieng ſelbſt mit und überbrachte es. — „Herr Jemine! er böſer Mann!“ — ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der voll gedrückt, gerüttelt und überﬂüſſig in ihren Schoos gegeben werden ſollte, mit Augen voller Freu⸗ H 2\"\"\"\n",
    "tess = \"\"\"emm unmit; Lis Übey die vielen Sorgen wegen\" deſſelben vergaß Hartkopf, der Frau! Amimännin das- ver ſprochene zu überliefeen. ==\" Ein Epypreſſer- wurde an ihn abgeſchieet', um' ihn ums Hime melswillen zu ſagen, \"daß er das Verſyrochene leich den Augenblick \"überbringen möchte, die Frau Amtmännin hätte ſich auf ihn veriaſſen, und nun wüßte ſie- nicht, was ſie anfangen ſollte, =! 'Den Augenblick ſollte \"er kommen, ſonſt vergieng ſie in ihrer Angſt. == Die Säuaſie- wären. ſchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mußte ſich erſt TIM und endlich mach langem Rachdenken fiel es ihm erſt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das , was da wäre, herbeyſchaffen mschte. ZIudeß „mangelten doch einige Generalia, die alſo wegfielen. == ' Havrkopf gieng ſelbſt mit und überbrachte es == | „Herr Jemine! er böſer Mann 1-2 ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der - voll gedrückt, gerüttelt und überfirfſig in ihren Ss HEILE werden ſolite, mit Augen voller EE) Fron?\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1190\n"
     ]
    }
   ],
   "source": [
    "print('{:.4f}'.format(character_error_rate(gt, tess)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1190253045923149"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levenshtein(gt, tess)/len(gt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "for w in gt, tess:\n",
    "    for g in grapheme_clusters(w):\n",
    "        assert len(g) == 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Maybe ocrevalUAtion doesn't count whitespace?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'115überdievielenSorgenwegendeſſelbenvergaßHartkopf,derFrauAmtmännindasver⸗ſprochenezuüberliefern.—EinErpreſſerwurdeanihnabgeſchickt,umihnumsHim⸗melswillenzuſagen,daßerdasVerſprochenegleichdenAugenblicküberbringenmöchte,dieFrauAmtmänninhätteſichaufihnverlaſſen,undnunwüßteſienicht,wasſieanfangenſollte.DenAugenblickſollteerkommen,ſonſtvergiengſieinihrerAngſt.—DieGäſtewärenſchonangekommen,undesfehlteihrdochnochanallem.—Hartkopfmußteſicherſtbeſinnen,undendlichnachlangemNachdenkenfielesihmerſtwiederein.—ErlangtedenZettelausdemAccisbucheheraus,undſagteſeinerFrau,daßſiedas,wasdawäre,herbeyſchaffenmöchte.JndeßmangeltendocheinigeGeneralia,diealſowegfielen.—Hartkopfgiengſelbſtmitundüberbrachtees.—„HerrJemine!erböſerMann!“—ſchrieihmdieFrauAmtmänninentgegen,undſchlugihnaufdieSchulterundblicktedenKorb,dervollgedrückt,gerütteltundüberﬂüſſiginihrenSchoosgegebenwerdenſollte,mitAugenvollerFreu⸗H2'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def remove_whitespace(s):\n",
    "    return s.replace(' ', '')\n",
    "remove_whitespace(gt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1324\n"
     ]
    }
   ],
   "source": [
    "print('{:.4f}'.format(character_error_rate(remove_whitespace(gt), remove_whitespace(tess))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now it's larger than ocrevalUAtion 🤷‍♂️"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word error rate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word segmentation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Naively split on spaces.\n",
    "\n",
    "(Note: ocrevalUAtion does confusing things here, like the Token splitting in a hash function, with an empty pattern?!)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def naive_word_split(s):\n",
    "    return s.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "example_text = \"The quick (“brown”) fox can't jump 32.3 feet, right?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['The',\n",
       " 'quick',\n",
       " '(“brown”)',\n",
       " 'fox',\n",
       " \"can't\",\n",
       " 'jump',\n",
       " '32.3',\n",
       " 'feet,',\n",
       " 'right?']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "naive_word_split(example_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's do it the Unicode way (Appendix UAX #29 on Unicode Text Segmentation): Split on word boundaries using the uniseg libraries and ignore words that contain only whitespace, punctuation \"and similar characters\":"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def words(s):\n",
      "    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
      "    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
      "    old_word_break = uniseg.wordbreak.word_break\n",
      "\n",
      "    def new_word_break(c, index=0):\n",
      "        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area\n",
      "            return 'ALetter'\n",
      "        else:\n",
      "            return old_word_break(c, index)\n",
      "    uniseg.wordbreak.word_break = new_word_break\n",
      "\n",
      "    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
      "    def unwanted(c):\n",
      "\n",
      "        # See https://www.fileformat.info/info/unicode/category/index.htm\n",
      "        # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
      "        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n",
      "        unwanted_subcategories = 'Cc', 'Cf'\n",
      "\n",
      "        subcat = unicodedata.category(c)\n",
      "        cat = subcat[0]\n",
      "        return cat in unwanted_categories or subcat in unwanted_subcategories\n",
      "\n",
      "    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
      "    # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
      "    for word in uniseg.wordbreak.words(s):\n",
      "        if all(unwanted(c) for c in word):\n",
      "            pass\n",
      "        else:\n",
      "            yield word\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['The', 'quick', 'brown', 'fox', \"can't\", 'jump', '32.3', 'feet', 'right']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from word_error_rate import words\n",
    "print(inspect.getsource(words))\n",
    "\n",
    "list(words(example_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Der',\n",
       " 'schnelle',\n",
       " 'braune',\n",
       " 'Fuchs',\n",
       " 'kann',\n",
       " 'keine',\n",
       " '3,14',\n",
       " 'Meter',\n",
       " 'springen',\n",
       " 'oder']"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(words('Dies ist ein Beispielsatz. Oh, ja.'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['我', '很', '高', '興', '跟', '你', '見', '面']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(words('我很高興跟你見面'))  # \"Pleased to meet you\" in Mandarin, Traditional writing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['医', '者', 'を', '呼', 'ん', 'で', 'く', 'だ', 'さ', 'い']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(words('医者を呼んでください。'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word error rate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the word error rate, normalize again and compare sequences of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def word_error_rate(reference, compared):\n",
      "    if isinstance(reference, str):\n",
      "        reference_seq = list(words_normalized(reference))\n",
      "        compared_seq = list(words_normalized(compared))\n",
      "    else:\n",
      "        reference_seq = list(reference)\n",
      "        compared_seq = list(compared)\n",
      "\n",
      "    d = levenshtein(reference_seq, compared_seq)\n",
      "    if d == 0:\n",
      "        return 0\n",
      "\n",
      "    n = len(reference_seq)\n",
      "    if n == 0:\n",
      "        return float('inf')\n",
      "\n",
      "    return d / n\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from word_error_rate import word_error_rate\n",
    "print(inspect.getsource(word_error_rate))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.25"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.75"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_error_rate('Fnord ist verdampfter Kräutertee!', 'Fnòrd ist verdmpfter Krautertee.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.18823529411764706"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_error_rate(gt, tess)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a little larger than the ocrevalUAtion result!"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}