From e371da899e65e53e7fa0ca0923000d270356fae9 Mon Sep 17 00:00:00 2001
From: Benjamin Rosemann
Date: Mon, 16 Nov 2020 12:06:44 +0100
Subject: [PATCH] Switch from custom Levenshtein to python-Levenshtein

As the distance and editops calculations are a performance bottleneck in
this application, we replaced the custom Levenshtein implementation with
the C implementation from the python-Levenshtein package.
We now also have separate entry points for texts with and without
Unicode normalization, because the normalization can be done more
efficiently once during preprocessing.

---
 qurator/dinglehopper/character_error_rate.py |    4 +-
 qurator/dinglehopper/edit_distance.py        |  201 ++--
 .../dinglehopper/notebooks/Levenshtein.ipynb | 1037 -----------------
 qurator/dinglehopper/ocrd_cli.py             |    3 -
 .../dinglehopper/tests/test_edit_distance.py |   20 +-
 qurator/dinglehopper/tests/test_editops.py   |   24 +-
 requirements.txt                             |    1 +
 7 files changed, 89 insertions(+), 1201 deletions(-)
 delete mode 100644 qurator/dinglehopper/notebooks/Levenshtein.ipynb

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 0c3ef7d..2d663b1 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -6,7 +6,7 @@ from typing import Tuple
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 
-from .edit_distance import distance
+from .edit_distance import distance_unicode
 from .extracted_text import ExtractedText
 
 
@@ -18,7 +18,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
 
     :return: character error rate and length of the reference
     """
-    d = distance(reference, compared)
+    d = distance_unicode(reference, compared)
     n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
 
     if d == 0:
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index b906fa9..5b7f624 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,183 +1,136 @@
 from __future__ import division, print_function
 
 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple
 
-import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm
 
 from .extracted_text import ExtractedText
-from .config import Config
 
 
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
+@multimethod
+def distance_unicode(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
 
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
+ Note that this is different from distance() as this function knows about Unicode + normalization and grapheme clusters. - This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead. + This should be the correct way to compare two Unicode strings. """ - m = len(seq1) - n = len(seq2) + s1, s2 = transform_unicode(s1, s2) + return distance(s1, s2) - def from_to(start, stop): - return range(start, stop + 1, 1) - D = np.zeros((m + 1, n + 1), np.int) - D[0, 0] = 0 - for i in from_to(1, m): - D[i, 0] = i - for j in from_to(1, n): - D[0, j] = j - for i in tqdm(from_to(1, m), disable=not Config.progress): - for j in from_to(1, n): - D[i, j] = min( - D[i - 1, j - 1] - + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution - D[i, j - 1] + 1, # Insertion - D[i - 1, j] + 1, # Deletion - ) - - return D +@multimethod +def distance_unicode(s1: ExtractedText, s2: ExtractedText): + """Compute the Levenshtein edit distance between two Unicode strings + Note that this is different from distance() as this function knows about Unicode + normalization and grapheme clusters. -def levenshtein(seq1, seq2): - """Compute the Levenshtein edit distance between two sequences""" - m = len(seq1) - n = len(seq2) + This should be the correct way to compare two Unicode strings. + """ + return distance_unicode(s1.text, s2.text) - D = levenshtein_matrix(seq1, seq2) - return D[m, n] +@multimethod +def distance(l1: List, l2: List): + """Compute the Levenshtein edit distance between two lists. -def levenshtein_matrix_cache_clear(): - """Clear internal Levenshtein matrix cache. + Also see `distance_unicode()`. - You want to do this between different input file pairs to decrease memory - usage by not caching results from prior input files. + The difference is that this implementation does not care about grapheme clusters or + unicode normalization, assuming that this already has been done in preprocessing. """ - _levenshtein_matrix.cache_clear() + s1, s2 = transform_lists(l1, l2) + return c_distance(s1, s2) @multimethod def distance(s1: str, s2: str): - """Compute the Levenshtein edit distance between two Unicode strings + """Compute the Levenshtein edit distance between two strings. - Note that this is different from levenshtein() as this function knows about Unicode - normalization and grapheme clusters. + Also see `distance_unicode()`. - This should be the correct way to compare two Unicode strings. + The difference is that this implementation does not care about grapheme clusters or + unicode normalization, assuming that this already has been done in preprocessing. """ - seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) - seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) - if any(len(s) > 1 for s in chain(seq1, seq2)): - return distance(seq1, seq2) - else: - return distance_fast("".join(seq1), "".join(seq2)) + return c_distance(s1, s2) @multimethod def distance(s1: ExtractedText, s2: ExtractedText): - return distance(s1.text, s2.text) - - -@multimethod -def distance(s1: List, s2: List): - return levenshtein(s1, s2) - + """Compute the Levenshtein edit distance between two strings. -def distance_fast(s1: str, s2: str): - """Compute the Levenshtein edit distance between two Unicode strings - - Also see `distance()`. + Also see `distance_unicode()`. The difference is that this implementation does not care about grapheme clusters or unicode normalization, assuming that this already has been done in preprocessing. 
""" - return c_distance(s1, s2) + return distance(s1.text, s2.text) @multimethod -def editops(seq1: List, seq2: List): - """ - Return sequence of edit operations transforming one sequence to another. +def editops_unicode(s1: str, s2: str): + """Return sequence of edit operations transforming one string to another. - This aims to return the same/similar results as python-Levenshtein's editops(), - just generalized to arbitrary sequences. + Note that this returns indices to the _grapheme clusters_, not characters! """ - seq1 = list(seq1) - seq2 = list(seq2) - m = len(seq1) - n = len(seq2) - D = levenshtein_matrix(seq1, seq2) - - def _tail_backtrace(i, j, accumulator): - if i > 0 and D[i - 1, j] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator - ) - if j > 0 and D[i, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: - return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP - return accumulator - - def backtrace(i, j): - result = partial(_tail_backtrace, i, j, []) - while isinstance(result, partial): - result = result() - - return result - - b = backtrace(m, n) - return b + s1, s2 = transform_unicode(s1, s2) + return editops(s1, s2) @multimethod -def editops(s1: str, s2: str): - """ - Return sequence of edit operations transforming one string to another. +def editops(l1: List, l2: List): + """Return sequence of edit operations transforming one list to another. - Note that this returns indices to the _grapheme clusters_, not characters! + Also see `editops_unicode()`. + + The difference is that this implementation does not care about grapheme clusters or + unicode normalization, assuming that this already has been done in preprocessing. """ - s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) - s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) - if any(len(s) > 1 for s in chain(s1, s2)): - return editops(s1, s2) - else: - return editops_fast("".join(s1), "".join(s2)) + s1, s2 = transform_lists(l1, l2) + return c_editops(s1, s2) -def editops_fast(s1: str, s2: str): +@multimethod +def editops(s1: str, s2: str): """Return sequence of edit operations transforming one string to another. - Also see `editops()`. + Also see `editops_unicode()`. The difference is that this implementation does not care about grapheme clusters or unicode normalization, assuming that this already has been done in preprocessing. """ return c_editops(s1, s2) + + +def transform_lists(l1: List, l2: List) -> Tuple[str, str]: + """Transform two lists into string representation. + + We need this transformation to be able to calculate a Levenshtein distance + between two sequences. + + Note that we can only process 1,114,111 unique elements with this implementation. + See https://docs.python.org/3/library/functions.html#chr + """ + mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))} + s1 = "".join([mapping[el] for el in l1]) + s2 = "".join([mapping[el] for el in l2]) + return s1, s2 + + +def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str]]]: + """Transform two text sequences to unicode representation. + + Normalize to unicode and decides whether we have wide chars + that needs to be represented by lists. 
+ """ + s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) + s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) + if all(len(s) < 2 for s in chain(s1, s2)): + s1, s2 = "".join(s1), "".join(s2) + return s1, s2 diff --git a/qurator/dinglehopper/notebooks/Levenshtein.ipynb b/qurator/dinglehopper/notebooks/Levenshtein.ipynb deleted file mode 100644 index f56d0d7..0000000 --- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb +++ /dev/null @@ -1,1037 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import unicodedata\n", - "import inspect" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Levenshtein edit distance" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def levenshtein_matrix(seq1, seq2):\n", - " \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n", - "\n", - " This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n", - " edit distance.\n", - "\n", - " This algorithm is implemented here because we need an implementation that can work with sequences other than\n", - " strings, e.g. lists of grapheme clusters or lists of word strings.\n", - " \"\"\"\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - "\n", - " def from_to(start, stop):\n", - " return range(start, stop + 1, 1)\n", - "\n", - " D = np.zeros((m + 1, n + 1), np.int)\n", - " D[0, 0] = 0\n", - " for i in from_to(1, m):\n", - " D[i, 0] = i\n", - " for j in from_to(1, n):\n", - " D[0, j] = j\n", - " for i in from_to(1, m):\n", - " for j in from_to(1, n):\n", - " D[i, j] = min(\n", - " D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution\n", - " D[i, j - 1] + 1, # Insertion\n", - " D[i - 1, j] + 1 # Deletion\n", - " )\n", - "\n", - " return D\n", - "\n", - "def levenshtein(seq1, seq2):\n", - " \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - "\n", - " D = levenshtein_matrix(seq1, seq2)\n", - " return D[m, n]\n", - "\n" - ] - } - ], - "source": [ - "from edit_distance import levenshtein_matrix, levenshtein\n", - "\n", - "print(inspect.getsource(levenshtein_matrix))\n", - "print(inspect.getsource(levenshtein))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "assert levenshtein('a', 'a') == 0\n", - "assert levenshtein('a', 'b') == 1\n", - "assert levenshtein('Foo', 'Bar') == 3\n", - "assert levenshtein('', '') == 0\n", - "assert levenshtein('Foo', '') == 3\n", - "assert levenshtein('', 'Foo') == 3\n", - "assert levenshtein('Fnord', 'Food') == 2\n", - "assert levenshtein('Müll', 'Mull') == 1\n", - "assert levenshtein('Abstand', 'Sand') == 4" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This fails for different representations of the \"same\" canonically equivalent string:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", - "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", - "levenshtein(word1, word2)" - ] - }, - { - "cell_type": "code", 
- "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Same, but for grapheme clusters\n", - "from uniseg.graphemecluster import grapheme_clusters\n", - "\n", - "word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schlyñ')))\n", - "word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schlyñ')))\n", - "levenshtein(word1, word2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Better." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define a edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def distance(s1, s2):\n", - " \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n", - "\n", - " Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n", - " clusters. This should be the correct way to compare two Unicode strings.\n", - " \"\"\"\n", - " s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n", - " s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n", - " return levenshtein(s1, s2)\n", - "\n" - ] - } - ], - "source": [ - "from edit_distance import distance\n", - "print(inspect.getsource(distance))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", - "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", - "\n", - "distance(word1, word2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word1 = 'Schlyñ'\n", - "word2 = 'Schlym̃'\n", - "#print('Lengths, as far as Python is concerned:', len(word1), len(word2)) # → gives 6 and 7!\n", - "distance(word1, word2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Edit operations\n", - "\n", - "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('insert', 5, 5), ('replace', 5, 6)]\n" - ] - } - ], - "source": [ - "import Levenshtein\n", - "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", - "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", - "print(Levenshtein.editops(word1, word2))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def seq_editops(seq1, seq2):\n", - " seq1 = list(seq1)\n", - " seq2 = list(seq2)\n", - " m = len(seq1)\n", - " n = len(seq2)\n", - " D = levenshtein_matrix(seq1, seq2)\n", - "\n", - " def _tail_backtrace(i, j, accumulator):\n", - " if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n", - " if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n", - " if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n", - " if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n", - " return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP\n", - " return accumulator\n", - "\n", - " def backtrace(i, j):\n", - " result = partial(_tail_backtrace, i, j, [])\n", - " while isinstance(result, partial):\n", - " result = result()\n", - "\n", - " return result\n", - "\n", - " b = backtrace(m, n)\n", - " return b\n", - "\n", - "def editops(word1, word2):\n", - " # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n", - " word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n", - " word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n", - " return seq_editops(word1, word2)\n", - "\n" - ] - } - ], - "source": [ - "from edit_distance import seq_editops, editops\n", - "print(inspect.getsource(seq_editops))\n", - "print(inspect.getsource(editops))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('replace', 2, 2)]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "editops('Foo', 'Fon')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('insert', 4, 4)]\n", - "[('insert', 4, 4)]\n" - ] - } - ], - "source": [ - "print(editops('Käptn', 'Käpt\\'n'))\n", - "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('delete', 6, 6)]\n", - "[('delete', 6, 6)]\n" - ] - } - ], - "source": [ - "print(editops('Delete something', 'Deletesomething'))\n", - "print(Levenshtein.editops('Delete something', 'Deletesomething'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n", - "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n" - ] - } - ], - "source": [ - "print(editops('A more difficult example', 'Amore difficült exampl'))\n", - "print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "XXX Note that our implementation returns different positions here for the 'insert'. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's try it with a difficult example that needs grapheme cluster handling:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('replace', 5, 5)]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", - "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", - "\n", - "editops(word1, word2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "🎉" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Character error rate\n", - "\n", - "[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:\n", - "\n", - "$$\n", - "\\text{CER} = \\frac{i + s + d}{n}\n", - "$$\n", - "\n", - "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because our edit distance is equal to $i + s + d$, we can thus define:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def character_error_rate(reference, compared):\n", - " d = distance(reference, compared)\n", - " if d == 0:\n", - " return 0\n", - "\n", - " n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n", - " if n == 0:\n", - " return float('inf')\n", - "\n", - " return d/n\n", - "\n" - ] - } - ], - "source": [ - "from character_error_rate import character_error_rate\n", - "print(inspect.getsource(character_error_rate))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "assert character_error_rate('Foo', 'Bär') == 3/3\n", - "assert character_error_rate('Fnord', 'Food') == 2/5\n", - "assert character_error_rate('Food', 'Fnord') == 2/4\n", - "assert character_error_rate('Schlyñ', 'Schlym̃') == 1/6" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.\n", - "gt = \"\"\"115 über die vielen Sorgen wegen deſſelben vergaß Hartkopf, der Frau Amtmännin das ver⸗ ſprochene zu überliefern. — Ein Erpreſſer wurde an ihn abgeſchickt, um ihn ums Him⸗ melswillen zu ſagen, daß er das Verſprochene gleich den Augenblick überbringen möchte, die Frau Amtmännin hätte ſich auf ihn verlaſſen, und nun wüßte ſie nicht, was ſie anfangen ſollte. Den Augenblick ſollte er kommen, ſonſt vergieng ſie in ihrer Angſt. — Die Gäſte wären ſchon angekommen, und es fehlte ihr doch noch an allem. — Hartkopf mußte ſich erſt beſinnen, und endlich nach langem Nachdenken fiel es ihm erſt wieder ein. — Er langte den Zettel aus dem Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das, was da wäre, herbeyſchaffen möchte. Jndeß mangelten doch einige Generalia, die alſo wegfielen. — Hartkopf gieng ſelbſt mit und überbrachte es. — „Herr Jemine! 
er böſer Mann!“ — ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der voll gedrückt, gerüttelt und überflüſſig in ihren Schoos gegeben werden ſollte, mit Augen voller Freu⸗ H 2\"\"\"\n", - "tess = \"\"\"emm unmit; Lis Übey die vielen Sorgen wegen\" deſſelben vergaß Hartkopf, der Frau! Amimännin das- ver ſprochene zu überliefeen. ==\" Ein Epypreſſer- wurde an ihn abgeſchieet', um' ihn ums Hime melswillen zu ſagen, \"daß er das Verſyrochene leich den Augenblick \"überbringen möchte, die Frau Amtmännin hätte ſich auf ihn veriaſſen, und nun wüßte ſie- nicht, was ſie anfangen ſollte, =! 'Den Augenblick ſollte \"er kommen, ſonſt vergieng ſie in ihrer Angſt. == Die Säuaſie- wären. ſchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mußte ſich erſt TIM und endlich mach langem Rachdenken fiel es ihm erſt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das , was da wäre, herbeyſchaffen mschte. ZIudeß „mangelten doch einige Generalia, die alſo wegfielen. == ' Havrkopf gieng ſelbſt mit und überbrachte es == | „Herr Jemine! er böſer Mann 1-2 ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der - voll gedrückt, gerüttelt und überfirfſig in ihren Ss HEILE werden ſolite, mit Augen voller EE) Fron?\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1190\n" - ] - } - ], - "source": [ - "print('{:.4f}'.format(character_error_rate(gt, tess)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.1190253045923149" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "levenshtein(gt, tess)/len(gt)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "for w in gt, tess:\n", - " for g in grapheme_clusters(w):\n", - " assert len(g) == 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Maybe ocrevalUAtion doesn't count whitespace?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'115überdievielenSorgenwegendeſſelbenvergaßHartkopf,derFrauAmtmännindasver⸗ſprochenezuüberliefern.—EinErpreſſerwurdeanihnabgeſchickt,umihnumsHim⸗melswillenzuſagen,daßerdasVerſprochenegleichdenAugenblicküberbringenmöchte,dieFrauAmtmänninhätteſichaufihnverlaſſen,undnunwüßteſienicht,wasſieanfangenſollte.DenAugenblickſollteerkommen,ſonſtvergiengſieinihrerAngſt.—DieGäſtewärenſchonangekommen,undesfehlteihrdochnochanallem.—Hartkopfmußteſicherſtbeſinnen,undendlichnachlangemNachdenkenfielesihmerſtwiederein.—ErlangtedenZettelausdemAccisbucheheraus,undſagteſeinerFrau,daßſiedas,wasdawäre,herbeyſchaffenmöchte.JndeßmangeltendocheinigeGeneralia,diealſowegfielen.—Hartkopfgiengſelbſtmitundüberbrachtees.—„HerrJemine!erböſerMann!“—ſchrieihmdieFrauAmtmänninentgegen,undſchlugihnaufdieSchulterundblicktedenKorb,dervollgedrückt,gerütteltundüberflüſſiginihrenSchoosgegebenwerdenſollte,mitAugenvollerFreu⸗H2'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def remove_whitespace(s):\n", - " return s.replace(' ', '')\n", - "remove_whitespace(gt)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1324\n" - ] - } - ], - "source": [ - "print('{:.4f}'.format(character_error_rate(remove_whitespace(gt), remove_whitespace(tess))))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's larger than ocrevalUAtion 🤷‍♂️" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Word error rate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Word segmentation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Naively split on spaces.\n", - "\n", - "(Note: ocrevalUAtion does confusing things here, like the Token splitting in a hash function, with an empty pattern?!)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "def naive_word_split(s):\n", - " return s.split(' ')" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "example_text = \"The quick (“brown”) fox can't jump 32.3 feet, right?\"" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['The',\n", - " 'quick',\n", - " '(“brown”)',\n", - " 'fox',\n", - " \"can't\",\n", - " 'jump',\n", - " '32.3',\n", - " 'feet,',\n", - " 'right?']" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "naive_word_split(example_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's do it the Unicode way (Appendix UAX #29 on Unicode Text Segmentation): Split on word boundaries using the uniseg libraries and ignore words that contain only whitespace, punctuation \"and similar characters\":" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def words(s):\n", - " # Patch uniseg.wordbreak.word_break to deal with our private use characters. 
See also\n", - " # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n", - " old_word_break = uniseg.wordbreak.word_break\n", - "\n", - " def new_word_break(c, index=0):\n", - " if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n", - " return 'ALetter'\n", - " else:\n", - " return old_word_break(c, index)\n", - " uniseg.wordbreak.word_break = new_word_break\n", - "\n", - " # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n", - " def unwanted(c):\n", - "\n", - " # See https://www.fileformat.info/info/unicode/category/index.htm\n", - " # and https://unicodebook.readthedocs.io/unicode.html#categories\n", - " unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n", - " unwanted_subcategories = 'Cc', 'Cf'\n", - "\n", - " subcat = unicodedata.category(c)\n", - " cat = subcat[0]\n", - " return cat in unwanted_categories or subcat in unwanted_subcategories\n", - "\n", - " # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n", - " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n", - " for word in uniseg.wordbreak.words(s):\n", - " if all(unwanted(c) for c in word):\n", - " pass\n", - " else:\n", - " yield word\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "['The', 'quick', 'brown', 'fox', \"can't\", 'jump', '32.3', 'feet', 'right']" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from word_error_rate import words\n", - "print(inspect.getsource(words))\n", - "\n", - "list(words(example_text))" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Der',\n", - " 'schnelle',\n", - " 'braune',\n", - " 'Fuchs',\n", - " 'kann',\n", - " 'keine',\n", - " '3,14',\n", - " 'Meter',\n", - " 'springen',\n", - " 'oder']" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(words('Dies ist ein Beispielsatz. Oh, ja.'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['我', '很', '高', '興', '跟', '你', '見', '面']" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(words('我很高興跟你見面')) # \"Pleased to meet you\" in Mandarin, Traditional writing" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['医', '者', 'を', '呼', 'ん', 'で', 'く', 'だ', 'さ', 'い']" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(words('医者を呼んでください。'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Word error rate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the word error rate, normalize again and compare sequences of words." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def word_error_rate(reference, compared):\n", - " if isinstance(reference, str):\n", - " reference_seq = list(words_normalized(reference))\n", - " compared_seq = list(words_normalized(compared))\n", - " else:\n", - " reference_seq = list(reference)\n", - " compared_seq = list(compared)\n", - "\n", - " d = levenshtein(reference_seq, compared_seq)\n", - " if d == 0:\n", - " return 0\n", - "\n", - " n = len(reference_seq)\n", - " if n == 0:\n", - " return float('inf')\n", - "\n", - " return d / n\n", - "\n" - ] - } - ], - "source": [ - "from word_error_rate import word_error_rate\n", - "print(inspect.getsource(word_error_rate))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.25" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.75" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word_error_rate('Fnord ist verdampfter Kräutertee!', 'Fnòrd ist verdmpfter Krautertee.')" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.18823529411764706" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word_error_rate(gt, tess)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a little larger than the ocrevalUAtion result!" 
- ] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index adfbbab..7c513e6 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality from pkg_resources import resource_string from .cli import process as cli_process -from .edit_distance import levenshtein_matrix_cache_clear OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) @@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor): local_filename=report_prefix + report_suffix, ) - # Clear cache between files - levenshtein_matrix_cache_clear() if __name__ == "__main__": ocrd_dinglehopper() diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py index ed31379..81e03c4 100644 --- a/qurator/dinglehopper/tests/test_edit_distance.py +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -2,7 +2,7 @@ import unicodedata import pytest -from .. import distance, distance_fast +from .. import distance, distance_unicode TEST_PARAMS = "s1,s2,expected_dist" @@ -42,25 +42,13 @@ def test_distance_sequences(s1, s2, expected_dist): assert dist == expected_dist -@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS) -def test_distance_strings(s1, s2, expected_dist): - dist = distance(s1, s2) - assert dist == expected_dist - - -@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS) -def test_distance_fast(s1, s2, expected_dist): - dist = distance_fast(s1, s2) - assert dist == expected_dist - - @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE) -def test_editops_fast_unicode(s1, s2, expected_dist): - dist = distance_fast(s1, s2) +def test_distance_with_unicode(s1, s2, expected_dist): + dist = distance(s1, s2) assert dist != expected_dist @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE) def test_distance_unicode(s1, s2, expected_dist): - dist = distance(s1, s2) + dist = distance_unicode(s1, s2) assert dist == expected_dist diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py index 5a10db2..c33027d 100644 --- a/qurator/dinglehopper/tests/test_editops.py +++ b/qurator/dinglehopper/tests/test_editops.py @@ -2,7 +2,7 @@ import unicodedata import pytest -from .. import editops, editops_fast +from .. 
import editops, editops_unicode TEST_PARAMS = "s1,s2,expected_ops" @@ -51,36 +51,22 @@ TEST_UNICODE = [ ] -@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS) -def test_editops_strings(s1, s2, expected_ops): - ops = editops(s1, s2) - assert ops == expected_ops - - @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES]) -def test_editops_sequences(s1, s2, expected_ops): +def test_editops(s1, s2, expected_ops): ops = editops(s1, s2) assert ops == expected_ops -@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS) -def test_editops_fast(s1, s2, expected_ops): - ops = editops_fast(s1, s2) - assert ops == expected_ops - - @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE) -def test_editops_fast_unicode(s1, s2, expected_ops): - ops = editops_fast(s1, s2) +def test_editops_with_unicode(s1, s2, expected_ops): + ops = editops(s1, s2) assert ops != expected_ops @pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE) def test_editops_unicode(s1, s2, expected_ops): - """Test editops() in cases where dealing with grapheme clusters matters""" - if not expected_ops: assert s1 != s2 assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2) - ops = editops(s1, s2) + ops = editops_unicode(s1, s2) assert ops == expected_ops diff --git a/requirements.txt b/requirements.txt index 7bb53ac..99172c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm +python-levenshtein
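Note on the encoding trick behind transform_lists() above:
python-Levenshtein's C functions accept only strings, so the patch maps
every unique sequence element (e.g. a word or a grapheme cluster) onto a
single Unicode code point and then runs the C distance()/editops() on the
encoded strings. Below is a minimal, self-contained sketch of that idea;
encode_as_strings is an illustrative stand-in for the patch's
transform_lists(), not code from the patch itself:

    # Sketch: reuse python-Levenshtein's C string functions for arbitrary
    # sequences by encoding each unique element as one Unicode code point.
    from itertools import chain
    from typing import List, Tuple

    from Levenshtein import distance as c_distance, editops as c_editops

    def encode_as_strings(l1: List, l2: List) -> Tuple[str, str]:
        # One code point per unique element; chr() covers 0..0x10FFFF,
        # which bounds the alphabet to 1,114,112 distinct elements.
        mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
        return "".join(mapping[el] for el in l1), "".join(mapping[el] for el in l2)

    words_a = "the quick brown fox".split()
    words_b = "the quick red fox jumps".split()
    s1, s2 = encode_as_strings(words_a, words_b)

    print(c_distance(s1, s2))  # 2 (one substitution, one insertion)
    print(c_editops(s1, s2))   # [('replace', 2, 2), ('insert', 4, 4)]

Because the same injective mapping is applied to both sequences, the
distance and the edit-operation indices computed on the encoded strings
are exactly those of the original sequences.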