Merge branch 'rapidfuzz'

pull/66/head
Gerber, Mike 3 years ago
commit dea0c53f88

@ -1,4 +1,5 @@
from .edit_distance import *
from rapidfuzz.string_metric import levenshtein_editops
def align(t1, t2):
@ -12,7 +13,7 @@ def seq_align(s1, s2):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = seq_editops(s1, s2)
ops = levenshtein_editops(s1, s2)
i = 0
j = 0

@ -8,79 +8,19 @@ import numpy as np
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from tqdm import tqdm
from rapidfuzz.string_metric import levenshtein, levenshtein_editops
from .extracted_text import ExtractedText
from .config import Config
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
    """Return the Wagner-Fischer matrix used to derive the Levenshtein distance.

    The bottom-right element of the returned matrix is the edit distance
    between ``seq1`` and ``seq2``.

    The implementation lives in this module because it has to work with
    sequences other than strings, e.g. lists of grapheme clusters or lists of
    word strings.
    """
    # The real work is done by an LRU-cached helper. The cache needs hashable
    # arguments, so both sequences are frozen into tuples before the call.
    hashable1 = tuple(seq1)
    hashable2 = tuple(seq2)
    return _levenshtein_matrix(hashable1, hashable2)
@lru_cache(maxsize=10)
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
    """Compute the matrix commonly computed to produce the Levenshtein distance.

    This is an LRU-cached function not meant to be used directly; call
    levenshtein_matrix() instead, which converts its arguments to the
    hashable tuples required by the cache.

    :param seq1: first sequence, as a tuple
    :param seq2: second sequence, as a tuple
    :return: integer matrix of shape ``(len(seq1) + 1, len(seq2) + 1)`` whose
        bottom-right element is the edit distance
    """
    m = len(seq1)
    n = len(seq2)

    # ``np.int`` was merely an alias of the builtin ``int``; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
    D = np.zeros((m + 1, n + 1), int)

    # Border cells: turning a prefix into the empty sequence (or vice versa)
    # costs exactly the prefix length.
    D[:, 0] = np.arange(m + 1)
    D[0, :] = np.arange(n + 1)

    for i in tqdm(range(1, m + 1), disable=not Config.progress):
        for j in range(1, n + 1):
            D[i, j] = min(
                D[i - 1, j - 1]
                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                D[i, j - 1] + 1,  # Insertion
                D[i - 1, j] + 1,  # Deletion
            )

    return D
def levenshtein(seq1, seq2):
    """Return the Levenshtein edit distance between two sequences.

    The distance is read from the bottom-right cell of the matrix produced by
    levenshtein_matrix().
    """
    # Index of the bottom-right cell, determined before the matrix is built.
    last_cell = (len(seq1), len(seq2))
    return levenshtein_matrix(seq1, seq2)[last_cell]
def levenshtein_matrix_cache_clear():
    """Drop every cached Levenshtein matrix.

    Call this between different input file pairs: results cached for earlier
    files are then released, which decreases memory usage.
    """
    clear_cache = _levenshtein_matrix.cache_clear
    clear_cache()
@multimethod
def distance(s1: str, s2: str):
"""Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
Note that this is different from levenshtein() as this function knows about Unicode
normalization and grapheme clusters. This should be the correct way to compare two
Unicode strings.
"""
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
return distance(s1.text, s2.text)
def seq_editops(seq1, seq2):
    """
    Return the edit operations that transform one sequence into another.

    The result aims to be the same as/similar to python-Levenshtein's
    editops(), but generalized to arbitrary sequences. Each operation is a
    tuple ``(name, pos1, pos2)`` with ``name`` being one of "delete",
    "insert" or "replace".
    """
    seq1 = list(seq1)
    seq2 = list(seq2)
    D = levenshtein_matrix(seq1, seq2)

    # Walk back from the bottom-right cell. Operations are found last-to-first,
    # so they are collected in reverse and flipped once at the end.
    i, j = len(seq1), len(seq2)
    found = []
    while True:
        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
            i -= 1
            found.append(("delete", i, j))
        elif j > 0 and D[i, j - 1] + 1 == D[i, j]:
            j -= 1
            found.append(("insert", i, j))
        elif i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
            i -= 1
            j -= 1
            found.append(("replace", i, j))
        elif i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
            # Elements match: move diagonally without recording anything.
            i -= 1
            j -= 1
        else:
            break

    found.reverse()
    return found
def editops(word1, word2):
"""
Return sequence of edit operations transforming one string to another.
@ -141,4 +40,4 @@ def editops(word1, word2):
"""
word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
return seq_editops(word1, word2)
return levenshtein_editops(word1, word2)

@ -18,62 +18,20 @@
"# Levenshtein edit distance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def levenshtein_matrix(seq1, seq2):\n",
" \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n",
"\n",
" This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n",
" edit distance.\n",
"\n",
" This algorithm is implemented here because we need an implementation that can work with sequences other than\n",
" strings, e.g. lists of grapheme clusters or lists of word strings.\n",
" \"\"\"\n",
" m = len(seq1)\n",
" n = len(seq2)\n",
"\n",
" def from_to(start, stop):\n",
" return range(start, stop + 1, 1)\n",
"\n",
" D = np.zeros((m + 1, n + 1), np.int)\n",
" D[0, 0] = 0\n",
" for i in from_to(1, m):\n",
" D[i, 0] = i\n",
" for j in from_to(1, n):\n",
" D[0, j] = j\n",
" for i in from_to(1, m):\n",
" for j in from_to(1, n):\n",
" D[i, j] = min(\n",
" D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution\n",
" D[i, j - 1] + 1, # Insertion\n",
" D[i - 1, j] + 1 # Deletion\n",
" )\n",
"\n",
" return D\n",
"\n",
"def levenshtein(seq1, seq2):\n",
" \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n",
" m = len(seq1)\n",
" n = len(seq2)\n",
"\n",
" D = levenshtein_matrix(seq1, seq2)\n",
" return D[m, n]\n",
"\n"
]
}
],
"outputs": [],
"source": [
"from edit_distance import levenshtein_matrix, levenshtein\n",
"\n",
"print(inspect.getsource(levenshtein_matrix))\n",
"print(inspect.getsource(levenshtein))"
"from rapidfuzz.string_metric import levenshtein"
]
},
{
@ -170,21 +128,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
"def distance(s1, s2):\n",
"@multimethod\n",
"def distance(s1: str, s2: str):\n",
" \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
"\n",
" Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n",
" clusters. This should be the correct way to compare two Unicode strings.\n",
" Note that this is different from levenshtein() as this function knows about Unicode\n",
" normalization and grapheme clusters. This should be the correct way to compare two\n",
" Unicode strings.\n",
" \"\"\"\n",
" s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n",
" s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n",
" return levenshtein(s1, s2)\n",
" seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n",
" seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n",
" return levenshtein(seq1, seq2)\n",
"\n"
]
}
],
"source": [
"from edit_distance import distance\n",
"from qurator.dinglehopper.edit_distance import distance\n",
"print(inspect.getsource(distance))"
]
},
@ -247,8 +207,7 @@
"source": [
"# Edit operations\n",
"\n",
"python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n",
"\n"
"python-Levenshtein and RapidFuzz support backtracing, i.e. giving a sequence of edit operations that transforms one word into another:"
]
},
{
@ -257,32 +216,20 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('insert', 5, 5), ('replace', 5, 6)]\n"
]
}
],
"source": [
"import Levenshtein\n",
"word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
"word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
"print(Levenshtein.editops(word1, word2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations."
"data": {
"text/plain": [
"[('replace', 2, 2)]"
]
},
{
"cell_type": "markdown",
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)"
"from rapidfuzz.string_metric import levenshtein_editops as editops\n",
"\n",
"editops('Foo', 'Fon')"
]
},
{
@ -294,47 +241,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"def seq_editops(seq1, seq2):\n",
" seq1 = list(seq1)\n",
" seq2 = list(seq2)\n",
" m = len(seq1)\n",
" n = len(seq2)\n",
" D = levenshtein_matrix(seq1, seq2)\n",
"\n",
" def _tail_backtrace(i, j, accumulator):\n",
" if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n",
" return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n",
" if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n",
" return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n",
" if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n",
" return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n",
" if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n",
" return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP\n",
" return accumulator\n",
"\n",
" def backtrace(i, j):\n",
" result = partial(_tail_backtrace, i, j, [])\n",
" while isinstance(result, partial):\n",
" result = result()\n",
"\n",
" return result\n",
"\n",
" b = backtrace(m, n)\n",
" return b\n",
"\n",
"def editops(word1, word2):\n",
" # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n",
" word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n",
" word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n",
" return seq_editops(word1, word2)\n",
"\n"
"[('insert', 4, 4)]\n"
]
}
],
"source": [
"from edit_distance import seq_editops, editops\n",
"print(inspect.getsource(seq_editops))\n",
"print(inspect.getsource(editops))"
"print(editops('Käptn', 'Käpt\\'n'))"
]
},
{
@ -343,18 +255,15 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('replace', 2, 2)]"
"name": "stdout",
"output_type": "stream",
"text": [
"[('delete', 6, 6)]\n"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"editops('Foo', 'Fon')"
"print(editops('Delete something', 'Deletesomething'))"
]
},
{
@ -366,66 +275,76 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[('insert', 4, 4)]\n",
"[('insert', 4, 4)]\n"
"[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
]
}
],
"source": [
"print(editops('Käptn', 'Käpt\\'n'))\n",
"print(Levenshtein.editops('Käptn', 'Käpt\\'n'))"
"print(editops('A more difficult example', 'Amore difficült exampl'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"cell_type": "markdown",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('delete', 6, 6)]\n",
"[('delete', 6, 6)]\n"
]
}
],
"source": [
"print(editops('Delete something', 'Deletesomething'))\n",
"print(Levenshtein.editops('Delete something', 'Deletesomething'))"
"Let's try it with a difficult example that needs grapheme cluster handling:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n",
"[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
"data": {
"text/plain": [
"[('insert', 5, 5), ('replace', 5, 6)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(editops('A more difficult example', 'Amore difficült exampl'))\n",
"print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))"
"word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
"word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
"\n",
"editops(word1, word2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"XXX Note that our implementation returns different positions here for the 'insert'. "
"That doesn't look right, let's redefine it with grapheme cluster support:"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def editops(word1, word2):\n",
" \"\"\"\n",
" Return sequence of edit operations transforming one string to another.\n",
"\n",
" Note that this returns indices to the _grapheme clusters_, not characters!\n",
" \"\"\"\n",
" word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n",
" word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n",
" return levenshtein_editops(word1, word2)\n",
"\n"
]
}
],
"source": [
"Let's try it with a difficult example that needs grapheme cluster handling:"
"from qurator.dinglehopper.edit_distance import editops\n",
"print(inspect.getsource(editops))"
]
},
{
@ -455,7 +374,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"🎉"
"🎉\n",
"\n",
"Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!"
]
},
{
@ -489,22 +410,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"def character_error_rate(reference, compared):\n",
" d = distance(reference, compared)\n",
" if d == 0:\n",
" return 0\n",
"\n",
" n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n",
" if n == 0:\n",
" return float('inf')\n",
"def character_error_rate(reference, compared) -> float:\n",
" \"\"\"\n",
" Compute character error rate.\n",
"\n",
" return d/n\n",
" :return: character error rate\n",
" \"\"\"\n",
" cer, _ = character_error_rate_n(reference, compared)\n",
" return cer\n",
"\n"
]
}
],
"source": [
"from character_error_rate import character_error_rate\n",
"from qurator.dinglehopper.character_error_rate import character_error_rate\n",
"print(inspect.getsource(character_error_rate))"
]
},
@ -732,16 +651,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"def words(s):\n",
"@multimethod\n",
"def words(s: str):\n",
" \"\"\"Extract words from a string\"\"\"\n",
"\n",
" # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
" # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
" old_word_break = uniseg.wordbreak.word_break\n",
"\n",
" def new_word_break(c, index=0):\n",
" if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n",
" return 'ALetter'\n",
" return \"ALetter\"\n",
" else:\n",
" return old_word_break(c, index)\n",
"\n",
" uniseg.wordbreak.word_break = new_word_break\n",
"\n",
" # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
@ -749,8 +672,8 @@
"\n",
" # See https://www.fileformat.info/info/unicode/category/index.htm\n",
" # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
" unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n",
" unwanted_subcategories = 'Cc', 'Cf'\n",
" unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n",
" unwanted_subcategories = \"Cc\", \"Cf\"\n",
"\n",
" subcat = unicodedata.category(c)\n",
" cat = subcat[0]\n",
@ -778,7 +701,7 @@
}
],
"source": [
"from word_error_rate import words\n",
"from qurator.dinglehopper.word_error_rate import words\n",
"print(inspect.getsource(words))\n",
"\n",
"list(words(example_text))"
@ -905,29 +828,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"def word_error_rate(reference, compared):\n",
" if isinstance(reference, str):\n",
" reference_seq = list(words_normalized(reference))\n",
" compared_seq = list(words_normalized(compared))\n",
" else:\n",
" reference_seq = list(reference)\n",
" compared_seq = list(compared)\n",
"\n",
" d = levenshtein(reference_seq, compared_seq)\n",
" if d == 0:\n",
" return 0\n",
"\n",
" n = len(reference_seq)\n",
" if n == 0:\n",
" return float('inf')\n",
"\n",
" return d / n\n",
"def word_error_rate(reference, compared) -> float:\n",
" wer, _ = word_error_rate_n(reference, compared)\n",
" return wer\n",
"\n"
]
}
],
"source": [
"from word_error_rate import word_error_rate\n",
"from qurator.dinglehopper.word_error_rate import word_error_rate\n",
"print(inspect.getsource(word_error_rate))"
]
},
@ -1002,9 +911,9 @@
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"display_name": "dinglehopper-github",
"language": "python",
"name": "python3"
"name": "dinglehopper-github"
},
"language_info": {
"codemirror_mode": {
@ -1016,7 +925,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.12"
},
"toc": {
"base_numbering": 1,

@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
from pkg_resources import resource_string
from .cli import process as cli_process
from .edit_distance import levenshtein_matrix_cache_clear
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
local_filename=report_prefix + report_suffix,
)
# Clear cache between files
levenshtein_matrix_cache_clear()
if __name__ == "__main__":
ocrd_dinglehopper()

@ -2,27 +2,7 @@ from __future__ import division, print_function
import unicodedata
from .. import levenshtein, distance
def test_levenshtein():
    """Check levenshtein() on pairs of plain strings, including empty ones."""
    expectations = [
        ("a", "a", 0),
        ("a", "b", 1),
        ("Foo", "Bar", 3),
        ("", "", 0),
        ("Foo", "", 3),
        ("", "Foo", 3),
        ("Foo", "Food", 1),
        ("Fnord", "Food", 2),
        ("Müll", "Mull", 1),
        ("Abstand", "Sand", 4),
    ]
    for left, right, expected in expectations:
        assert levenshtein(left, right) == expected
def test_levenshtein_other_sequences():
    """Check levenshtein() on lists of strings rather than plain strings."""
    reference = ["a", "ab"]
    for other, expected in ((["a", "ab", "c"], 1), (["a", "c"], 1)):
        assert levenshtein(reference, other) == expected
from .. import distance
def test_distance():

@ -1,48 +1,6 @@
import unicodedata
from .. import seq_editops, editops
def test_trivial():
    """Identical inputs, including two empty ones, need no edit operations."""
    for same in ("abc", ""):
        assert seq_editops(same, same) == []
def test_insert():
    """A single insertion is reported at the position where it happens."""
    expectations = [
        ("bc", "abc", 0),
        ("ac", "abc", 1),
        ("ab", "abc", 2),
        ("", "a", 0),
    ]
    for source, target, position in expectations:
        assert seq_editops(source, target) == [("insert", position, position)]
def test_multiple():
    """An insertion followed by a replacement yields both operations in order."""
    expected = [("insert", 0, 0), ("replace", 2, 3)]
    actual = seq_editops("bcd", "abce")
    assert actual == expected
def test_delete():
    """Deletions are reported alone and mixed with other operations."""
    expectations = [
        ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
        ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
        ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
        ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
        (
            "Foo",
            "",
            [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)],
        ),
        (
            "Foolish",
            "Foo",
            [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
        ),
    ]
    for source, target, expected in expectations:
        assert seq_editops(source, target) == expected
def test_ambiguous():
    """Pin the operations chosen when several minimal edit scripts exist."""
    expected = [("insert", 0, 0), ("replace", 2, 3), ("insert", 3, 4)]
    actual = seq_editops("bcd", "abcef")
    assert actual == expected
from .. import editops
def test_editops():

@ -6,7 +6,7 @@ from multimethod import multimethod
import uniseg.wordbreak
from .edit_distance import levenshtein
from rapidfuzz.string_metric import levenshtein
from . import ExtractedText

@ -9,3 +9,4 @@ ocrd >= 2.20.1
attrs
multimethod == 1.3 # latest version to officially support Python 3.5
tqdm
rapidfuzz >= 1.8.1

Loading…
Cancel
Save