Mirror of https://github.com/qurator-spk/dinglehopper.git
Merge branch 'rapidfuzz'
Commit dea0c53f88
8 changed files with 116 additions and 371 deletions
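
This merge drops dinglehopper's pure-Python Wagner-Fischer implementation (levenshtein_matrix, levenshtein, seq_editops) in favor of RapidFuzz's C++-backed string metrics. A minimal sketch of the API being adopted, assuming rapidfuzz >= 1.8.1 as pinned in requirements.txt below (later RapidFuzz releases moved these functions to rapidfuzz.distance):

    # Sketch of the rapidfuzz.string_metric API this commit switches to.
    from rapidfuzz.string_metric import levenshtein, levenshtein_editops

    # Edit distance between two sequences.
    assert levenshtein("Fnord", "Food") == 2

    # Edit operations as (tag, src_pos, dest_pos) tuples.
    print(levenshtein_editops("Foo", "Fon"))  # [('replace', 2, 2)]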
@@ -1,4 +1,5 @@
 from .edit_distance import *
+from rapidfuzz.string_metric import levenshtein_editops
 
 
 def align(t1, t2):
@@ -12,7 +13,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
     i = 0
     j = 0
 
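For context: seq_align() above consumes these edit operations to yield pairs of aligned elements. A hedged sketch (not the project's exact code) of how such a walk over (tag, src_pos, dest_pos) tuples can work:

    # Illustrative only: align two sequences by walking the editops.
    from rapidfuzz.string_metric import levenshtein_editops

    def sketch_align(s1, s2):
        ops = list(levenshtein_editops(s1, s2))
        i = j = 0
        while i < len(s1) or j < len(s2):
            if ops and ops[0][1] == i and ops[0][2] == j:
                tag = ops.pop(0)[0]
                if tag == "insert":    # element only present in s2
                    yield None, s2[j]
                    j += 1
                elif tag == "delete":  # element only present in s1
                    yield s1[i], None
                    i += 1
                else:                  # "replace"
                    yield s1[i], s2[j]
                    i, j = i + 1, j + 1
            else:                      # equal elements, advance both sides
                yield s1[i], s2[j]
                i, j = i + 1, j + 1

    print(list(sketch_align("Foo", "Fon")))
    # [('F', 'F'), ('o', 'o'), ('o', 'n')]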
@@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
+from rapidfuzz.string_metric import levenshtein, levenshtein_editops
 
 from .extracted_text import ExtractedText
 from .config import Config
 
 
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
 @multimethod
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
     return distance(s1.text, s2.text)
 
 
-def seq_editops(seq1, seq2):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
 def editops(word1, word2):
     """
     Return sequence of edit operations transforming one string to another.
@@ -141,4 +40,4 @@ def editops(word1, word2):
     """
     word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
     word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    return levenshtein_editops(word1, word2)
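The distance() docstring above is the crux of this module: comparison happens on NFC-normalized grapheme clusters, not on code points. A hedged example of the difference (import paths as used in the notebook below):

    # 'm' + COMBINING TILDE is one user-visible character but two code points.
    from rapidfuzz.string_metric import levenshtein
    from qurator.dinglehopper.edit_distance import distance

    word1 = 'Schlyñ'   # with LATIN SMALL LETTER N WITH TILDE
    word2 = 'Schlym̃'  # with LATIN SMALL LETTER M + COMBINING TILDE
    print(levenshtein(word1, word2))  # 2, counted on code points
    print(distance(word1, word2))     # 1, counted on grapheme clusters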
@@ -18,62 +18,20 @@
    "# Levenshtein edit distance"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "def levenshtein_matrix(seq1, seq2):\n",
-     "    \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n",
-     "\n",
-     "    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n",
-     "    edit distance.\n",
-     "\n",
-     "    This algorithm is implemented here because we need an implementation that can work with sequences other than\n",
-     "    strings, e.g. lists of grapheme clusters or lists of word strings.\n",
-     "    \"\"\"\n",
-     "    m = len(seq1)\n",
-     "    n = len(seq2)\n",
-     "\n",
-     "    def from_to(start, stop):\n",
-     "        return range(start, stop + 1, 1)\n",
-     "\n",
-     "    D = np.zeros((m + 1, n + 1), np.int)\n",
-     "    D[0, 0] = 0\n",
-     "    for i in from_to(1, m):\n",
-     "        D[i, 0] = i\n",
-     "    for j in from_to(1, n):\n",
-     "        D[0, j] = j\n",
-     "    for i in from_to(1, m):\n",
-     "        for j in from_to(1, n):\n",
-     "            D[i, j] = min(\n",
-     "                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution\n",
-     "                D[i, j - 1] + 1,  # Insertion\n",
-     "                D[i - 1, j] + 1  # Deletion\n",
-     "            )\n",
-     "\n",
-     "    return D\n",
-     "\n",
-     "def levenshtein(seq1, seq2):\n",
-     "    \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n",
-     "    m = len(seq1)\n",
-     "    n = len(seq2)\n",
-     "\n",
-     "    D = levenshtein_matrix(seq1, seq2)\n",
-     "    return D[m, n]\n",
-     "\n"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
-   "from edit_distance import levenshtein_matrix, levenshtein\n",
-   "\n",
-   "print(inspect.getsource(levenshtein_matrix))\n",
-   "print(inspect.getsource(levenshtein))"
+   "from rapidfuzz.string_metric import levenshtein"
   ]
  },
  {
@@ -170,21 +128,23 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "def distance(s1, s2):\n",
+     "@multimethod\n",
+     "def distance(s1: str, s2: str):\n",
      "    \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
      "\n",
-     "    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n",
-     "    clusters. This should be the correct way to compare two Unicode strings.\n",
+     "    Note that this is different from levenshtein() as this function knows about Unicode\n",
+     "    normalization and grapheme clusters. This should be the correct way to compare two\n",
+     "    Unicode strings.\n",
      "    \"\"\"\n",
-     "    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n",
-     "    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n",
-     "    return levenshtein(s1, s2)\n",
+     "    seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n",
+     "    seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n",
+     "    return levenshtein(seq1, seq2)\n",
      "\n"
     ]
    }
   ],
   "source": [
-    "from edit_distance import distance\n",
+    "from qurator.dinglehopper.edit_distance import distance\n",
    "print(inspect.getsource(distance))"
   ]
  },
@@ -247,8 +207,7 @@
   "source": [
    "# Edit operations\n",
    "\n",
-   "python-Levenshtein supports backtracing, i.e. giving a sequence of edit options that transforms a word to another word:\n",
-   "\n"
+   "python-Levenshtein + RapidFuzz support backtracing, i.e. giving a sequence of edit operations that transforms a word to another word:"
   ]
  },
  {
@@ -257,32 +216,20 @@
   "metadata": {},
   "outputs": [
    {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "[('insert', 5, 5), ('replace', 5, 6)]\n"
-    ]
+    "data": {
+     "text/plain": [
+      "[('replace', 2, 2)]"
+     ]
+    },
+    "execution_count": 9,
+    "metadata": {},
+    "output_type": "execute_result"
    }
   ],
   "source": [
-   "import Levenshtein\n",
-   "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
-   "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
-   "print(Levenshtein.editops(word1, word2))"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations."
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)"
+   "from rapidfuzz.string_metric import levenshtein_editops as editops\n",
+   "\n",
+   "editops('Foo', 'Fon')"
   ]
  },
  {
@@ -294,47 +241,12 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "def seq_editops(seq1, seq2):\n",
-     "    seq1 = list(seq1)\n",
-     "    seq2 = list(seq2)\n",
-     "    m = len(seq1)\n",
-     "    n = len(seq2)\n",
-     "    D = levenshtein_matrix(seq1, seq2)\n",
-     "\n",
-     "    def _tail_backtrace(i, j, accumulator):\n",
-     "        if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n",
-     "            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n",
-     "        if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n",
-     "            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n",
-     "        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n",
-     "            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n",
-     "        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n",
-     "            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP\n",
-     "        return accumulator\n",
-     "\n",
-     "    def backtrace(i, j):\n",
-     "        result = partial(_tail_backtrace, i, j, [])\n",
-     "        while isinstance(result, partial):\n",
-     "            result = result()\n",
-     "\n",
-     "        return result\n",
-     "\n",
-     "    b = backtrace(m, n)\n",
-     "    return b\n",
-     "\n",
-     "def editops(word1, word2):\n",
-     "    # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n",
-     "    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n",
-     "    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n",
-     "    return seq_editops(word1, word2)\n",
-     "\n"
+     "[('insert', 4, 4)]\n"
     ]
    }
   ],
   "source": [
-   "from edit_distance import seq_editops, editops\n",
-   "print(inspect.getsource(seq_editops))\n",
-   "print(inspect.getsource(editops))"
+   "print(editops('Käptn', 'Käpt\\'n'))"
   ]
  },
  {
@@ -343,18 +255,15 @@
   "metadata": {},
   "outputs": [
    {
-    "data": {
-     "text/plain": [
-      "[('replace', 2, 2)]"
-     ]
-    },
-    "execution_count": 11,
-    "metadata": {},
-    "output_type": "execute_result"
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "[('delete', 6, 6)]\n"
+    ]
    }
   ],
   "source": [
-   "editops('Foo', 'Fon')"
+   "print(editops('Delete something', 'Deletesomething'))"
   ]
  },
  {
@@ -366,14 +275,19 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "[('insert', 4, 4)]\n",
-     "[('insert', 4, 4)]\n"
+     "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
     ]
    }
   ],
   "source": [
-   "print(editops('Käptn', 'Käpt\\'n'))\n",
-   "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))"
+   "print(editops('A more difficult example', 'Amore difficült exampl'))"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Let's try it with a difficult example that needs grapheme cluster handling:"
   ]
  },
  {
@@ -382,17 +296,28 @@
   "metadata": {},
   "outputs": [
    {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "[('delete', 6, 6)]\n",
-     "[('delete', 6, 6)]\n"
-    ]
+    "data": {
+     "text/plain": [
+      "[('insert', 5, 5), ('replace', 5, 6)]"
+     ]
+    },
+    "execution_count": 13,
+    "metadata": {},
+    "output_type": "execute_result"
    }
   ],
   "source": [
-   "print(editops('Delete something', 'Deletesomething'))\n",
-   "print(Levenshtein.editops('Delete something', 'Deletesomething'))"
+   "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
+   "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
+   "\n",
+   "editops(word1, word2)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "That doesn't look right, let's redefine it with grapheme cluster support:"
   ]
  },
  {
@@ -404,28 +329,22 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n",
-     "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
+     "def editops(word1, word2):\n",
+     "    \"\"\"\n",
+     "    Return sequence of edit operations transforming one string to another.\n",
+     "\n",
+     "    Note that this returns indices to the _grapheme clusters_, not characters!\n",
+     "    \"\"\"\n",
+     "    word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n",
+     "    word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n",
+     "    return levenshtein_editops(word1, word2)\n",
+     "\n"
    ]
   }
  ],
   "source": [
-   "print(editops('A more difficult example', 'Amore difficült exampl'))\n",
-   "print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "XXX Note that our implementation returns different positions here for the 'insert'. "
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "Let's try it with a difficult example that needs grapheme cluster handling:"
+   "from qurator.dinglehopper.edit_distance import editops\n",
+   "print(inspect.getsource(editops))"
   ]
  },
  {
@@ -455,7 +374,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-   "🎉"
+   "🎉\n",
+   "\n",
+   "Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!"
  ]
  },
  {
@@ -489,22 +410,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "def character_error_rate(reference, compared):\n",
-     "    d = distance(reference, compared)\n",
-     "    if d == 0:\n",
-     "        return 0\n",
+     "def character_error_rate(reference, compared) -> float:\n",
+     "    \"\"\"\n",
+     "    Compute character error rate.\n",
      "\n",
-     "    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n",
-     "    if n == 0:\n",
-     "        return float('inf')\n",
-     "\n",
-     "    return d/n\n",
+     "    :return: character error rate\n",
+     "    \"\"\"\n",
+     "    cer, _ = character_error_rate_n(reference, compared)\n",
+     "    return cer\n",
      "\n"
    ]
   }
  ],
   "source": [
-   "from character_error_rate import character_error_rate\n",
+   "from qurator.dinglehopper.character_error_rate import character_error_rate\n",
    "print(inspect.getsource(character_error_rate))"
   ]
  },
@@ -732,16 +651,20 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "def words(s):\n",
+     "@multimethod\n",
+     "def words(s: str):\n",
+     "    \"\"\"Extract words from a string\"\"\"\n",
+     "\n",
      "    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
      "    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
      "    old_word_break = uniseg.wordbreak.word_break\n",
      "\n",
      "    def new_word_break(c, index=0):\n",
      "        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area\n",
-     "            return 'ALetter'\n",
+     "            return \"ALetter\"\n",
      "        else:\n",
      "            return old_word_break(c, index)\n",
+     "\n",
      "    uniseg.wordbreak.word_break = new_word_break\n",
      "\n",
      "    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
@@ -749,8 +672,8 @@
      "\n",
      "    # See https://www.fileformat.info/info/unicode/category/index.htm\n",
      "    # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
-     "    unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n",
-     "    unwanted_subcategories = 'Cc', 'Cf'\n",
+     "    unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n",
+     "    unwanted_subcategories = \"Cc\", \"Cf\"\n",
      "\n",
      "    subcat = unicodedata.category(c)\n",
      "    cat = subcat[0]\n",
@@ -778,7 +701,7 @@
   }
  ],
   "source": [
-   "from word_error_rate import words\n",
+   "from qurator.dinglehopper.word_error_rate import words\n",
    "print(inspect.getsource(words))\n",
    "\n",
    "list(words(example_text))"
@@ -905,29 +828,15 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "def word_error_rate(reference, compared):\n",
-     "    if isinstance(reference, str):\n",
-     "        reference_seq = list(words_normalized(reference))\n",
-     "        compared_seq = list(words_normalized(compared))\n",
-     "    else:\n",
-     "        reference_seq = list(reference)\n",
-     "        compared_seq = list(compared)\n",
-     "\n",
-     "    d = levenshtein(reference_seq, compared_seq)\n",
-     "    if d == 0:\n",
-     "        return 0\n",
-     "\n",
-     "    n = len(reference_seq)\n",
-     "    if n == 0:\n",
-     "        return float('inf')\n",
-     "\n",
-     "    return d / n\n",
+     "def word_error_rate(reference, compared) -> float:\n",
+     "    wer, _ = word_error_rate_n(reference, compared)\n",
+     "    return wer\n",
      "\n"
    ]
   }
  ],
   "source": [
-   "from word_error_rate import word_error_rate\n",
+   "from qurator.dinglehopper.word_error_rate import word_error_rate\n",
    "print(inspect.getsource(word_error_rate))"
   ]
  },
@@ -1002,9 +911,9 @@
  "metadata": {
   "hide_input": false,
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "dinglehopper-github",
    "language": "python",
-   "name": "python3"
+   "name": "dinglehopper-github"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1016,7 +925,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.12"
   },
   "toc": {
    "base_numbering": 1,
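Regarding the notebook's closing caveat that editops positions are grapheme cluster positions, not Python character indexes: cluster indices can be mapped back to string indices by summing cluster lengths. A hedged sketch with a hypothetical helper (not part of dinglehopper):

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    def cluster_to_char_index(s, cluster_index):
        # Hypothetical helper: string index where the given cluster starts.
        clusters = list(grapheme_clusters(unicodedata.normalize("NFC", s)))
        return sum(len(c) for c in clusters[:cluster_index])

    print(cluster_to_char_index('m̃abc', 1))  # 2: 'm̃' spans two code points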
@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string
 
 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear
 
 
 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                 local_filename=report_prefix + report_suffix,
             )
 
-            # Clear cache between files
-            levenshtein_matrix_cache_clear()
 
 if __name__ == "__main__":
     ocrd_dinglehopper()
@@ -2,27 +2,7 @@ from __future__ import division, print_function
 
 import unicodedata
 
-from .. import levenshtein, distance
+from .. import distance
 
 
-def test_levenshtein():
-    assert levenshtein("a", "a") == 0
-    assert levenshtein("a", "b") == 1
-    assert levenshtein("Foo", "Bar") == 3
-
-    assert levenshtein("", "") == 0
-    assert levenshtein("Foo", "") == 3
-    assert levenshtein("", "Foo") == 3
-
-    assert levenshtein("Foo", "Food") == 1
-    assert levenshtein("Fnord", "Food") == 2
-    assert levenshtein("Müll", "Mull") == 1
-    assert levenshtein("Abstand", "Sand") == 4
-
-
-def test_levenshtein_other_sequences():
-    assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
-    assert levenshtein(["a", "ab"], ["a", "c"]) == 1
-
-
 def test_distance():
@@ -1,48 +1,6 @@
 import unicodedata
 
-from .. import seq_editops, editops
+from .. import editops
 
 
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
-
-
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
-
-
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
-
-
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
-
-
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
-
-
 def test_editops():
@@ -6,7 +6,7 @@ from multimethod import multimethod
 
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from rapidfuzz.string_metric import levenshtein
 from . import ExtractedText
 
 
@@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
+rapidfuzz >= 1.8.1