Add tempcache for Levenshtein matrix and reformat code.

2025-08-03 14:49:57 +02:00 · 2019-10-31 12:14:05 +01:00 · 2019-10-31 12:14:05 +01:00 · 077396bb56
commit 077396bb56
parent fb89c8f571
2 changed files with 37 additions and 12 deletions
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -1,4 +1,4 @@
-import os
+import shutil
 import click
 from jinja2 import Environment, FileSystemLoader
@ -40,6 +40,12 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
        </div>
        '''.format(gtx, ocrx)
 def delete_temp():
     """Remove dinglehopper's temporary cache directory, if it exists.

     The directory is ``<system temp dir>/dinglehopper``, the location where
     ``levenshtein_matrix`` stores its cached ``.npy`` matrices.  A missing
     directory is not an error; any other failure to delete still raises.
     """
     # Imported locally so the function does not rely on cli.py's header
     # imports (``import os`` is removed from that header in this change).
     import os
     import shutil
     import tempfile
     # XXX Delete all np-tempfiles?
     cache_dir = os.path.normpath(
         os.path.join(tempfile.gettempdir(), "dinglehopper"))
     try:
         shutil.rmtree(cache_dir)
     except FileNotFoundError:
         # Nothing was cached, or a concurrent run already removed the
         # directory between our call and the actual deletion.
         pass
 def process(gt, ocr, report_prefix):
    """Check OCR result against GT.
@ -77,6 +83,7 @@ def process(gt, ocr, report_prefix):
            word_diff_report=word_diff_report
        ).dump(out_fn)
    delete_temp()
@click.command()
@click.argument('gt', type=click.Path(exists=True))
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -4,18 +4,32 @@ import unicodedata
 from functools import partial
 import numpy as np
 import hashlib
 import os
 import tempfile
 from uniseg.graphemecluster import grapheme_clusters
-def levenshtein_matrix(seq1, seq2):
+def levenshtein_matrix(seq1, seq2, tempcache=True):
    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
+    The first algorithm is based on the hypothesis that the set of individual graphemes is smaller than
-    edit distance.
+    the length of the grapheme cluster array.
    The second algorithm is also known as the Wagner-Fischer algorithm.
    The matrix element at the bottom right contains the desired edit distance.
    This algorithm is implemented here because we need an implementation that can work with sequences other than
    strings, e.g. lists of grapheme clusters or lists of word strings.
    """
    if tempcache:
        hashname = hashlib.sha1(("".join(seq1) + "".join(seq2)).encode("utf-8")).hexdigest()
        tempdir = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")
        if not os.path.exists(tempdir):
            os.makedirs(os.path.normpath(tempfile.gettempdir() + "/dinglehopper/"))
        tempath = os.path.normpath(tempdir +"/"+hashname+".npy")
        if os.path.exists(tempath):
            return np.load(tempath)
    m = len(seq1)
    n = len(seq2)
@ -40,20 +54,24 @@ def levenshtein_matrix(seq1, seq2):
            if seq1[row] in interset:
                mask = masks[grapheme]
                for col in range(0,n):
-                    D[row + 1, col + 1] = min(D[row, col] + mask[col], D[row + 1, col], D[row, col + 1])+1
+                    D[row + 1, col + 1] = min(D[row, col] + mask[col], # same or substitution
                                              D[row + 1, col], # insertion
                                              D[row, col + 1])+1 # deletion
            else:
                for col in range(0,n):
-                    D[row+1,col+1] = min(D[row,col],D[row+1,col],D[row,col+1])+1
+                    D[row+1,col+1] = min(D[row,col], # same or substitution
-
+                                         D[row+1,col], # insertion
                                         D[row,col+1])+1 # deletion
    else:
        for i in range(1, m+1):
            for j in range(1, n+1):
-                E[i, j] = min(
+                D[i, j] = min(
-                    E[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
+                    D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                    E[i, j - 1] + 1,  # Insertion
+                    D[i, j - 1] + 1,  # Insertion
-                    E[i - 1, j] + 1   # Deletion
+                    D[i - 1, j] + 1   # Deletion
                )
-
+    if tempcache:
        np.save(tempath,D)
    return D
 def levenshtein(seq1, seq2):