From 077396bb56c85eed243d583a4f2a32a7d11d699d Mon Sep 17 00:00:00 2001 From: JKamlah <> Date: Thu, 31 Oct 2019 12:14:05 +0100 Subject: [PATCH] ADD tempcache for Levenshtein matrix and reformat code. --- qurator/dinglehopper/cli.py | 9 +++++- qurator/dinglehopper/edit_distance.py | 40 +++++++++++++++++++-------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 5d7f43b..3c56c76 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -1,4 +1,4 @@ -import os +import shutil import click from jinja2 import Environment, FileSystemLoader @@ -40,6 +40,12 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): '''.format(gtx, ocrx) +def delete_temp(): + # XXX Delete all np-tempfiles? + tempath = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/") + if os.path.exists(tempath): + shutil.rmtree(os.path.normpath(tempath)) + def process(gt, ocr, report_prefix): """Check OCR result against GT. @@ -77,6 +83,7 @@ def process(gt, ocr, report_prefix): word_diff_report=word_diff_report ).dump(out_fn) + delete_temp() @click.command() @click.argument('gt', type=click.Path(exists=True)) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 9025c9d..1b8a80d 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -4,18 +4,32 @@ import unicodedata from functools import partial import numpy as np +import hashlib +import os +import tempfile from uniseg.graphemecluster import grapheme_clusters -def levenshtein_matrix(seq1, seq2): +def levenshtein_matrix(seq1, seq2, tempcache=True): """Compute the matrix commonly computed to produce the Levenshtein distance. - This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired - edit distance. 
+ The first algorithm is based on the hypothesis that the set of individual graphemes is smaller than + the length of the grapheme cluster array. + + The second algorithm is also known as the Wagner-Fischer algorithm. + The matrix element at the bottom right contains the desired edit distance. This algorithm is implemented here because we need an implementation that can work with sequences other than strings, e.g. lists of grapheme clusters or lists of word strings. """ + if tempcache: + hashname = hashlib.sha1(("".join(seq1) + "".join(seq2)).encode("utf-8")).hexdigest() + tempdir = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/") + if not os.path.exists(tempdir): + os.makedirs(os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")) + tempath = os.path.normpath(tempdir +"/"+hashname+".npy") + if os.path.exists(tempath): + return np.load(tempath) m = len(seq1) n = len(seq2) @@ -40,20 +54,24 @@ def levenshtein_matrix(seq1, seq2): if seq1[row] in interset: mask = masks[grapheme] for col in range(0,n): - D[row + 1, col + 1] = min(D[row, col] + mask[col], D[row + 1, col], D[row, col + 1])+1 + D[row + 1, col + 1] = min(D[row, col] + mask[col], # same or substitution + D[row + 1, col], # insertion + D[row, col + 1])+1 # deletion else: for col in range(0,n): - D[row+1,col+1] = min(D[row,col],D[row+1,col],D[row,col+1])+1 - + D[row+1,col+1] = min(D[row,col], # same or substitution + D[row+1,col], # insertion + D[row,col+1])+1 # deletion else: for i in range(1, m+1): for j in range(1, n+1): - E[i, j] = min( - E[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution - E[i, j - 1] + 1, # Insertion - E[i - 1, j] + 1 # Deletion + D[i, j] = min( + D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution + D[i, j - 1] + 1, # Insertion + D[i - 1, j] + 1 # Deletion ) - + if tempcache: + np.save(tempath,D) return D def levenshtein(seq1, seq2):