From d458568ebd6df5f88397cb29be29dc3dd98a186f Mon Sep 17 00:00:00 2001 From: JKamlah <> Date: Mon, 4 Nov 2019 11:22:06 +0100 Subject: [PATCH] FIX naming, spacing and deletion of tempcachfiles. --- qurator/dinglehopper/cli.py | 18 ++++++++---- qurator/dinglehopper/edit_distance.py | 42 +++++++++++++-------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 3c56c76..31e582b 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -1,6 +1,8 @@ import shutil import click +import os +import glob from jinja2 import Environment, FileSystemLoader @@ -40,11 +42,15 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): '''.format(gtx, ocrx) -def delete_temp(): - # XXX Delete all np-tempfiles? - tempath = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/") - if os.path.exists(tempath): - shutil.rmtree(os.path.normpath(tempath)) +def delete_tempcache(): + # Delete all tempfiles and the directory (if empty) + tempdir = tempfile.gettempdir() + "/dinglehopper/" + if os.path.exists(tempdir): + tempfiles = glob.glob(tempdir+"*.np*") + for tempfilename in tempfiles: + os.remove(tempfilename) + if not os.listdir(tempdir): + shutil.rmtree(os.path.normpath(tempdir)) def process(gt, ocr, report_prefix): @@ -83,7 +89,7 @@ def process(gt, ocr, report_prefix): word_diff_report=word_diff_report ).dump(out_fn) - delete_temp() + delete_tempcache() @click.command() @click.argument('gt', type=click.Path(exists=True)) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 1b8a80d..6f7c286 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -26,19 +26,19 @@ def levenshtein_matrix(seq1, seq2, tempcache=True): hashname = hashlib.sha1(("".join(seq1) + "".join(seq2)).encode("utf-8")).hexdigest() tempdir = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/") if not os.path.exists(tempdir): - os.makedirs(os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")) - tempath = os.path.normpath(tempdir +"/"+hashname+".npy") - if os.path.exists(tempath): - return np.load(tempath) + os.makedirs(tempdir + "/dinglehopper/") + tempfilename = os.path.normpath(tempdir + "/" + hashname + ".npy") + if os.path.exists(tempfilename): + return np.load(tempfilename) m = len(seq1) n = len(seq2) - D = np.ones((m + 1, n + 1), np.int) - D[:,0] = np.arange(m+1) - D[0,:] = np.arange(n+1) + D = np.zeros((m + 1, n + 1), np.int) + D[:, 0] = np.arange(m+1) + D[0, :] = np.arange(n+1) - if m > 10 and n > 10: + if m > 26 and n > 26: # All grapheme which occur in both sets interset = set(seq1).intersection(set(seq2)) @@ -51,27 +51,27 @@ def levenshtein_matrix(seq1, seq2, tempcache=True): # Calculate the levensthein matrix for row, grapheme in enumerate(seq1): - if seq1[row] in interset: + if grapheme in interset: mask = masks[grapheme] for col in range(0,n): - D[row + 1, col + 1] = min(D[row, col] + mask[col], # same or substitution + D[row + 1, col + 1] = 1 + min(D[row, col] + mask[col], # same or subsitution D[row + 1, col], # insertion - D[row, col + 1])+1 # deletion + D[row, col + 1]) # deletion else: for col in range(0,n): - D[row+1,col+1] = min(D[row,col], # same or substitution - D[row+1,col], # insertion - D[row,col+1])+1 # deletion + D[row+1, col+1] = 1 + min(D[row, col], # substitution + D[row+1, col], # insertion + D[row, col+1]) # deletion else: - for i in range(1, m+1): - for j in range(1, n+1): - D[i, j] = min( - D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution - D[i, j - 1] + 1, # Insertion - D[i - 1, j] + 1 # Deletion + for row in range(1, m+1): + for col in range(1, n+1): + D[row, col] = min( + D[row - 1, col - 1] + 1 * (seq1[row - 1] != seq2[col - 1]), # Same or Substitution + D[row, col - 1] + 1, # Insertion + D[row - 1, col] + 1 # Deletion ) if tempcache: - np.save(tempath,D) + np.save(tempfilename,D) return D def levenshtein(seq1, seq2):