mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
FIX naming, spacing and deletion of tempcache files.
This commit is contained in:
parent
c9cfdc59ae
commit
d458568ebd
2 changed files with 33 additions and 27 deletions
|
@ -1,6 +1,8 @@
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
|
||||||
|
@ -40,11 +42,15 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
||||||
</div>
|
</div>
|
||||||
'''.format(gtx, ocrx)
|
'''.format(gtx, ocrx)
|
||||||
|
|
||||||
def delete_temp():
|
def delete_tempcache():
|
||||||
# XXX Delete all np-tempfiles?
|
# Delete all tempfiles and the directory (if empty)
|
||||||
tempath = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")
|
tempdir = tempfile.gettempdir() + "/dinglehopper/"
|
||||||
if os.path.exists(tempath):
|
if os.path.exists(tempdir):
|
||||||
shutil.rmtree(os.path.normpath(tempath))
|
tempfiles = glob.glob(tempdir+"*.np*")
|
||||||
|
for tempfilename in tempfiles:
|
||||||
|
os.remove(tempfilename)
|
||||||
|
if not os.listdir(tempdir):
|
||||||
|
shutil.rmtree(os.path.normpath(tempdir))
|
||||||
|
|
||||||
|
|
||||||
def process(gt, ocr, report_prefix):
|
def process(gt, ocr, report_prefix):
|
||||||
|
@ -83,7 +89,7 @@ def process(gt, ocr, report_prefix):
|
||||||
word_diff_report=word_diff_report
|
word_diff_report=word_diff_report
|
||||||
).dump(out_fn)
|
).dump(out_fn)
|
||||||
|
|
||||||
delete_temp()
|
delete_tempcache()
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument('gt', type=click.Path(exists=True))
|
@click.argument('gt', type=click.Path(exists=True))
|
||||||
|
|
|
@ -26,19 +26,19 @@ def levenshtein_matrix(seq1, seq2, tempcache=True):
|
||||||
hashname = hashlib.sha1(("".join(seq1) + "".join(seq2)).encode("utf-8")).hexdigest()
|
hashname = hashlib.sha1(("".join(seq1) + "".join(seq2)).encode("utf-8")).hexdigest()
|
||||||
tempdir = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")
|
tempdir = os.path.normpath(tempfile.gettempdir() + "/dinglehopper/")
|
||||||
if not os.path.exists(tempdir):
|
if not os.path.exists(tempdir):
|
||||||
os.makedirs(os.path.normpath(tempfile.gettempdir() + "/dinglehopper/"))
|
os.makedirs(tempdir + "/dinglehopper/")
|
||||||
tempath = os.path.normpath(tempdir +"/"+hashname+".npy")
|
tempfilename = os.path.normpath(tempdir + "/" + hashname + ".npy")
|
||||||
if os.path.exists(tempath):
|
if os.path.exists(tempfilename):
|
||||||
return np.load(tempath)
|
return np.load(tempfilename)
|
||||||
|
|
||||||
m = len(seq1)
|
m = len(seq1)
|
||||||
n = len(seq2)
|
n = len(seq2)
|
||||||
|
|
||||||
D = np.ones((m + 1, n + 1), np.int)
|
D = np.zeros((m + 1, n + 1), np.int)
|
||||||
D[:,0] = np.arange(m+1)
|
D[:, 0] = np.arange(m+1)
|
||||||
D[0,:] = np.arange(n+1)
|
D[0, :] = np.arange(n+1)
|
||||||
|
|
||||||
if m > 10 and n > 10:
|
if m > 26 and n > 26:
|
||||||
# All grapheme which occur in both sets
|
# All grapheme which occur in both sets
|
||||||
interset = set(seq1).intersection(set(seq2))
|
interset = set(seq1).intersection(set(seq2))
|
||||||
|
|
||||||
|
@ -51,27 +51,27 @@ def levenshtein_matrix(seq1, seq2, tempcache=True):
|
||||||
|
|
||||||
# Calculate the Levenshtein matrix
|
# Calculate the Levenshtein matrix
|
||||||
for row, grapheme in enumerate(seq1):
|
for row, grapheme in enumerate(seq1):
|
||||||
if seq1[row] in interset:
|
if grapheme in interset:
|
||||||
mask = masks[grapheme]
|
mask = masks[grapheme]
|
||||||
for col in range(0,n):
|
for col in range(0,n):
|
||||||
D[row + 1, col + 1] = min(D[row, col] + mask[col], # same or substitution
|
D[row + 1, col + 1] = 1 + min(D[row, col] + mask[col], # same or substitution
|
||||||
D[row + 1, col], # insertion
|
D[row + 1, col], # insertion
|
||||||
D[row, col + 1])+1 # deletion
|
D[row, col + 1]) # deletion
|
||||||
else:
|
else:
|
||||||
for col in range(0,n):
|
for col in range(0,n):
|
||||||
D[row+1,col+1] = min(D[row,col], # same or substitution
|
D[row+1, col+1] = 1 + min(D[row, col], # substitution
|
||||||
D[row+1,col], # insertion
|
D[row+1, col], # insertion
|
||||||
D[row,col+1])+1 # deletion
|
D[row, col+1]) # deletion
|
||||||
else:
|
else:
|
||||||
for i in range(1, m+1):
|
for row in range(1, m+1):
|
||||||
for j in range(1, n+1):
|
for col in range(1, n+1):
|
||||||
D[i, j] = min(
|
D[row, col] = min(
|
||||||
D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
|
D[row - 1, col - 1] + 1 * (seq1[row - 1] != seq2[col - 1]), # Same or Substitution
|
||||||
D[i, j - 1] + 1, # Insertion
|
D[row, col - 1] + 1, # Insertion
|
||||||
D[i - 1, j] + 1 # Deletion
|
D[row - 1, col] + 1 # Deletion
|
||||||
)
|
)
|
||||||
if tempcache:
|
if tempcache:
|
||||||
np.save(tempath,D)
|
np.save(tempfilename,D)
|
||||||
return D
|
return D
|
||||||
|
|
||||||
def levenshtein(seq1, seq2):
|
def levenshtein(seq1, seq2):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue