diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index fec2bca..1fd5bb5 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -22,11 +22,11 @@ def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
     return _levenshtein_matrix(tuple(seq1), tuple(seq2))
 
 
-@lru_cache()
+@lru_cache(maxsize=10)
 def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
     """Compute the matrix commonly computed to produce the Levenshtein distance.
 
-    This is a LRU cached function not meant to be used directly. Use levensthein_matrix() instead.
+    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
     """
     m = len(seq1)
     n = len(seq2)
@@ -60,6 +60,15 @@ def levenshtein(seq1, seq2):
     return D[m, n]
 
 
+def levenshtein_matrix_cache_clear():
+    """Clear internal Levenshtein matrix cache.
+
+    You want to do this between different input file pairs to decrease memory
+    usage by not caching results from prior input files.
+    """
+    _levenshtein_matrix.cache_clear()
+
+
 def distance(s1, s2):
     """Compute the Levenshtein edit distance between two Unicode strings
 
diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py
index ee44e2f..8ab5cf2 100644
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -8,7 +8,7 @@ from ocrd_utils import concat_padded, getLogger
 from pkg_resources import resource_string
 
 from qurator.dinglehopper.cli import process as cli_process
-from qurator.dinglehopper.edit_distance import _levenshtein_matrix
+from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear
 
 log = getLogger('processor.OcrdDinglehopperEvaluate')
 
@@ -64,7 +64,7 @@ class OcrdDinglehopperEvaluate(Processor):
                 local_filename=report_prefix + report_suffix)
 
             # Clear cache between files
-            _levenshtein_matrix.cache_clear()
+            levenshtein_matrix_cache_clear()
 
 
 if __name__ == '__main__':