🎨 dinglehopper: Expose clearing the Levenshtein cache as a function

2025-07-18 23:09:54 +02:00 · 2019-11-20 13:24:45 +01:00 · 2019-11-20 13:24:45 +01:00 · ced6504ad0
commit ced6504ad0
parent 5cf4eddaeb
2 changed files with 13 additions and 4 deletions
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -22,11 +22,11 @@ def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
    return _levenshtein_matrix(tuple(seq1), tuple(seq2))


-@lru_cache()
+@lru_cache(maxsize=10)
 def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
    """Compute the matrix commonly computed to produce the Levenshtein distance.

-    This is a LRU cached function not meant to be used directly. Use levensthein_matrix() instead.
+    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
    """
    m = len(seq1)
    n = len(seq2)
@@ -60,6 +60,15 @@ def levenshtein(seq1, seq2):
    return D[m, n]


+def levenshtein_matrix_cache_clear():
+    """Clear internal Levenshtein matrix cache.
+
+    You want to do this between different input file pairs to decrease memory
+    usage by not caching results from prior input files.
+    """
+    _levenshtein_matrix.cache_clear()
+
+
 def distance(s1, s2):
    """Compute the Levenshtein edit distance between two Unicode strings

--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -8,7 +8,7 @@ from ocrd_utils import concat_padded, getLogger
 from pkg_resources import resource_string

 from qurator.dinglehopper.cli import process as cli_process
-from qurator.dinglehopper.edit_distance import _levenshtein_matrix
+from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear

 log = getLogger('processor.OcrdDinglehopperEvaluate')

@@ -64,7 +64,7 @@ class OcrdDinglehopperEvaluate(Processor):
                     local_filename=report_prefix + report_suffix)

            # Clear cache between files
-            _levenshtein_matrix.cache_clear()
+            levenshtein_matrix_cache_clear()


 if __name__ == '__main__':