🎨 dinglehopper: Use multimethod to handle str vs ExtractedText

2025-07-03 07:29:59 +02:00 · 2020-10-08 12:15:58 +02:00 · 2020-10-08 12:15:58 +02:00 · b14c35e147
commit b14c35e147
parent a17ee2afec
4 changed files with 55 additions and 43 deletions
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -5,6 +5,7 @@ from functools import partial, lru_cache
 from typing import Sequence, Tuple

 import numpy as np
+from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters

 from .ocr_files import ExtractedText
@@ -70,25 +71,23 @@ def levenshtein_matrix_cache_clear():
    _levenshtein_matrix.cache_clear()


-def distance(s1, s2):
+@multimethod
+def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
    clusters. This should be the correct way to compare two Unicode strings.
    """
-
-    # XXX Implicit normalization
-    if isinstance(s1, str):
-        s1 = ExtractedText.from_str(s1)
-    if isinstance(s2, str):
-        s2 = ExtractedText.from_str(s2)
-    # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
-
-    seq1 = list(grapheme_clusters(s1.text))
-    seq2 = list(grapheme_clusters(s2.text))
+    seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
    return levenshtein(seq1, seq2)


+@multimethod
+def distance(s1: ExtractedText, s2: ExtractedText):
+    return distance(s1.text, s2.text)
+
+
 def seq_editops(seq1, seq2):
    """
    Return sequence of edit operations transforming one sequence to another.