🚧 dinglehopper: Guarantee NFC + rename from_text → from_str

2025-08-03 14:49:57 +02:00 · 2020-10-08 11:25:01 +02:00 · 2020-10-08 11:25:01 +02:00 · a17ee2afec
commit a17ee2afec
parent 7843824eaf
5 changed files with 29 additions and 13 deletions
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    """
    if isinstance(reference, str):
        return character_error_rate_n(
-                ExtractedText.from_text(reference),
+                ExtractedText.from_str(reference),
                compared)
    d = distance(reference, compared)
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -77,14 +77,16 @@ def distance(s1, s2):
    clusters. This should be the correct way to compare two Unicode strings.
    """
-    if isinstance(s1, ExtractedText):
+    # XXX Implicit normalization
-        s1 = s1.text
+    if isinstance(s1, str):
-    if isinstance(s2, ExtractedText):
+        s1 = ExtractedText.from_str(s1)
-        s2 = s2.text
+    if isinstance(s2, str):
+        s2 = ExtractedText.from_str(s2)
+    # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
-    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
+    seq1 = list(grapheme_clusters(s1.text))
-    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
+    seq2 = list(grapheme_clusters(s2.text))
-    return levenshtein(s1, s2)
+    return levenshtein(seq1, seq2)
 def seq_editops(seq1, seq2):
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -23,6 +23,12 @@ class Normalization(enum.Enum):
@attr.s(frozen=True)
 class ExtractedText:
+    """
+    Extracted text
+    Objects of this class are guaranteed to be a. always in their normalization and
+    b. in NFC.
+    """
    segment_id = attr.ib(type=Optional[str])
    @segment_id.validator
@@ -48,6 +54,8 @@ class ExtractedText:
    @_text.validator
    def check(self, _, value):
+        if value is not None and unicodedata.normalize('NFC', value) != value:
+            raise ValueError('String "{}" is not in NFC.'.format(value))
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))
@@ -93,9 +101,9 @@ class ExtractedText:
        return cls(segment_id, None, None, segment_text)
    @classmethod
-    def from_text(cls, text):
+    def from_str(cls, text, normalization=Normalization.NFC_SBB):
-        return cls(None, None, None, text)
+        normalized_text = normalize(text, normalization)
-
+        return cls(None, None, None, normalized_text, normalization=normalization)
 def normalize(text, normalization):
@@ -138,7 +146,7 @@ def alto_extract(tree):
    return ExtractedText(
            None,
-            (ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines),
+            (ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines),
            '\n',
            None
    )
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -20,7 +20,7 @@ def test_text():
 def test_normalization_check():
-    with pytest.raises(ValueError, match=r'.*is not normalized.*'):
+    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))