diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 29826e3..9f5fda0 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: """ if isinstance(reference, str): return character_error_rate_n( - ExtractedText.from_text(reference), + ExtractedText.from_str(reference), compared) d = distance(reference, compared) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index bc607a9..88d3127 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -77,14 +77,16 @@ def distance(s1, s2): clusters. This should be the correct way to compare two Unicode strings. """ - if isinstance(s1, ExtractedText): - s1 = s1.text - if isinstance(s2, ExtractedText): - s2 = s2.text - - s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) - s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) - return levenshtein(s1, s2) + # XXX Implicit normalization + if isinstance(s1, str): + s1 = ExtractedText.from_str(s1) + if isinstance(s2, str): + s2 = ExtractedText.from_str(s2) + # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC + + seq1 = list(grapheme_clusters(s1.text)) + seq2 = list(grapheme_clusters(s2.text)) + return levenshtein(seq1, seq2) def seq_editops(seq1, seq2): diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 2b8b0de..5824dda 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -23,6 +23,12 @@ class Normalization(enum.Enum): @attr.s(frozen=True) class ExtractedText: + """ + Extracted text + + Objects of this class are guaranteed to be a. always in their normalization and + b. in NFC. + """ segment_id = attr.ib(type=Optional[str]) @segment_id.validator @@ -48,6 +54,8 @@ class ExtractedText: @_text.validator def check(self, _, value): + if value is not None and unicodedata.normalize('NFC', value) != value: + raise ValueError('String "{}" is not in NFC.'.format(value)) if value is not None and normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) @@ -93,9 +101,9 @@ class ExtractedText: return cls(segment_id, None, None, segment_text) @classmethod - def from_text(cls, text): - return cls(None, None, None, text) - + def from_str(cls, text, normalization=Normalization.NFC_SBB): + normalized_text = normalize(text, normalization) + return cls(None, None, None, normalized_text, normalization=normalization) def normalize(text, normalization): @@ -138,7 +146,7 @@ def alto_extract(tree): return ExtractedText( None, - (ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines), + (ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines), '\n', None ) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8cac4c1..ef2776c 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -20,7 +20,7 @@ def test_text(): def test_normalization_check(): - with pytest.raises(ValueError, match=r'.*is not normalized.*'): + with pytest.raises(ValueError, match=r'.*is not in NFC.*'): ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))