diff --git a/extracted_text.py b/extracted_text.py
index d1dc6f0..f99c8ac 100644
--- a/extracted_text.py
+++ b/extracted_text.py
@@ -1,4 +1,5 @@
 import attr
+import unicodedata
 
 
 @attr.s(frozen=True)
@@ -23,10 +24,32 @@ class ExtractedText:
             i += len(self.joiner)
 
 
+NORM_NFC = 0
+
+
+def normalize(text, normalization):
+    """Return `text` normalized according to `normalization`.
+
+    Only NORM_NFC (Unicode NFC) is supported; any other value raises
+    ValueError.
+    """
+    if normalization == NORM_NFC:
+        return unicodedata.normalize('NFC', text)
+    raise ValueError('Unknown normalization: {!r}'.format(normalization))
+
+
 @attr.s(frozen=True)
 class ExtractedTextSegment:
     id = attr.ib(type=str)
     text = attr.ib(type=str)
+
+    @text.validator
+    def check(self, attribute, value):
+        """Reject text that is not already in the segment's normalization."""
+        if normalize(value, self.normalization) != value:
+            raise ValueError('String "{}" is not normalized.'.format(value))
+
+    normalization = attr.ib(default=NORM_NFC)
 
 
 test1 = ExtractedText([
@@ -40,3 +63,12 @@ assert test1.text == 'foo bar bazinga'
 assert test1.segment_id_for_pos(0) == 's0'
 assert test1.segment_id_for_pos(3) == None
 assert test1.segment_id_for_pos(10) == 's2'
+
+# Text in decomposed (NFD) form must be rejected by the validator.
+try:
+    ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
+except ValueError:
+    pass
+else:
+    raise AssertionError('NFD text was accepted')
+ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))