From bc630233d0d15aff7dabe8bcda83fe23e766a200 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Wed, 10 Jun 2020 19:36:49 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data=20str?=
 =?UTF-8?q?ucture=20for=20extracted=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 extracted_text.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/extracted_text.py b/extracted_text.py
index d1dc6f0..f99c8ac 100644
--- a/extracted_text.py
+++ b/extracted_text.py
@@ -1,4 +1,5 @@
 import attr
+import unicodedata
 
 
 @attr.s(frozen=True)
@@ -23,10 +24,33 @@ class ExtractedText:
             i += len(self.joiner)
 
 
+# Identifier for Unicode Normalization Form C (canonical composition).
+NORM_NFC = 0
+
+
+def normalize(text, normalization):
+    """Return text normalized according to the given normalization constant.
+
+    Only NORM_NFC is supported; any other value raises ValueError.
+    """
+    if normalization == NORM_NFC:
+        return unicodedata.normalize('NFC', text)
+    raise ValueError('Unsupported normalization: {!r}'.format(normalization))
+
+
 @attr.s(frozen=True)
 class ExtractedTextSegment:
     id = attr.ib(type=str)
     text = attr.ib(type=str)
+
+    @text.validator
+    def check(self, attribute, value):
+        # attrs runs validators only after every attribute has been set, so
+        # self.normalization is available although it is declared below.
+        if normalize(value, self.normalization) != value:
+            raise ValueError('String "{}" is not normalized.'.format(value))
+
+    normalization = attr.ib(default=NORM_NFC)
 
 
 test1 = ExtractedText([
@@ -40,3 +64,7 @@ assert test1.text == 'foo bar bazinga'
 assert test1.segment_id_for_pos(0) == 's0'
 assert test1.segment_id_for_pos(3) == None
 assert test1.segment_id_for_pos(10) == 's2'
+
+# NFC text passes validation; the NFD variant below would raise ValueError.
+# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
+ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))