From 93608ba69735ac6780cfe79ec26477b848350a35 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:40:57 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data=20str?= =?UTF-8?q?ucture=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 16 ---------------- extracted_text_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py index f99c8ac..a76f402 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -43,19 +43,3 @@ class ExtractedTextSegment: if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) normalization = attr.ib(default=NORM_NFC) - - -test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') -], ' ') - - -assert test1.text == 'foo bar bazinga' -assert test1.segment_id_for_pos(0) == 's0' -assert test1.segment_id_for_pos(3) == None -assert test1.segment_id_for_pos(10) == 's2' - -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) diff --git a/extracted_text_test.py b/extracted_text_test.py new file mode 100644 index 0000000..29fabfe --- /dev/null +++ b/extracted_text_test.py @@ -0,0 +1,19 @@ +from extracted_text import * + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + +# TODO handle grapheme cluster positions? + +# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) +ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))