From db6292611fb739baf20038ac0a7e63847bd6a96f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 7 Oct 2020 16:07:27 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Remove=20merged?= =?UTF-8?q?=20text=20extraction=20test=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 51 ------------------------------------------ extracted_text_test.py | 22 ------------------ 2 files changed, 73 deletions(-) delete mode 100644 extracted_text.py delete mode 100644 extracted_text_test.py diff --git a/extracted_text.py b/extracted_text.py deleted file mode 100644 index c84c77b..0000000 --- a/extracted_text.py +++ /dev/null @@ -1,51 +0,0 @@ -import attr -import unicodedata -import enum - - -# TODO handle grapheme cluster positions? -# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped -# TODO types are not validated (attr does not do this yet) - - -@attr.s(frozen=True) -class ExtractedText: - segments = attr.ib() - joiner = attr.ib(type=str) - - @property - def text(self): - return self.joiner.join(s.text for s in self.segments) - - def segment_id_for_pos(self, pos): - i = 0 - for s in self.segments: - if i <= pos < i + len(s.text): - return s.id - i += len(s.text) - if i <= pos < i + len(self.joiner): - return None - i += len(self.joiner) - - -class Normalization(enum.Enum): - NFC = 1 - NFC_MUFI = 2 - - -def normalize(text, normalization): - if normalization == Normalization.NFC: - return unicodedata.normalize('NFC', text) - else: - raise ValueError() - - -@attr.s(frozen=True) -class ExtractedTextSegment: - id = attr.ib(type=str) - text = attr.ib(type=str) - @text.validator - def check(self, attribute, value): - if normalize(value, self.normalization) != value: - raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py deleted file mode 100644 index 4919a76..0000000 --- a/extracted_text_test.py +++ /dev/null @@ -1,22 +0,0 @@ -import unicodedata -import pytest -from extracted_text import ExtractedText, ExtractedTextSegment - - -def test_text(): - test1 = ExtractedText([ - ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), - ExtractedTextSegment('s2', 'bazinga') - ], ' ') - - assert test1.text == 'foo bar bazinga' - assert test1.segment_id_for_pos(0) == 's0' - assert test1.segment_id_for_pos(3) is None - assert test1.segment_id_for_pos(10) == 's2' - - -def test_normalization_check(): - with pytest.raises(ValueError, match=r'.*is not normalized.*'): - ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) - assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))