From 91371971ebc65787c0e4a1455e69240e04cf3e05 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 20:29:01 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data=20str?= =?UTF-8?q?ucture=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 9 ++++++--- extracted_text_test.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index 69d836b..23cd519 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -1,5 +1,6 @@ import attr import unicodedata +import enum # TODO handle grapheme cluster positions? @@ -27,11 +28,13 @@ class ExtractedText: i += len(self.joiner) -NORM_NFC = 0 +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 def normalize(text, normalization): - if normalization == NORM_NFC: + if normalization == Normalization.NFC: return unicodedata.normalize('NFC', text) else: raise ValueError() @@ -45,4 +48,4 @@ class ExtractedTextSegment: def check(self, attribute, value): if normalize(value, self.normalization) != value: raise ValueError('String "{}" is not normalized.'.format(value)) - normalization = attr.ib(default=NORM_NFC) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC) diff --git a/extracted_text_test.py b/extracted_text_test.py index b302ca8..2e916cd 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -1,17 +1,22 @@ -from extracted_text import * +import unicodedata +import pytest +from extracted_text import ExtractedText, ExtractedTextSegment + def test_text(): test1 = ExtractedText([ ExtractedTextSegment('s0', 'foo'), - ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment(1, 'bar'), ExtractedTextSegment('s2', 'bazinga') ], ' ') - assert test1.text == 'foo bar bazinga' assert test1.segment_id_for_pos(0) == 's0' assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))