From 89852314dc7cf57aa85574a613b008fee58d3226 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 10 Jun 2020 19:49:12 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20WIP=20data=20str?= =?UTF-8?q?ucture=20for=20extracted=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extracted_text.py | 3 +++ extracted_text_test.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/extracted_text.py b/extracted_text.py index a76f402..69d836b 100644 --- a/extracted_text.py +++ b/extracted_text.py @@ -2,6 +2,9 @@ import attr import unicodedata +# TODO handle grapheme cluster positions? + + @attr.s(frozen=True) class ExtractedText: segments = attr.ib() diff --git a/extracted_text_test.py b/extracted_text_test.py index 29fabfe..b302ca8 100644 --- a/extracted_text_test.py +++ b/extracted_text_test.py @@ -13,7 +13,5 @@ def test_text(): assert test1.segment_id_for_pos(3) is None assert test1.segment_id_for_pos(10) == 's2' -# TODO handle grapheme cluster positions? - # ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))