"""Data structure for text extracted from a document, split into segments.

Provenance: recovered from the patch that created ``extracted_text.py``
(commit a02e7dcbce9013a1ba6b376dfe0acbb58929c76d, "Gerber, Mike",
Wed, 10 Jun 2020 18:29:11 +0200, subject "dinglehopper: WIP data structure
for extracted text").
"""
import attr


@attr.s(frozen=True)
class ExtractedText:
    """An ordered collection of text segments joined by a separator string.

    Attributes are declared through ``attr.ib``; the class is frozen, so the
    attributes cannot be rebound after construction.
    """

    # Iterable of objects exposing ``.id`` and ``.text`` (see
    # ExtractedTextSegment). It is iterated on every access, so pass a
    # re-iterable container such as a list, not a one-shot generator.
    segments = attr.ib()
    # String placed between the texts of two consecutive segments.
    joiner = attr.ib(type=str)
    # XXX Use type annotations for attr types when support for Python 3.5 is dropped
    # NOTE: ``type=`` is metadata only; attrs does not check values against it.
    # Nothing is validated here because no ``validator=`` is passed.

    @property
    def text(self):
        """Return the texts of all segments concatenated with ``joiner``."""
        return self.joiner.join(s.text for s in self.segments)

    def segment_id_for_pos(self, pos):
        """Return the id of the segment that covers index ``pos`` of ``text``.

        Returns ``None`` when ``pos`` falls on a joiner character or lies
        outside the text (negative, or at/after the end).
        """
        start = 0
        for segment in self.segments:
            end = start + len(segment.text)
            if start <= pos < end:
                return segment.id
            # The joiner following this segment occupies [end, next_start);
            # with an empty joiner this range is empty and never matches.
            next_start = end + len(self.joiner)
            if end <= pos < next_start:
                return None
            start = next_start
        # pos is not covered by any segment or joiner.
        return None


@attr.s(frozen=True)
class ExtractedTextSegment:
    """A single piece of extracted text together with its identifier."""

    # Identifier reported by ExtractedText.segment_id_for_pos().
    id = attr.ib(type=str)
    # The text content of this segment.
    text = attr.ib(type=str)


# Sample instance used by the self-test below. It stays at module level so
# the set of names this module defines is unchanged.
test1 = ExtractedText([
    ExtractedTextSegment('s0', 'foo'),
    ExtractedTextSegment('s1', 'bar'),
    ExtractedTextSegment('s2', 'bazinga'),
], ' ')


def _self_test():
    """Run the module's sanity checks against ``test1``."""
    assert test1.text == 'foo bar bazinga'
    assert test1.segment_id_for_pos(0) == 's0'
    assert test1.segment_id_for_pos(3) is None
    assert test1.segment_id_for_pos(10) == 's2'


# Only run the checks when executed as a script, not on every import.
if __name__ == '__main__':
    _self_test()