From b23b75b6017e6693f820d0fc2d43bbf7af4337e5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 21 Oct 2020 16:04:25 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20dinglehopper:=20Give=20segment=20id?= =?UTF-8?q?s=20from=20the=20extracted=20textequiv=5Flevel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/extracted_text.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 56af085..c039000 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -157,11 +157,17 @@ class ExtractedText: def segment_id_for_pos(self, pos): # Calculate segment ids once, on the first call if not self._segment_id_for_pos: - segment_id_for_pos = [] - for s in self.segments: - segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) - segment_id_for_pos.extend(repeat(None, len(self.joiner))) - segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] + if self._text is not None: + segment_id_for_pos = list(repeat(self.segment_id, len(self._text))) + else: + # Recurse + segment_id_for_pos = [] + for s in self.segments: + seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))] + segment_id_for_pos.extend(seg_ids) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)] + # This is frozen, so we have to jump through the hoop: object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) assert self._segment_id_for_pos