From c010a7f05e4a8e0909006036c91e42629e0713be Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Fri, 12 Jun 2020 18:06:42 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=B9=20dinglehopper:=20Calculate=20segm?=
 =?UTF-8?q?ent=20ids=20once,=20on=20the=20first=20call?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free-form area, not part of the commit message):
- The cache is now checked with "is None" instead of truthiness. An empty
  lookup table (no segments) is falsy, so the previous check rebuilt it on
  every call and the following assert failed with AssertionError.
- The assert on the freshly built table is dropped for the same reason.
- Positions outside the table return None again, as the removed loop did,
  instead of raising IndexError or wrapping around for negative positions.
- The post-image hash in the "index" line below is the one from the original
  submission and does not match the revised content.

 qurator/dinglehopper/ocr_files.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 5ce0bcd..180ecd3 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -5,6 +5,7 @@ from warnings import warn
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from contextlib import suppress
+from itertools import repeat
 from .substitute_equivalences import substitute_equivalences
 import sys
 import attr
@@ -22,16 +23,29 @@ class ExtractedText:
     def text(self):
         return self.joiner.join(s.text for s in self.segments)
 
+    # Lazily built lookup table: text position -> segment id (None until built)
+    _segment_id_for_pos = None
+
     def segment_id_for_pos(self, pos):
-        i = 0
-        for s in self.segments:
-            if i <= pos < i + len(s.text):
-                return s.id
-            i += len(s.text)
-            if i <= pos < i + len(self.joiner):
-                return None
-            i += len(self.joiner)
-        # XXX Cache results
+        """Return the id of the segment covering text position pos.
+
+        Returns None if pos falls on a joiner or outside the text.
+        """
+        # Calculate segment ids once, on the first call. Compare against None,
+        # not truthiness: an empty table (no segments) is a valid result.
+        if self._segment_id_for_pos is None:
+            segment_id_for_pos = []
+            for s in self.segments:
+                segment_id_for_pos.extend(repeat(s.id, len(s.text)))
+                segment_id_for_pos.extend(repeat(None, len(self.joiner)))
+            # This is frozen, so we have to jump through the hoop:
+            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+
+        # Keep the pre-cache contract: positions outside the table yield None
+        # instead of raising IndexError or wrapping around for negative pos.
+        if 0 <= pos < len(self._segment_id_for_pos):
+            return self._segment_id_for_pos[pos]
+        return None
 
 
 class Normalization(enum.Enum):