🧹 dinglehopper: Calculate segment ids once, on the first call

pull/38/head
Gerber, Mike 4 years ago
parent bc05f83088
commit c3ae73d576

@@ -5,6 +5,7 @@ from warnings import warn
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from contextlib import suppress from contextlib import suppress
from itertools import repeat
from .substitute_equivalences import substitute_equivalences from .substitute_equivalences import substitute_equivalences
import sys import sys
import attr import attr
@@ -22,16 +23,20 @@ class ExtractedText:
def text(self): def text(self):
return self.joiner.join(s.text for s in self.segments) return self.joiner.join(s.text for s in self.segments)
_segment_id_for_pos = None
def segment_id_for_pos(self, pos): def segment_id_for_pos(self, pos):
i = 0 # Calculate segment ids once, on the first call
if not self._segment_id_for_pos:
segment_id_for_pos = []
for s in self.segments: for s in self.segments:
if i <= pos < i + len(s.text): segment_id_for_pos.extend(repeat(s.id, len(s.text)))
return s.id segment_id_for_pos.extend(repeat(None, len(self.joiner)))
i += len(s.text) # This is frozen, so we have to jump through the hoop:
if i <= pos < i + len(self.joiner): object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
return None assert self._segment_id_for_pos
i += len(self.joiner)
# XXX Cache results return self._segment_id_for_pos[pos]
class Normalization(enum.Enum): class Normalization(enum.Enum):

Loading…
Cancel
Save