mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🧹 dinglehopper: Calculate segment ids once, on the first call
This commit is contained in:
parent
0cf7ff4721
commit
c010a7f05e
1 changed file with 14 additions and 9 deletions
|
@ -5,6 +5,7 @@ from warnings import warn
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
from itertools import repeat
|
||||||
from .substitute_equivalences import substitute_equivalences
|
from .substitute_equivalences import substitute_equivalences
|
||||||
import sys
|
import sys
|
||||||
import attr
|
import attr
|
||||||
|
@ -22,16 +23,20 @@ class ExtractedText:
|
||||||
def text(self):
|
def text(self):
|
||||||
return self.joiner.join(s.text for s in self.segments)
|
return self.joiner.join(s.text for s in self.segments)
|
||||||
|
|
||||||
|
_segment_id_for_pos = None
|
||||||
|
|
||||||
def segment_id_for_pos(self, pos):
|
def segment_id_for_pos(self, pos):
|
||||||
i = 0
|
# Calculate segment ids once, on the first call
|
||||||
for s in self.segments:
|
if not self._segment_id_for_pos:
|
||||||
if i <= pos < i + len(s.text):
|
segment_id_for_pos = []
|
||||||
return s.id
|
for s in self.segments:
|
||||||
i += len(s.text)
|
segment_id_for_pos.extend(repeat(s.id, len(s.text)))
|
||||||
if i <= pos < i + len(self.joiner):
|
segment_id_for_pos.extend(repeat(None, len(self.joiner)))
|
||||||
return None
|
# This is frozen, so we have to jump through the hoop:
|
||||||
i += len(self.joiner)
|
object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
|
||||||
# XXX Cache results
|
assert self._segment_id_for_pos
|
||||||
|
|
||||||
|
return self._segment_id_for_pos[pos]
|
||||||
|
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
class Normalization(enum.Enum):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue