mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
❎ Make joining grapheme clusters more robust by checking joiner and handling an empty joiner
This commit is contained in:
parent
7c6ee593f0
commit
de6cd8f1e7
1 changed files with 31 additions and 3 deletions
|
@ -1,4 +1,5 @@
|
||||||
import enum
|
import enum
|
||||||
|
import functools
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
@ -141,6 +142,15 @@ class ExtractedText:
|
||||||
if value is not None and self._text is not None:
|
if value is not None and self._text is not None:
|
||||||
raise ValueError("Can't have both segments and text")
|
raise ValueError("Can't have both segments and text")
|
||||||
|
|
||||||
|
@joiner.validator
|
||||||
|
def check(self, _, value):
|
||||||
|
if self.segments is None:
|
||||||
|
if value is not None:
|
||||||
|
raise ValueError("Can't have joiner without segments to join")
|
||||||
|
if self.segments is not None:
|
||||||
|
if value not in ("", " ", "\n"):
|
||||||
|
raise ValueError(f"Unexcepted segment joiner value {repr(value)}")
|
||||||
|
|
||||||
@_text.validator
|
@_text.validator
|
||||||
def check(self, _, value):
|
def check(self, _, value):
|
||||||
if value is None:
|
if value is None:
|
||||||
|
@ -169,16 +179,34 @@ class ExtractedText:
|
||||||
else:
|
else:
|
||||||
return self.joiner.join(s.text for s in self.segments)
|
return self.joiner.join(s.text for s in self.segments)
|
||||||
|
|
||||||
|
@functools.cached_property
|
||||||
|
def _joiner_grapheme_cluster(self):
|
||||||
|
"""We need the joiner as a list of 0 or 1 grapheme clusters.
|
||||||
|
|
||||||
|
This property is cached.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if len(self.joiner) > 0:
|
||||||
|
joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
|
||||||
|
assert len(joiner_grapheme_cluster) == 1 # see joiner's check above
|
||||||
|
elif len(self.joiner) == 0:
|
||||||
|
joiner_grapheme_cluster = []
|
||||||
|
else:
|
||||||
|
joiner_grapheme_cluster = None
|
||||||
|
|
||||||
|
return joiner_grapheme_cluster
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def grapheme_clusters(self):
|
def grapheme_clusters(self):
|
||||||
if self._text is not None:
|
if self._text is not None:
|
||||||
return self._grapheme_clusters
|
return self._grapheme_clusters
|
||||||
else:
|
else:
|
||||||
|
# TODO Test with text extracted at glyph level (joiner == "")
|
||||||
clusters = []
|
clusters = []
|
||||||
for seg in self.segments:
|
for seg in self.segments:
|
||||||
# todo could there be cases where joiner is no grapheme cluster?
|
clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
|
||||||
clusters.extend(seg.grapheme_clusters + [self.joiner])
|
clusters = clusters[:-1]
|
||||||
return clusters[:-1]
|
return clusters
|
||||||
|
|
||||||
_segment_id_for_pos = None
|
_segment_id_for_pos = None
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue