From de6cd8f1e7b97c27a9aeca878797d0491d8f1872 Mon Sep 17 00:00:00 2001
From: Mike Gerber
Date: Tue, 31 Oct 2023 20:40:27 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9D=8E=20Make=20joining=20grapheme=20cluster?=
 =?UTF-8?q?s=20more=20robust=20by=20checking=20joiner=20and=20handling=20a?=
 =?UTF-8?q?n=20empty=20joiner?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/extracted_text.py | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 19ad9c1..28678e4 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -1,4 +1,5 @@
 import enum
+import functools
 import re
 import unicodedata
 from contextlib import suppress
@@ -141,6 +142,15 @@ class ExtractedText:
         if value is not None and self._text is not None:
             raise ValueError("Can't have both segments and text")
 
+    @joiner.validator
+    def check(self, _, value):
+        """Reject a joiner without segments, or an unknown joiner with segments."""
+        if self.segments is None:
+            if value is not None:
+                raise ValueError("Can't have joiner without segments to join")
+        elif value not in ("", " ", "\n"):
+            raise ValueError(f"Unexpected segment joiner value {repr(value)}")
+
     @_text.validator
     def check(self, _, value):
         if value is None:
@@ -169,16 +179,34 @@ class ExtractedText:
         else:
             return self.joiner.join(s.text for s in self.segments)
 
+    @functools.cached_property
+    def _joiner_grapheme_cluster(self):
+        """Return the joiner as a list of 0 or 1 grapheme clusters.
+
+        An empty joiner yields an empty list. This property is cached.
+        """
+        if not self.joiner:
+            return []
+
+        joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
+        assert len(joiner_grapheme_cluster) == 1  # see joiner's check above
+        return joiner_grapheme_cluster
+
     @property
     def grapheme_clusters(self):
+        """Return the grapheme clusters of this text."""
         if self._text is not None:
             return self._grapheme_clusters
         else:
+            # TODO Test with text extracted at glyph level (joiner == "")
             clusters = []
             for seg in self.segments:
-                # todo could there be cases where joiner is no grapheme cluster?
-                clusters.extend(seg.grapheme_clusters + [self.joiner])
-            return clusters[:-1]
+                clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
+            # Only strip a trailing joiner if one was appended: with an empty
+            # joiner, slicing would drop the last real grapheme cluster.
+            if self._joiner_grapheme_cluster:
+                clusters = clusters[:-1]
+            return clusters
 
     _segment_id_for_pos = None
 