diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index a048b1e..2b8b0de 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -15,15 +15,53 @@
 import unicodedata
 import re
 
+class Normalization(enum.Enum):
+    NFC = 1
+    NFC_MUFI = 2  # TODO
+    NFC_SBB = 3
+
+
 
 @attr.s(frozen=True)
 class ExtractedText:
-    segments = attr.ib(converter=list)
-    joiner = attr.ib(type=str)
-    # TODO Types are not validated (attr does not do this yet)
+    segment_id = attr.ib(type=Optional[str])
+
+    @segment_id.validator
+    def check(self, _, value):
+        if value is None:
+            return
+        if not re.match(r'[\w\d_-]+', value):
+            raise ValueError('Malformed segment id "{}"'.format(value))
+
+    # An object contains either
+    # a. _text itself
+    # b. or segments (ExtractedText) and a joiner
+    # TODO validator
+
+    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
+    joiner = attr.ib(type=Optional[str])
+    _text = attr.ib(type=Optional[str])
+
+    @segments.validator
+    def check(self, _, value):
+        if value is not None and self._text is not None:
+            raise ValueError("Can't have both segments and text")
+
+    @_text.validator
+    def check(self, _, value):
+        if value is not None and normalize(value, self.normalization) != value:
+            raise ValueError('String "{}" is not normalized.'.format(value))
+
+    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
 
     @property
     def text(self):
-        return self.joiner.join(s.text for s in self.segments)
+        if self._text is not None:
+            if self._text == '':
+                return None
+            else:
+                return self._text
+        else:
+            return self.joiner.join(s.text for s in self.segments)
 
     _segment_id_for_pos = None
 
@@ -34,17 +72,30 @@ class ExtractedText:
             for s in self.segments:
                 segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
                 segment_id_for_pos.extend(repeat(None, len(self.joiner)))
+            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
             # This is frozen, so we have to jump through the hoop:
             object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
             assert self._segment_id_for_pos
 
         return self._segment_id_for_pos[pos]
 
+    @classmethod
+    def from_text_segment(cls, text_segment, nsmap):
+        """Build an ExtractedText from a PAGE content text element"""
+
+        segment_id = text_segment.attrib['id']
+        segment_text = None
+        with suppress(AttributeError):
+            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
+            segment_text = segment_text or ''
+            segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
+        segment_text = segment_text or ''
+        return cls(segment_id, None, None, segment_text)
+
+    @classmethod
+    def from_text(cls, text):
+        return cls(None, None, None, text)
 
-class Normalization(enum.Enum):
-    NFC = 1
-    NFC_MUFI = 2  # TODO
-    NFC_SBB = 3
 
 
 def normalize(text, normalization):
@@ -63,37 +114,6 @@ def normalize_sbb(t):
     return normalize(t, Normalization.NFC_SBB)
 
 
-@attr.s(frozen=True)
-class ExtractedTextSegment:
-    segment_id = attr.ib(type=Optional[str])
-
-    @segment_id.validator
-    def check(self, _, value):
-        if value is None:
-            return
-        if not re.match(r'[\w\d_-]+', value):
-            raise ValueError('Malformed segment id "{}"'.format(value))
-    text = attr.ib(type=str)
-
-    @text.validator
-    def check(self, _, value):
-        if value is not None and normalize(value, self.normalization) != value:
-            raise ValueError('String "{}" is not normalized.'.format(value))
-    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
-
-    @classmethod
-    def from_text_segment(cls, text_segment, nsmap):
-        """Build an ExtractedTextSegment from a PAGE content text element"""
-
-        segment_id = text_segment.attrib['id']
-        segment_text = None
-        with suppress(AttributeError):
-            segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
-            segment_text = segment_text or ''
-            segment_text = normalize_sbb(segment_text)
-        return cls(segment_id, segment_text)
-
-
 def alto_namespace(tree):
     """Return the ALTO namespace used in the given ElementTree.
 
@@ -117,12 +137,14 @@ def alto_extract(tree):
              for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
 
     return ExtractedText(
-        (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines),
-        '\n'
+        None,
+        (ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines),
+        '\n',
+        None
     )
+    # FIXME hardcoded SBB normalization
     # TODO This currently does not extract any segment id, because we are
     # clueless about the ALTO format.
-    # FIXME needs to handle normalization
 
 
 def alto_text(tree):
@@ -157,20 +179,19 @@ def page_extract(tree):
                     region_id = region_ref_indexed.attrib['regionRef']
                     region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
                     if region is not None:
-                        regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
+                        regions.append(ExtractedText.from_text_segment(region, nsmap))
                     else:
                         warn('Not a TextRegion: "%s"' % region_id)
             else:
                 raise NotImplementedError
     else:
         for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
-            regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
+            regions.append(ExtractedText.from_text_segment(region, nsmap))
 
     # Filter empty region texts
     regions = (r for r in regions if r.text is not None)
 
-    return ExtractedText(regions, '\n')
-    # FIXME needs to handle normalization
+    return ExtractedText(None, regions, '\n', None)
 
 
 def page_text(tree):
@@ -180,8 +201,10 @@ def page_text(tree):
 def plain_extract(filename):
     with open(filename, 'r') as f:
         return ExtractedText(
-            (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())),
-            '\n'
+            None,
+            [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
+            '\n',
+            None
         )
 
 
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 2e6a9e6..8cac4c1 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -1,17 +1,17 @@
 import unicodedata
 import pytest
-from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
+from qurator.dinglehopper import ExtractedText
 from uniseg.graphemecluster import grapheme_clusters
 from qurator.dinglehopper import seq_align
 from collections import namedtuple
 
 
 def test_text():
-    test1 = ExtractedText([
-        ExtractedTextSegment('s0', 'foo'),
-        ExtractedTextSegment('s1', 'bar'),
-        ExtractedTextSegment('s2', 'bazinga')
-    ], ' ')
+    test1 = ExtractedText(None, [
+        ExtractedText('s0', None, None, 'foo'),
+        ExtractedText('s1', None, None, 'bar'),
+        ExtractedText('s2', None, None, 'bazinga')
+    ], ' ', None)
 
     assert test1.text == 'foo bar bazinga'
     assert test1.segment_id_for_pos(0) == 's0'
@@ -21,8 +21,8 @@
 
 def test_normalization_check():
     with pytest.raises(ValueError, match=r'.*is not normalized.*'):
-        ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
-    assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))
+        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
+    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
 
 
 AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
@@ -36,17 +36,17 @@ def test_align():
     not Python characters.
     """
 
-    test1 = ExtractedText([
-        ExtractedTextSegment('s0', 'foo'),
-        ExtractedTextSegment('s1', 'bar'),
-        ExtractedTextSegment('s2', 'batzinga')
-    ], ' ')
-    test2 = ExtractedText([
-        ExtractedTextSegment('x0', 'foo'),
-        ExtractedTextSegment('x1', 'bar'),
-        ExtractedTextSegment('x2', '.'),  # extra .
-        ExtractedTextSegment('x3', 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
-    ], ' ')
+    test1 = ExtractedText(None, [
+        ExtractedText('s0', None, None, 'foo'),
+        ExtractedText('s1', None, None, 'bar'),
+        ExtractedText('s2', None, None, 'batzinga')
+    ], ' ', None)
+    test2 = ExtractedText(None, [
+        ExtractedText('x0', None, None, 'foo'),
+        ExtractedText('x1', None, None, 'bar'),
+        ExtractedText('x2', None, None, '.'),  # extra .
+        ExtractedText('x3', None, None, 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
+    ], ' ', None)
 
     left_pos = 0; right_pos = 0; alignment = []
     for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
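
Usage note (not part of the diff): a minimal sketch of how the refactored ExtractedText is meant to be constructed after this change, mirroring the positional constructor calls in the updated tests (segment_id, segments, joiner, text), with normalization defaulting to Normalization.NFC_SBB as declared above. The segment ids and strings below are illustrative only.

    from qurator.dinglehopper import ExtractedText

    # Leaf nodes carry their own, already normalized text and no child segments.
    lines = [
        ExtractedText('s0', None, None, 'foo'),
        ExtractedText('s1', None, None, 'bar'),
    ]

    # Composite nodes carry child ExtractedText segments plus a joiner string
    # instead of text of their own.
    page = ExtractedText(None, lines, '\n', None)

    print(page.text)                   # 'foo\nbar'
    print(page.segment_id_for_pos(0))  # 's0' -- maps a character position back to its segment id
    print(page.segment_id_for_pos(4))  # 's1' -- positions inside the joiner itself map to None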