diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index e873ebd..c6f2984 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -4,6 +4,7 @@ import unicodedata from contextlib import suppress from itertools import repeat from typing import Optional +from lxml import etree as ET import attr @@ -171,17 +172,46 @@ class ExtractedText: return self._segment_id_for_pos[pos] @classmethod - def from_text_segment(cls, text_segment, nsmap): + def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'): """Build an ExtractedText from a PAGE content text element""" + def invert_dict(d): + """Invert the given dict""" + return {v: k for k, v in d.items()} + + localname_for_textequiv_level = { + 'region': 'TextRegion', + 'line': 'TextLine' + } + textequiv_level_for_localname = invert_dict(localname_for_textequiv_level) + children_for_localname = { + 'TextRegion': 'TextLine' + } + segment_id = text_segment.attrib['id'] - segment_text = None - with suppress(AttributeError): - segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + localname = ET.QName(text_segment).localname + if localname == localname_for_textequiv_level[textequiv_level]: + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization segment_text = segment_text or '' - segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization - segment_text = segment_text or '' - return cls(segment_id, None, None, segment_text) + return cls(segment_id, None, None, segment_text) + else: + # Recurse + sub_localname = children_for_localname[localname] + sub_textequiv_level = textequiv_level_for_localname[sub_localname] + segments = [] + for sub_segment in 
text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap): + segments.append( + ExtractedText.from_text_segment( + sub_segment, nsmap, + textequiv_level=sub_textequiv_level) + ) + joiner = '\n' # XXX + return cls(segment_id, segments, joiner, None) + @classmethod def from_str(cls, text, normalization=Normalization.NFC_SBB): diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 78648eb..11e86b2 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -54,9 +54,12 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_extract(tree): +def page_extract(tree, textequiv_level='region'): """Extract text from the given PAGE content ElementTree.""" + # Internally, this is just parsing the Reading Order (if it exists) and + # leaves reading the TextRegions to ExtractedText.from_text_segment(). + nsmap = {'page': page_namespace(tree)} regions = [] @@ -69,14 +72,14 @@ def page_extract(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - regions.append(ExtractedText.from_text_segment(region, nsmap)) + regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - regions.append(ExtractedText.from_text_segment(region, nsmap)) + regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) # Filter empty region texts regions = [r for r in regions if r.text is not None] @@ -84,8 +87,8 @@ def page_extract(tree): return ExtractedText(None, regions, '\n', None) -def page_text(tree): - return page_extract(tree).text +def page_text(tree, textequiv_level='region'): + return page_extract(tree, textequiv_level=textequiv_level).text def 
plain_extract(filename): diff --git a/qurator/dinglehopper/tests/data/levels-are-different.page.xml b/qurator/dinglehopper/tests/data/levels-are-different.page.xml new file mode 100644 index 0000000..f512b2c --- /dev/null +++ b/qurator/dinglehopper/tests/data/levels-are-different.page.xml @@ -0,0 +1,382 @@ + + + + doculibtopagexml + 2018-11-20T05:00:14 + 2019-04-17T10:47:36 + + + + + + + + + + + + + + + + + H + + + + a + + + + n + + + + d + + + + , + + Hand, + + + + + + M + + + + y + + + + l + + + + o + + + + r + + + + d + + + + ? + + Mylord? + + + + + + f + + + + r + + + + a + + + + g + + + + t + + + + e + + fragte + + + + + + d + + + + e + + + + r + + der + + + + + + G + + + + r + + + + a + + + + f + + Graf + + + + + + v + + + + o + + + + n + + von + + + + + + R + + + + o + + + + + + + + e + + + + + + + + e + + + + r + + + + . + + Roeer. + + Hand, Mylord? fragte der Graf von Roeer. + + + + + + + + A + + + + l + + + + s + + Als + + + + + + e + + + + r + + er + + + + + + e + + + + i + + + + n + + + + m + + + + a + + + + l + + + + s + + + + s + + einsmals + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + m + + dem + + + + + + O + + + + b + + + + e + + + + r + + + + h + + + + a + + + + u + + + + ſ + + + + e + + Oberhauſe + + + + + + w + + + + e + + + + + + we⸗ + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + B + + + + i + + + + + + Bi + + Als er einsmals in dem Oberhauſe eine Bi we⸗ + + Inconsistent dummy region text + diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py index 3291152..dd0a1fa 100644 --- a/qurator/dinglehopper/tests/test_ocr_files.py +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -130,6 +130,23 @@ def test_page_mixed_regions(): assert 'non exaudiam uos. 
Chriſtiani uero quia orant iuxta' in result +def test_page_level(): + # This file contains inconsistent TextRegion and TextLine texts + + # TextRegion + tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) + result = page_text(tree) + assert result == 'Inconsistent dummy region text' + tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) + result = page_text(tree, textequiv_level='region') + assert result == 'Inconsistent dummy region text' + + # TextLine + tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml')) + result = page_text(tree, textequiv_level='line') + assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-' + + def test_text(): assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))