diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 57ebd3f..cd1a9bf 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -69,14 +69,11 @@ def page_extract(tree, *, textequiv_level="region"): reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap) if reading_order is not None: for group in reading_order.iterfind("./*", namespaces=nsmap): - if ET.QName(group.tag).localname == "OrderedGroup": - regions.extend( - extract_texts_from_reading_order_group( - group, tree, nsmap, textequiv_level - ) + regions.extend( + extract_texts_from_reading_order_group( + group, tree, nsmap, textequiv_level ) - else: - raise NotImplementedError + ) else: for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap): regions.append( @@ -94,11 +91,20 @@ def page_extract(tree, *, textequiv_level="region"): def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level): """Recursive function to extract the texts from TextRegions in ReadingOrder.""" regions = [] - ro_children = group.findall("./page:RegionRefIndexed", namespaces=nsmap) - ro_children.extend(group.findall("./page:OrderedGroupIndexed", namespaces=nsmap)) - ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children) - for ro_child in sorted(ro_children, key=lambda child: int(child.attrib["index"])): - if ET.QName(ro_child.tag).localname == "OrderedGroupIndexed": + + if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]: + ro_children = list(group) + + ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children) + ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"])) + elif ET.QName(group.tag).localname == "UnorderedGroup": + ro_children = list(group) + else: + raise NotImplementedError + + + for ro_child in ro_children: + if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup"]: regions.extend( extract_texts_from_reading_order_group( ro_child, tree, nsmap, textequiv_level diff --git a/qurator/dinglehopper/tests/test_integ_table_extraction.py b/qurator/dinglehopper/tests/test_integ_table_extraction.py index 1a9722a..868308c 100644 --- a/qurator/dinglehopper/tests/test_integ_table_extraction.py +++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py @@ -13,7 +13,7 @@ from .. import page_text ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"), ("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"), ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), - ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), + ("table-unordered.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"), ], ) @pytest.mark.integration @@ -21,9 +21,5 @@ def test_reading_order_settings(file, expected_text): data_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), "data", "table-order" ) - if "table-unordered.xml" == file: - with pytest.raises(NotImplementedError): - page_text(ET.parse(os.path.join(data_dir, file))) - else: - ocr = page_text(ET.parse(os.path.join(data_dir, file))) - assert ocr == expected_text + ocr = page_text(ET.parse(os.path.join(data_dir, file))) + assert ocr == expected_text