diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 4045680..57ebd3f 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -70,24 +70,11 @@ def page_extract(tree, *, textequiv_level="region"):
if reading_order is not None:
for group in reading_order.iterfind("./*", namespaces=nsmap):
if ET.QName(group.tag).localname == "OrderedGroup":
- region_ref_indexeds = group.findall(
- "./page:RegionRefIndexed", namespaces=nsmap
- )
- for region_ref_indexed in sorted(
- region_ref_indexeds, key=lambda r: int(r.attrib["index"])
- ):
- region_id = region_ref_indexed.attrib["regionRef"]
- region = tree.find(
- './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
+ regions.extend(
+ extract_texts_from_reading_order_group(
+ group, tree, nsmap, textequiv_level
)
- if region is not None:
- regions.append(
- ExtractedText.from_text_segment(
- region, nsmap, textequiv_level=textequiv_level
- )
- )
- else:
- pass # Not a TextRegion
+ )
else:
raise NotImplementedError
else:
@@ -104,6 +91,35 @@ def page_extract(tree, *, textequiv_level="region"):
return ExtractedText(None, regions, "\n", None)
+def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
+    """Collect the texts of the TextRegions referenced by a reading order group.
+
+    Only the direct RegionRefIndexed and OrderedGroupIndexed children of
+    ``group`` that carry an ``index`` attribute are visited, in ascending
+    order of that attribute parsed as an integer. The sort is stable, so on
+    equal indices RegionRefIndexed entries come before OrderedGroupIndexed
+    entries. Nested OrderedGroupIndexed elements are handled by recursion;
+    children of any other type are not looked at.
+
+    Each ``regionRef`` is resolved against ``tree`` as the ``id`` of a
+    TextRegion at any depth. References that do not resolve to a TextRegion
+    are skipped. ``nsmap`` has to map the ``page`` prefix used by the queries,
+    and ``textequiv_level`` is handed on to ExtractedText.from_text_segment().
+
+    Returns a list with one extracted text per resolved TextRegion.
+    """
+    region_refs = group.findall("./page:RegionRefIndexed", namespaces=nsmap)
+    sub_groups = group.findall("./page:OrderedGroupIndexed", namespaces=nsmap)
+
+    # Entries without an index cannot be placed in the order and are dropped.
+    indexed = [entry for entry in region_refs + sub_groups if "index" in entry.attrib]
+    indexed.sort(key=lambda entry: int(entry.attrib["index"]))
+
+    texts = []
+    for entry in indexed:
+        if ET.QName(entry.tag).localname == "OrderedGroupIndexed":
+            texts += extract_texts_from_reading_order_group(
+                entry, tree, nsmap, textequiv_level
+            )
+            continue
+
+        referenced_id = entry.attrib["regionRef"]
+        text_region = tree.find(
+            './/page:TextRegion[@id="%s"]' % referenced_id, namespaces=nsmap
+        )
+        if text_region is None:
+            continue  # Not a TextRegion
+
+        texts.append(
+            ExtractedText.from_text_segment(
+                text_region, nsmap, textequiv_level=textequiv_level
+            )
+        )
+    return texts
+
+
def page_text(tree, *, textequiv_level="region"):
    """Return the ``text`` of what page_extract() produces for ``tree``.

    ``textequiv_level`` is keyword-only and is forwarded to page_extract().
    """
    extracted = page_extract(tree, textequiv_level=textequiv_level)
    return extracted.text
diff --git a/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml b/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
new file mode 100644
index 0000000..769ced5
--- /dev/null
+++ b/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
@@ -0,0 +1,121 @@
+
+
+
+
+ 2020-10-28T08:43:47
+ 1970-01-01T00:00:00
+
+
+
+
+
+
+
+
+ 5
+
+
+
+ 5
+
+
+
+
+
+
+
+ 6
+
+
+
+ 6
+
+
+
+
+
+
+
+ 7
+
+
+
+ 7
+
+
+
+
+
+
+
+ 8
+
+
+
+ 8
+
+
+
+
+
+
+
+ 9
+
+
+
+ 9
+
+
+
+
+
+
+
+ 1
+
+
+
+ 1
+
+
+
+
+
+
+
+ 2
+
+
+
+ 2
+
+
+
+
+
+
+
+ 3
+
+
+
+ 3
+
+
+
+
+
+
+
+ 4
+
+
+
+ 4
+
+
+
+
diff --git a/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
new file mode 100644
index 0000000..d91dde9
--- /dev/null
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
@@ -0,0 +1,134 @@
+
+
+
+
+ 2020-10-28T08:43:47
+ 1970-01-01T00:00:00
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 5
+
+
+
+ 5
+
+
+
+
+
+
+
+ 6
+
+
+
+ 6
+
+
+
+
+
+
+
+ 7
+
+
+
+ 7
+
+
+
+
+
+
+
+ 8
+
+
+
+ 8
+
+
+
+
+
+
+
+ 9
+
+
+
+ 9
+
+
+
+
+
+
+
+ 1
+
+
+
+ 1
+
+
+
+
+
+
+
+ 2
+
+
+
+ 2
+
+
+
+
+
+
+
+ 3
+
+
+
+ 3
+
+
+
+
+
+
+
+ 4
+
+
+
+ 4
+
+
+
+
diff --git a/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
new file mode 100644
index 0000000..3985e59
--- /dev/null
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
@@ -0,0 +1,134 @@
+
+
+
+
+ 2020-10-28T08:43:47
+ 1970-01-01T00:00:00
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 5
+
+
+
+ 5
+
+
+
+
+
+
+
+ 6
+
+
+
+ 6
+
+
+
+
+
+
+
+ 7
+
+
+
+ 7
+
+
+
+
+
+
+
+ 8
+
+
+
+ 8
+
+
+
+
+
+
+
+ 9
+
+
+
+ 9
+
+
+
+
+
+
+
+ 1
+
+
+
+ 1
+
+
+
+
+
+
+
+ 2
+
+
+
+ 2
+
+
+
+
+
+
+
+ 3
+
+
+
+ 3
+
+
+
+
+
+
+
+ 4
+
+
+
+ 4
+
+
+
+
diff --git a/qurator/dinglehopper/tests/data/table-order/table-region.xml b/qurator/dinglehopper/tests/data/table-order/table-region.xml
new file mode 100644
index 0000000..62e0c10
--- /dev/null
+++ b/qurator/dinglehopper/tests/data/table-order/table-region.xml
@@ -0,0 +1,139 @@
+
+
+
+
+ 2020-10-28T08:43:47
+ 1970-01-01T00:00:00
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 5
+
+
+
+ 5
+
+
+
+
+
+
+
+ 6
+
+
+
+ 6
+
+
+
+
+
+
+
+ 7
+
+
+
+ 7
+
+
+
+
+
+
+
+ 8
+
+
+
+ 8
+
+
+
+
+
+
+
+ 9
+
+
+
+ 9
+
+
+
+
+
+
+
+ 1
+
+
+
+ 1
+
+
+
+
+
+
+
+ 2
+
+
+
+ 2
+
+
+
+
+
+
+
+ 3
+
+
+
+ 3
+
+
+
+
+
+
+
+ 4
+
+
+
+ 4
+
+
+
+
+
diff --git a/qurator/dinglehopper/tests/data/table-order/table-unordered.xml b/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
new file mode 100644
index 0000000..240e08f
--- /dev/null
+++ b/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
@@ -0,0 +1,134 @@
+
+
+
+
+ 2020-10-28T08:43:47
+ 1970-01-01T00:00:00
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 5
+
+
+
+ 5
+
+
+
+
+
+
+
+ 6
+
+
+
+ 6
+
+
+
+
+
+
+
+ 7
+
+
+
+ 7
+
+
+
+
+
+
+
+ 8
+
+
+
+ 8
+
+
+
+
+
+
+
+ 9
+
+
+
+ 9
+
+
+
+
+
+
+
+ 1
+
+
+
+ 1
+
+
+
+
+
+
+
+ 2
+
+
+
+ 2
+
+
+
+
+
+
+
+ 3
+
+
+
+ 3
+
+
+
+
+
+
+
+ 4
+
+
+
+ 4
+
+
+
+
diff --git a/qurator/dinglehopper/tests/test_integ_table_extraction.py b/qurator/dinglehopper/tests/test_integ_table_extraction.py
new file mode 100644
index 0000000..1a9722a
--- /dev/null
+++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py
@@ -0,0 +1,29 @@
+import os
+
+import pytest
+from lxml import etree as ET
+
+from .. import page_text
+
+
+@pytest.mark.parametrize(
+    "file,expected_text",
+    [
+        ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
+        ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
+        ("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
+        ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
+        ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
+    ],
+)
+@pytest.mark.integration
+def test_reading_order_settings(file, expected_text):
+    """Check page_text() on each fixture in the table-order data directory.
+
+    table-unordered.xml is expected to raise NotImplementedError, so the
+    expected_text given for it is never compared.
+    """
+    tests_dir = os.path.dirname(os.path.abspath(__file__))
+    fixture_path = os.path.join(tests_dir, "data", "table-order", file)
+
+    if file == "table-unordered.xml":
+        with pytest.raises(NotImplementedError):
+            page_text(ET.parse(fixture_path))
+        return
+
+    assert page_text(ET.parse(fixture_path)) == expected_text