From a68fc269d9941247bfd32ba0c1f8445194124654 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 27 Nov 2020 11:18:11 +0100 Subject: [PATCH] Fix the extraction of text from Page with TableRegion Dinglehopper did not consider `OrderedGroupIndex` in the `ReadingOrder` element when extracting text regions. As a consequence a `TableRegion` was not considered for text extraction. --- qurator/dinglehopper/ocr_files.py | 50 ++++--- .../table-order/table-no-reading-order.xml | 121 +++++++++++++++ .../data/table-order/table-order-0001.xml | 134 +++++++++++++++++ .../data/table-order/table-order-0002.xml | 134 +++++++++++++++++ .../tests/data/table-order/table-region.xml | 139 ++++++++++++++++++ .../data/table-order/table-unordered.xml | 134 +++++++++++++++++ .../tests/test_integ_table_extraction.py | 29 ++++ 7 files changed, 724 insertions(+), 17 deletions(-) create mode 100644 qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml create mode 100644 qurator/dinglehopper/tests/data/table-order/table-order-0001.xml create mode 100644 qurator/dinglehopper/tests/data/table-order/table-order-0002.xml create mode 100644 qurator/dinglehopper/tests/data/table-order/table-region.xml create mode 100644 qurator/dinglehopper/tests/data/table-order/table-unordered.xml create mode 100644 qurator/dinglehopper/tests/test_integ_table_extraction.py diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 4045680..57ebd3f 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -70,24 +70,11 @@ def page_extract(tree, *, textequiv_level="region"): if reading_order is not None: for group in reading_order.iterfind("./*", namespaces=nsmap): if ET.QName(group.tag).localname == "OrderedGroup": - region_ref_indexeds = group.findall( - "./page:RegionRefIndexed", namespaces=nsmap - ) - for region_ref_indexed in sorted( - region_ref_indexeds, key=lambda r: int(r.attrib["index"]) - ): - region_id = 
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
    """Collect the texts of the TextRegions referenced by a reading-order group.

    The direct ``RegionRefIndexed`` and ``OrderedGroupIndexed`` children of
    *group* are visited in ascending order of their integer ``index``
    attribute; children without an ``index`` attribute are left out.  A nested
    ``OrderedGroupIndexed`` is expanded recursively at its own position.

    :param group: reading-order element whose direct children are examined
    :param tree: document that is searched for the referenced ``TextRegion``
    :param nsmap: prefix mapping used to resolve the ``page:`` prefix
    :param textequiv_level: passed on to ``ExtractedText.from_text_segment``
    :return: list of extracted texts, in reading order
    """
    # Region references are gathered ahead of nested groups.  The sort below
    # is stable, so that relative order is kept for equal index values.
    candidates = [
        *group.findall("./page:RegionRefIndexed", namespaces=nsmap),
        *group.findall("./page:OrderedGroupIndexed", namespaces=nsmap),
    ]
    members = [member for member in candidates if "index" in member.attrib]
    members.sort(key=lambda member: int(member.attrib["index"]))

    texts = []
    for member in members:
        if ET.QName(member.tag).localname == "OrderedGroupIndexed":
            texts += extract_texts_from_reading_order_group(
                member, tree, nsmap, textequiv_level
            )
            continue

        ref_id = member.attrib["regionRef"]
        target = tree.find('.//page:TextRegion[@id="%s"]' % ref_id, namespaces=nsmap)
        if target is None:
            # The reference points at something that is not a TextRegion.
            continue
        texts.append(
            ExtractedText.from_text_segment(
                target, nsmap, textequiv_level=textequiv_level
            )
        )
    return texts


def page_text(tree, *, textequiv_level="region"):
    """Return the ``text`` attribute of ``page_extract``'s result for *tree*."""
    extracted = page_extract(tree, textequiv_level=textequiv_level)
    return extracted.text
2020-10-28T08:43:47 + 1970-01-01T00:00:00 + + + + + + + + + 5 + + + + 5 + + + + + + + + 6 + + + + 6 + + + + + + + + 7 + + + + 7 + + + + + + + + 8 + + + + 8 + + + + + + + + 9 + + + + 9 + + + + + + + + 1 + + + + 1 + + + + + + + + 2 + + + + 2 + + + + + + + + 3 + + + + 3 + + + + + + + + 4 + + + + 4 + + + + diff --git a/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml new file mode 100644 index 0000000..d91dde9 --- /dev/null +++ b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml @@ -0,0 +1,134 @@ + + + + + 2020-10-28T08:43:47 + 1970-01-01T00:00:00 + + + + + + + + + + + + + + + + + + + + + + 5 + + + + 5 + + + + + + + + 6 + + + + 6 + + + + + + + + 7 + + + + 7 + + + + + + + + 8 + + + + 8 + + + + + + + + 9 + + + + 9 + + + + + + + + 1 + + + + 1 + + + + + + + + 2 + + + + 2 + + + + + + + + 3 + + + + 3 + + + + + + + + 4 + + + + 4 + + + + diff --git a/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml new file mode 100644 index 0000000..3985e59 --- /dev/null +++ b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml @@ -0,0 +1,134 @@ + + + + + 2020-10-28T08:43:47 + 1970-01-01T00:00:00 + + + + + + + + + + + + + + + + + + + + + + 5 + + + + 5 + + + + + + + + 6 + + + + 6 + + + + + + + + 7 + + + + 7 + + + + + + + + 8 + + + + 8 + + + + + + + + 9 + + + + 9 + + + + + + + + 1 + + + + 1 + + + + + + + + 2 + + + + 2 + + + + + + + + 3 + + + + 3 + + + + + + + + 4 + + + + 4 + + + + diff --git a/qurator/dinglehopper/tests/data/table-order/table-region.xml b/qurator/dinglehopper/tests/data/table-order/table-region.xml new file mode 100644 index 0000000..62e0c10 --- /dev/null +++ b/qurator/dinglehopper/tests/data/table-order/table-region.xml @@ -0,0 +1,139 @@ + + + + + 2020-10-28T08:43:47 + 1970-01-01T00:00:00 + + + + + + + + + + + + + + + + + + + + + + + + + + 5 + + + + 5 + + + + + + + + 6 + + + + 6 + + + + 
@pytest.mark.parametrize(
    ("file", "expected_text"),
    [
        ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
        ("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
        ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
    ],
)
@pytest.mark.integration
def test_reading_order_settings(file, expected_text):
    """Check the text extracted from each table fixture in ``data/table-order``.

    ``table-unordered.xml`` is expected to raise ``NotImplementedError``; its
    ``expected_text`` entry is therefore never compared.  For every other
    fixture the result of ``page_text`` must equal ``expected_text``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    fixture_path = os.path.join(os.path.join(here, "data", "table-order"), file)

    if file == "table-unordered.xml":
        with pytest.raises(NotImplementedError):
            page_text(ET.parse(fixture_path))
        return

    extracted = page_text(ET.parse(fixture_path))
    assert extracted == expected_text