🚧 dinglehopper: Read PAGE UnorderedGroup in XML order

pull/58/head
Gerber, Mike 3 years ago
parent bd324331e6
commit 1778b36a9a

@@ -69,14 +69,11 @@ def page_extract(tree, *, textequiv_level="region"):
reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
if reading_order is not None:
for group in reading_order.iterfind("./*", namespaces=nsmap):
if ET.QName(group.tag).localname == "OrderedGroup":
regions.extend(
extract_texts_from_reading_order_group(
group, tree, nsmap, textequiv_level
)
regions.extend(
extract_texts_from_reading_order_group(
group, tree, nsmap, textequiv_level
)
else:
raise NotImplementedError
)
else:
for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
regions.append(
@@ -94,11 +91,20 @@ def page_extract(tree, *, textequiv_level="region"):
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
"""Recursive function to extract the texts from TextRegions in ReadingOrder."""
regions = []
ro_children = group.findall("./page:RegionRefIndexed", namespaces=nsmap)
ro_children.extend(group.findall("./page:OrderedGroupIndexed", namespaces=nsmap))
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
for ro_child in sorted(ro_children, key=lambda child: int(child.attrib["index"])):
if ET.QName(ro_child.tag).localname == "OrderedGroupIndexed":
if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
ro_children = list(group)
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
elif ET.QName(group.tag).localname == "UnorderedGroup":
ro_children = list(group)
else:
raise NotImplementedError
for ro_child in ro_children:
if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup"]:
regions.extend(
extract_texts_from_reading_order_group(
ro_child, tree, nsmap, textequiv_level

@@ -13,7 +13,7 @@ from .. import page_text
("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
("table-unordered.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
],
)
@pytest.mark.integration
@@ -21,9 +21,5 @@ def test_reading_order_settings(file, expected_text):
data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
)
if "table-unordered.xml" == file:
with pytest.raises(NotImplementedError):
page_text(ET.parse(os.path.join(data_dir, file)))
else:
ocr = page_text(ET.parse(os.path.join(data_dir, file)))
assert ocr == expected_text
ocr = page_text(ET.parse(os.path.join(data_dir, file)))
assert ocr == expected_text

Loading…
Cancel
Save