mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
🚧 dinglehopper: Read PAGE UnorderedGroup in XML order
This commit is contained in:
parent
bd324331e6
commit
1778b36a9a
2 changed files with 21 additions and 19 deletions
|
@ -69,14 +69,11 @@ def page_extract(tree, *, textequiv_level="region"):
|
|||
reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
|
||||
if reading_order is not None:
|
||||
for group in reading_order.iterfind("./*", namespaces=nsmap):
|
||||
if ET.QName(group.tag).localname == "OrderedGroup":
|
||||
regions.extend(
|
||||
extract_texts_from_reading_order_group(
|
||||
group, tree, nsmap, textequiv_level
|
||||
)
|
||||
regions.extend(
|
||||
extract_texts_from_reading_order_group(
|
||||
group, tree, nsmap, textequiv_level
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
)
|
||||
else:
|
||||
for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
|
||||
regions.append(
|
||||
|
@ -94,11 +91,20 @@ def page_extract(tree, *, textequiv_level="region"):
|
|||
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
|
||||
"""Recursive function to extract the texts from TextRegions in ReadingOrder."""
|
||||
regions = []
|
||||
ro_children = group.findall("./page:RegionRefIndexed", namespaces=nsmap)
|
||||
ro_children.extend(group.findall("./page:OrderedGroupIndexed", namespaces=nsmap))
|
||||
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
|
||||
for ro_child in sorted(ro_children, key=lambda child: int(child.attrib["index"])):
|
||||
if ET.QName(ro_child.tag).localname == "OrderedGroupIndexed":
|
||||
|
||||
if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
|
||||
ro_children = list(group)
|
||||
|
||||
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
|
||||
ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
|
||||
elif ET.QName(group.tag).localname == "UnorderedGroup":
|
||||
ro_children = list(group)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
for ro_child in ro_children:
|
||||
if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup"]:
|
||||
regions.extend(
|
||||
extract_texts_from_reading_order_group(
|
||||
ro_child, tree, nsmap, textequiv_level
|
||||
|
|
|
@ -13,7 +13,7 @@ from .. import page_text
|
|||
("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
|
||||
("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
|
||||
("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
|
||||
("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
|
||||
("table-unordered.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.integration
|
||||
|
@ -21,9 +21,5 @@ def test_reading_order_settings(file, expected_text):
|
|||
data_dir = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
|
||||
)
|
||||
if "table-unordered.xml" == file:
|
||||
with pytest.raises(NotImplementedError):
|
||||
page_text(ET.parse(os.path.join(data_dir, file)))
|
||||
else:
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, file)))
|
||||
assert ocr == expected_text
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, file)))
|
||||
assert ocr == expected_text
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue