From 10b8023dd6e009a7a4623555607e26eaefdfb03d Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Fri, 6 May 2022 20:59:51 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Count=20Layout/Page/*=20ele?=
 =?UTF-8?q?ments?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (commentary area of the patch; not part of the commit message):

* qurator/modstool/lib.py: in the element-counting loop, the key
  "<localname>-count" is now derived from ET.QName(x.tag) instead of
  ET.QName(x), i.e. QName receives the element's tag string rather than the
  element object. Presumably this is needed because the new test builds its
  tree with the standard-library ElementTree; the ET import of lib.py is not
  visible in this patch, so confirm.

* qurator/modstool/tests/test_alto.py (new): dict_fromstring() parses a
  string with xml.etree.ElementTree.fromstring(), passes the root to
  alto_to_dict() and flattens the result. test_Page_counts() asserts that the
  flattened dict has Layout_Page_TextBlock-count == 1,
  Layout_Page_TextLine-count == 3 and Layout_Page_String-count == 6.

* NOTE(review): in this copy the 20 added lines inside the triple-quoted
  fixture string of test_Page_counts() are empty. It looks like the ALTO XML
  markup was stripped in transit; ET.fromstring() raises ParseError on a
  whitespace-only document, so the test cannot pass as shown. Restore the
  fixture from the original commit before applying.

* NOTE(review): the From: header carries a display name but no e-mail
  address here; it may have been stripped the same way. Verify before
  running git am.

* NOTE(review): this copy had lost its line breaks; indentation inside the
  hunks below was re-created conventionally and should be checked against
  the target files.

 qurator/modstool/lib.py             |  2 +-
 qurator/modstool/tests/test_alto.py | 39 +++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 qurator/modstool/tests/test_alto.py

diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py
index 9f01be8..5ebf0ac 100644
--- a/qurator/modstool/lib.py
+++ b/qurator/modstool/lib.py
@@ -198,7 +198,7 @@ class TagGroup:
         counts = {}
         for e in self.group:
             for x in e.iter():
-                tag = ET.QName(x).localname
+                tag = ET.QName(x.tag).localname
                 key = f"{tag}-count"
                 counts[key] = counts.get(key, 0) + 1
         return counts
diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py
new file mode 100644
index 0000000..bc79d1d
--- /dev/null
+++ b/qurator/modstool/tests/test_alto.py
@@ -0,0 +1,39 @@
+import xml.etree.ElementTree as ET
+
+
+from qurator.modstool.altotool import alto_to_dict
+from qurator.modstool.lib import flatten
+
+
+def dict_fromstring(x):
+    return flatten(alto_to_dict(ET.fromstring(x)))
+
+def test_Page_counts():
+    """
+    Elements below Layout/Page should be counted
+    """
+    d = dict_fromstring("""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    """)
+    assert d['Layout_Page_TextBlock-count'] == 1
+    assert d['Layout_Page_TextLine-count'] == 3
+    assert d['Layout_Page_String-count'] == 6