From c803ce09075297bbcb8816cfa08c2ff081af7fdd Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 17 Jun 2022 17:59:34 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Count=20all=20alto:String=20element?= =?UTF-8?q?s=20with=20TAGREFS=20attribute?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/alto4pandas.py | 3 +++ qurator/modstool/lib.py | 11 +++++++++++ qurator/modstool/tests/test_alto.py | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/qurator/modstool/alto4pandas.py b/qurator/modstool/alto4pandas.py index 44e543a..c6db8fb 100755 --- a/qurator/modstool/alto4pandas.py +++ b/qurator/modstool/alto4pandas.py @@ -83,6 +83,9 @@ def alto_to_dict(alto, raise_errors=True): value[localname].update(TagGroup(tag, group).subelement_counts()) value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)) + # Count all alto:String elements with TAGREFS attribute + value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces)) + elif localname == 'Styles': pass elif localname == 'Tags': diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index a6be479..1eba0f9 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -229,6 +229,17 @@ class TagGroup: statistics[f'{xpath_expr}-max'] = np.max(values) return statistics + def xpath_count(self, xpath_expr, namespaces): + """ + Count all elements matching xpath_expr + """ + values = [] + for e in self.group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + + counts = {f'{xpath_expr}-count': len(values)} + return counts diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py index 13416d4..509d24a 100644 --- a/qurator/modstool/tests/test_alto.py +++ b/qurator/modstool/tests/test_alto.py @@ -55,3 +55,27 @@ def test_Tags_counts(): """) assert d['Tags_NamedEntityTag-count'] == 9 + +def test_String_TAGREF_counts(): + d = dict_fromstring(""" + + + + + + + + + + + + + + + + + + + """) + assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3 + assert d['Layout_Page_String-count'] == 4 \ No newline at end of file