From a40716a3208093ef58a73716b160bdeb47a66c18 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Fri, 17 Jun 2022 17:32:17 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Count=20alto:Tags?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/modstool/alto4pandas.py     |  3 ++-
 qurator/modstool/tests/test_alto.py | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/qurator/modstool/alto4pandas.py b/qurator/modstool/alto4pandas.py
index 3f3724b..44e543a 100755
--- a/qurator/modstool/alto4pandas.py
+++ b/qurator/modstool/alto4pandas.py
@@ -86,7 +86,8 @@ def alto_to_dict(alto, raise_errors=True):
         elif localname == 'Styles':
             pass
         elif localname == 'Tags':
-            pass
+            value[localname] = {}
+            value[localname].update(TagGroup(tag, group).subelement_counts())
         else:
             if raise_errors:
                 print(value)
diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py
index 5853154..13416d4 100644
--- a/qurator/modstool/tests/test_alto.py
+++ b/qurator/modstool/tests/test_alto.py
@@ -37,3 +37,21 @@ def test_Page_counts():
     assert d['Layout_Page_TextBlock-count'] == 1
     assert d['Layout_Page_TextLine-count'] == 3
     assert d['Layout_Page_String-count'] == 6
+
+def test_Tags_counts():
+    d = dict_fromstring("""
+
+
+
+
+
+
+
+
+
+
+
+
+
+    """)
+    assert d['Tags_NamedEntityTag-count'] == 9