diff --git a/qurator/modstool/alto4pandas.py b/qurator/modstool/alto4pandas.py index 3f3724b..44e543a 100755 --- a/qurator/modstool/alto4pandas.py +++ b/qurator/modstool/alto4pandas.py @@ -86,7 +86,8 @@ def alto_to_dict(alto, raise_errors=True): elif localname == 'Styles': pass elif localname == 'Tags': - pass + value[localname] = {} + value[localname].update(TagGroup(tag, group).subelement_counts()) else: if raise_errors: print(value) diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py index 5853154..13416d4 100644 --- a/qurator/modstool/tests/test_alto.py +++ b/qurator/modstool/tests/test_alto.py @@ -37,3 +37,21 @@ def test_Page_counts(): assert d['Layout_Page_TextBlock-count'] == 1 assert d['Layout_Page_TextLine-count'] == 3 assert d['Layout_Page_String-count'] == 6 + +def test_Tags_counts(): + d = dict_fromstring(""" + + + + + + + + + + + + + + """) + assert d['Tags_NamedEntityTag-count'] == 9