from lxml import etree as ET

from qurator.mods4pandas.alto4pandas import alto_to_dict
from qurator.mods4pandas.lib import flatten


# NOTE(review): each XML fixture below is a blank string literal in this copy
# of the file -- presumably the ALTO markup was stripped somewhere along the
# way. Confirm against the original fixtures before relying on these tests.


def dict_fromstring(x):
    """Parse the XML string *x* and return its flattened ``alto_to_dict`` result.

    The string is parsed with ``ET.fromstring``, the resulting root element is
    handed to ``alto_to_dict``, and that result is passed through ``flatten``.
    """
    root = ET.fromstring(x)
    nested = alto_to_dict(root)
    return flatten(nested)


def test_Page_counts():
    """Elements below Layout/Page should be reported as ``-count`` entries."""
    counts = dict_fromstring(""" """)

    assert counts['Layout_Page_TextBlock-count'] == 1
    assert counts['Layout_Page_TextLine-count'] == 3
    assert counts['Layout_Page_String-count'] == 6


def test_Tags_counts():
    """The NamedEntityTag entries under Tags should be counted."""
    counts = dict_fromstring(""" """)

    assert counts['Tags_NamedEntityTag-count'] == 9


def test_String_TAGREF_counts():
    """The ``String[@TAGREFS]`` count is checked next to the plain String count."""
    counts = dict_fromstring(""" """)

    assert counts['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
    assert counts['Layout_Page_String-count'] == 4