from lxml import etree as ET
from qurator.mods4pandas.alto4pandas import alto_to_dict
from qurator.mods4pandas.lib import flatten
def dict_fromstring(x):
    """
    Parse the XML string ``x`` and return its flattened ALTO representation.

    The string is parsed with ``ET.fromstring``, the resulting element is
    handed to ``alto_to_dict``, and that result is passed through ``flatten``.
    """
    root = ET.fromstring(x)
    nested = alto_to_dict(root)
    return flatten(nested)
def test_Page_counts():
    """
    Counts of the elements found below Layout/Page should be reported.
    """
    # NOTE(review): the fixture literal below holds only a newline in this
    # view; presumably the ALTO XML markup was lost -- confirm against the
    # upstream file before relying on this test.
    counts = dict_fromstring("""
""")
    # Expected value per flattened key, checked in this order.
    expected = {
        'Layout_Page_TextBlock-count': 1,
        'Layout_Page_TextLine-count': 3,
        'Layout_Page_String-count': 6,
    }
    for key, value in expected.items():
        assert counts[key] == value
def test_Tags_counts():
    """
    The 'Tags_NamedEntityTag-count' entry should equal 9 for the fixture.
    """
    # NOTE(review): the fixture literal below holds only a newline in this
    # view; presumably the ALTO XML markup was lost -- confirm against the
    # upstream file before relying on this test.
    counts = dict_fromstring("""
""")
    named_entity_total = counts['Tags_NamedEntityTag-count']
    assert named_entity_total == 9
def test_String_TAGREF_counts():
    """
    The TAGREFS-filtered String count should be 3 and the plain String
    count 4 for the fixture.
    """
    # NOTE(review): the fixture literal below holds only a newline in this
    # view; presumably the ALTO XML markup was lost -- confirm against the
    # upstream file before relying on this test.
    counts = dict_fromstring("""
""")
    # Expected value per flattened key, checked in this order.
    expected = {
        'Layout_Page_//alto:String[@TAGREFS]-count': 3,
        'Layout_Page_String-count': 4,
    }
    for key, value in expected.items():
        assert counts[key] == value