Count all alto:String elements with TAGREFS attribute

master
Gerber, Mike 2 years ago
parent a40716a320
commit c803ce0907

@ -83,6 +83,9 @@ def alto_to_dict(alto, raise_errors=True):
value[localname].update(TagGroup(tag, group).subelement_counts())
value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
# Count all alto:String elements with TAGREFS attribute
value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
elif localname == 'Styles':
pass
elif localname == 'Tags':

@ -229,6 +229,17 @@ class TagGroup:
statistics[f'{xpath_expr}-max'] = np.max(values)
return statistics
def xpath_count(self, xpath_expr, namespaces):
    """
    Count the results of an XPath expression over the whole group.

    ``xpath_expr`` is evaluated (with the prefix mapping ``namespaces``) on
    every element of ``self.group``; the sizes of the individual results are
    added up.

    Returns a dict with a single entry, keyed ``"<xpath_expr>-count"``.

    NOTE(review): assuming the group holds lxml elements, an expression
    starting with "//" is presumably evaluated from the document root for
    each element, so a group with several elements may count the same node
    more than once -- confirm against the callers.
    """
    # Tally the matches one by one instead of collecting them in a list;
    # only the total is needed.
    match_total = sum(
        1
        for element in self.group
        for _match in element.xpath(xpath_expr, namespaces=namespaces)
    )
    return {f'{xpath_expr}-count': match_total}

@ -55,3 +55,27 @@ def test_Tags_counts():
</alto>
""")
assert d['Tags_NamedEntityTag-count'] == 9
def test_String_TAGREF_counts():
    """
    Check the TAGREFS count against the plain String count.

    The fixture holds four String elements, three of which carry a TAGREFS
    attribute.
    """
    alto_xml = """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Layout>
<Page>
<PrintSpace>
<TextBlock>
<TextLine>
<String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
</TextLine>
<TextLine>
<String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
<String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
<String CONTENT="No TAGREF!" />
</TextLine>
</TextBlock>
</PrintSpace>
</Page>
</Layout>
</alto>
"""
    stats = dict_fromstring(alto_xml)

    # Only the Strings that have a TAGREFS attribute
    assert stats['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
    # Every String, with or without TAGREFS
    assert stats['Layout_Page_String-count'] == 4
Loading…
Cancel
Save