mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
✨ Count all alto:String elements with TAGREFS attribute
This commit is contained in:
parent
a40716a320
commit
c803ce0907
3 changed files with 38 additions and 0 deletions
|
@ -83,6 +83,9 @@ def alto_to_dict(alto, raise_errors=True):
|
||||||
value[localname].update(TagGroup(tag, group).subelement_counts())
|
value[localname].update(TagGroup(tag, group).subelement_counts())
|
||||||
value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
|
value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
|
||||||
|
|
||||||
|
# Count all alto:String elements with TAGREFS attribute
|
||||||
|
value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
|
||||||
|
|
||||||
elif localname == 'Styles':
|
elif localname == 'Styles':
|
||||||
pass
|
pass
|
||||||
elif localname == 'Tags':
|
elif localname == 'Tags':
|
||||||
|
|
|
@ -229,6 +229,17 @@ class TagGroup:
|
||||||
statistics[f'{xpath_expr}-max'] = np.max(values)
|
statistics[f'{xpath_expr}-max'] = np.max(values)
|
||||||
return statistics
|
return statistics
|
||||||
|
|
||||||
|
def xpath_count(self, xpath_expr, namespaces):
    """
    Count all elements matching xpath_expr

    The expression is evaluated on every element of ``self.group`` and the
    sizes of the individual results are added up.

    :param xpath_expr: XPath expression, passed unchanged to ``xpath()`` of
        each group element; it is also used verbatim in the result key.
    :param namespaces: prefix mapping forwarded as the ``namespaces=``
        keyword of each ``xpath()`` call.
    :return: dict with the single key ``f'{xpath_expr}-count'`` holding the
        summed number of matches (0 for an empty group).

    NOTE(review): presumably the group elements are lxml elements. If so, an
    absolute expression (one starting with ``//``) is evaluated against the
    whole document for each element, so a group of several elements would
    count every match once per element -- confirm against the callers.
    """
    # Only the number of matches is needed, so add up the result sizes
    # instead of collecting every match in an intermediate list.
    total = sum(
        len(e.xpath(xpath_expr, namespaces=namespaces))
        for e in self.group
    )
    return {f'{xpath_expr}-count': total}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,3 +55,27 @@ def test_Tags_counts():
|
||||||
</alto>
|
</alto>
|
||||||
""")
|
""")
|
||||||
assert d['Tags_NamedEntityTag-count'] == 9
|
assert d['Tags_NamedEntityTag-count'] == 9
|
||||||
|
|
||||||
|
def test_String_TAGREF_counts():
    """
    Check the count of alto:String elements that carry a TAGREFS attribute.

    The fixture holds four String elements on one Page: three with
    TAGREFS="PER0" and one without any TAGREFS attribute.
    """
    d = dict_fromstring("""
    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
      <Layout>
        <Page>
          <PrintSpace>
            <TextBlock>
              <TextLine>
                <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
              </TextLine>
              <TextLine>
                <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
                <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
                <String CONTENT="No TAGREF!" />
              </TextLine>
            </TextBlock>
          </PrintSpace>
        </Page>
      </Layout>
    </alto>
    """)
    # Only the three String elements with a TAGREFS attribute are counted here ...
    assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
    # ... while the plain String count covers all four elements.
    assert d['Layout_Page_String-count'] == 4
|
Loading…
Add table
Add a link
Reference in a new issue