1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-08 19:29:57 +02:00

🚧 ALTO: Extract a function to calculate statistics on xpath expressions

This commit is contained in:
Gerber, Mike 2022-05-23 19:33:54 +02:00
parent e24a846ea2
commit 9246519162

View file

@@ -81,14 +81,22 @@ def alto_to_dict(alto, raise_errors=True):
     value['Page'].update(TagGroup(tag, group).subelement_counts())
     xpath_expr = "//alto:String/@WC"
-    values = []
-    for e in group:
-        # TODO need a smart way to always have the correct namespaces for a document
-        alto_namespace = ET.QName(e).namespace
-        r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
-        values += r
-    values = np.array([float(v) for v in values])
-    value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
+    alto_namespace = ET.QName(group[0]).namespace
+    namespaces={"alto": alto_namespace}
+
+    def xpath_statistics(xpath_expr, namespaces):
+        values = []
+        for e in group:
+            r = e.xpath(xpath_expr, namespaces=namespaces)
+            values += r
+        values = np.array([float(v) for v in values])
+        statistics = {}
+        statistics[f'{xpath_expr}-mean'] = np.mean(values)
+        return statistics
+    value['Page'].update(xpath_statistics(xpath_expr, namespaces))
 elif localname == 'Styles':
     pass