mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-08 11:20:07 +02:00
🚧 ALTO: Extract a function to calculate statistics on xpath expressions
This commit is contained in:
parent
e24a846ea2
commit
9246519162
1 changed file with 16 additions and 8 deletions
|
@@ -81,14 +81,22 @@ def alto_to_dict(alto, raise_errors=True):
|
|||
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
||||
|
||||
xpath_expr = "//alto:String/@WC"
|
||||
alto_namespace = ET.QName(group[0]).namespace
|
||||
namespaces={"alto": alto_namespace}
|
||||
|
||||
def xpath_statistics(xpath_expr, namespaces):
|
||||
values = []
|
||||
for e in group:
|
||||
# TODO need a smart way to always have the correct namespaces for a document
|
||||
alto_namespace = ET.QName(e).namespace
|
||||
r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
|
||||
r = e.xpath(xpath_expr, namespaces=namespaces)
|
||||
values += r
|
||||
values = np.array([float(v) for v in values])
|
||||
value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
|
||||
|
||||
statistics = {}
|
||||
statistics[f'{xpath_expr}-mean'] = np.mean(values)
|
||||
return statistics
|
||||
|
||||
value['Page'].update(xpath_statistics(xpath_expr, namespaces))
|
||||
|
||||
|
||||
elif localname == 'Styles':
|
||||
pass
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue