mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
🚧 ALTO: Move xpath_statistics to TagGroup class
This commit is contained in:
parent
9246519162
commit
aa4e8e290d
2 changed files with 23 additions and 18 deletions
|
@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True):
|
||||||
elif localname == 'Layout':
|
elif localname == 'Layout':
|
||||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||||
elif localname == 'Page':
|
elif localname == 'Page':
|
||||||
value['Page'] = {}
|
|
||||||
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
|
||||||
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
|
||||||
|
|
||||||
xpath_expr = "//alto:String/@WC"
|
|
||||||
alto_namespace = ET.QName(group[0]).namespace
|
alto_namespace = ET.QName(group[0]).namespace
|
||||||
namespaces={"alto": alto_namespace}
|
namespaces={"alto": alto_namespace}
|
||||||
|
|
||||||
def xpath_statistics(xpath_expr, namespaces):
|
value[localname] = {}
|
||||||
values = []
|
value[localname].update(TagGroup(tag, group).is_singleton().attributes())
|
||||||
for e in group:
|
value[localname].update(TagGroup(tag, group).subelement_counts())
|
||||||
r = e.xpath(xpath_expr, namespaces=namespaces)
|
value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
|
||||||
values += r
|
|
||||||
values = np.array([float(v) for v in values])
|
|
||||||
|
|
||||||
statistics = {}
|
|
||||||
statistics[f'{xpath_expr}-mean'] = np.mean(values)
|
|
||||||
return statistics
|
|
||||||
|
|
||||||
value['Page'].update(xpath_statistics(xpath_expr, namespaces))
|
|
||||||
|
|
||||||
|
|
||||||
elif localname == 'Styles':
|
elif localname == 'Styles':
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -3,6 +3,7 @@ import re
|
||||||
import warnings
|
import warnings
|
||||||
from typing import List, Sequence, MutableMapping
|
from typing import List, Sequence, MutableMapping
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
|
||||||
|
@ -206,6 +207,24 @@ class TagGroup:
|
||||||
counts[key] = counts.get(key, 0) + 1
|
counts[key] = counts.get(key, 0) + 1
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
def xpath_statistics(self, xpath_expr, namespaces):
    """
    Extract values and calculate statistics

    Extract values using the given XPath expression from every element of the group, convert
    them to float and return descriptive statistics on the values.

    :param xpath_expr: XPath expression evaluated on each element of ``self.group``
    :param namespaces: prefix-to-namespace mapping passed on to each ``xpath()`` call
    :return: dict with the single key ``f'{xpath_expr}-mean'`` holding the mean of all
        extracted values, or NaN if the expression matched nothing
    :raises ValueError: if an extracted value cannot be converted to float
    """
    matches = []
    for element in self.group:
        matches.extend(element.xpath(xpath_expr, namespaces=namespaces))
    values = np.array([float(v) for v in matches])

    # np.mean() on an empty array returns NaN but also emits a "Mean of empty slice"
    # RuntimeWarning. Documents without any matching value are a regular case, so
    # report NaN directly and keep the warning stream clean.
    mean = np.mean(values) if values.size else np.float64(np.nan)

    statistics = {}
    statistics[f'{xpath_expr}-mean'] = mean
    return statistics
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def sorted_groupby(iterable, key=None):
|
def sorted_groupby(iterable, key=None):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue