🚧 ALTO: Move xpath_statistics to TagGroup class

feat/alto
Gerber, Mike 2 years ago
parent 9246519162
commit aa4e8e290d

@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True):
elif localname == 'Layout': elif localname == 'Layout':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif localname == 'Page': elif localname == 'Page':
value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts())
xpath_expr = "//alto:String/@WC"
alto_namespace = ET.QName(group[0]).namespace alto_namespace = ET.QName(group[0]).namespace
namespaces={"alto": alto_namespace} namespaces={"alto": alto_namespace}
def xpath_statistics(xpath_expr, namespaces): value[localname] = {}
values = [] value[localname].update(TagGroup(tag, group).is_singleton().attributes())
for e in group: value[localname].update(TagGroup(tag, group).subelement_counts())
r = e.xpath(xpath_expr, namespaces=namespaces) value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
values += r
values = np.array([float(v) for v in values])
statistics = {}
statistics[f'{xpath_expr}-mean'] = np.mean(values)
return statistics
value['Page'].update(xpath_statistics(xpath_expr, namespaces))
elif localname == 'Styles': elif localname == 'Styles':
pass pass

@ -3,6 +3,7 @@ import re
import warnings import warnings
from typing import List, Sequence, MutableMapping from typing import List, Sequence, MutableMapping
import numpy as np
from lxml import etree as ET from lxml import etree as ET
@ -206,6 +207,24 @@ class TagGroup:
counts[key] = counts.get(key, 0) + 1 counts[key] = counts.get(key, 0) + 1
return counts return counts
def xpath_statistics(self, xpath_expr, namespaces):
    """
    Extract values and calculate statistics

    Extract values using the given XPath expression, convert them to float and return descriptive
    statistics on the values.

    The expression is evaluated on every element of ``self.group`` and all results are pooled
    before the statistics are computed.

    Returns a dict with the single key ``f'{xpath_expr}-mean'``. If the expression matches
    nothing, the mean is NaN.

    Raises ValueError if a matched value cannot be converted to float.
    """
    values = np.array(
        [
            float(v)
            for e in self.group
            for v in e.xpath(xpath_expr, namespaces=namespaces)
        ],
        dtype=float,
    )

    statistics = {}
    if values.size > 0:
        mean = np.mean(values)
    else:
        # np.mean() of an empty array also yields NaN, but emits a
        # "Mean of empty slice" RuntimeWarning on the way. Return the same
        # value without the warning so that elements without any matching
        # values do not spam (or, with warnings-as-errors, abort) a run.
        mean = np.float64(np.nan)
    statistics[f'{xpath_expr}-mean'] = mean
    return statistics
def sorted_groupby(iterable, key=None): def sorted_groupby(iterable, key=None):

Loading…
Cancel
Save