🚧 ALTO: Move xpath_statistics to TagGroup class

feat/alto
Gerber, Mike 2 years ago
parent 9246519162
commit aa4e8e290d

@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True):
elif localname == 'Layout':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif localname == 'Page':
value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts())
xpath_expr = "//alto:String/@WC"
alto_namespace = ET.QName(group[0]).namespace
namespaces={"alto": alto_namespace}
def xpath_statistics(xpath_expr, namespaces):
values = []
for e in group:
r = e.xpath(xpath_expr, namespaces=namespaces)
values += r
values = np.array([float(v) for v in values])
statistics = {}
statistics[f'{xpath_expr}-mean'] = np.mean(values)
return statistics
value['Page'].update(xpath_statistics(xpath_expr, namespaces))
value[localname] = {}
value[localname].update(TagGroup(tag, group).is_singleton().attributes())
value[localname].update(TagGroup(tag, group).subelement_counts())
value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
elif localname == 'Styles':
pass

@ -3,6 +3,7 @@ import re
import warnings
from typing import List, Sequence, MutableMapping
import numpy as np
from lxml import etree as ET
@ -206,6 +207,24 @@ class TagGroup:
counts[key] = counts.get(key, 0) + 1
return counts
def xpath_statistics(self, xpath_expr, namespaces):
    """
    Extract values and calculate statistics

    Evaluate the given XPath expression on every element of ``self.group``,
    convert all results to float and return descriptive statistics on them.

    Parameters:
        xpath_expr: XPath expression that is passed to each element's
            ``xpath()`` method; it is also used as the prefix of the keys in
            the returned dict.
        namespaces: prefix-to-namespace mapping forwarded to ``xpath()`` as
            its ``namespaces`` keyword argument.

    Returns:
        A dict with the single key ``f'{xpath_expr}-mean'`` holding the mean
        of all extracted values. If the expression matches nothing, the mean
        is NaN.

    Raises:
        ValueError: if an extracted value cannot be converted to float.
    """
    # NOTE(review): presumably the group holds lxml elements and the
    # expression selects attribute values (e.g. "//alto:String/@WC"), so
    # every xpath() call yields a list of strings - confirm against callers.
    values = np.array(
        [
            float(v)
            for e in self.group
            for v in e.xpath(xpath_expr, namespaces=namespaces)
        ],
        dtype=float,
    )

    # np.mean() on an empty array returns NaN but also emits
    # "Mean of empty slice" / invalid-value RuntimeWarnings. No match is a
    # legitimate outcome (the attribute may simply be absent), so report NaN
    # without the warning noise.
    mean = np.mean(values) if values.size else np.float64(np.nan)

    statistics = {}
    statistics[f'{xpath_expr}-mean'] = mean
    return statistics
def sorted_groupby(iterable, key=None):

Loading…
Cancel
Save