From aa4e8e290dd1d2f1e1f382ebfa799eafac8f6795 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:39:21 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Move=20xpath=5Fstatistic?= =?UTF-8?q?s=20to=20TagGroup=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 22 ++++------------------ qurator/modstool/lib.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 2d83051..218e448 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True): elif localname == 'Layout': value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) elif localname == 'Page': - value['Page'] = {} - value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) - value['Page'].update(TagGroup(tag, group).subelement_counts()) - - xpath_expr = "//alto:String/@WC" alto_namespace = ET.QName(group[0]).namespace namespaces={"alto": alto_namespace} - def xpath_statistics(xpath_expr, namespaces): - values = [] - for e in group: - r = e.xpath(xpath_expr, namespaces=namespaces) - values += r - values = np.array([float(v) for v in values]) - - statistics = {} - statistics[f'{xpath_expr}-mean'] = np.mean(values) - return statistics - - value['Page'].update(xpath_statistics(xpath_expr, namespaces)) - + value[localname] = {} + value[localname].update(TagGroup(tag, group).is_singleton().attributes()) + value[localname].update(TagGroup(tag, group).subelement_counts()) + value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)) elif localname == 'Styles': pass diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 75d0f86..383ba8f 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -3,6 +3,7 @@ import re import warnings from typing import List, Sequence, MutableMapping +import numpy as np from lxml import etree as ET @@ -206,6 +207,24 @@ class TagGroup: counts[key] = counts.get(key, 0) + 1 return counts + def xpath_statistics(self, xpath_expr, namespaces): + """ + Extract values and calculate statistics + + Extract values using the given XPath expression, convert them to float and return descriptive + statistics on the values. + """ + values = [] + for e in self.group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + values = np.array([float(v) for v in values]) + + statistics = {} + statistics[f'{xpath_expr}-mean'] = np.mean(values) + return statistics + + def sorted_groupby(iterable, key=None):