From 9246519162e2063f988b45098e7a06a30605cbe4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:33:54 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Extract=20a=20function?= =?UTF-8?q?=20to=20calculate=20statistics=20on=20xpath=20expressions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 3381c74..2d83051 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -81,14 +81,22 @@ def alto_to_dict(alto, raise_errors=True): value['Page'].update(TagGroup(tag, group).subelement_counts()) xpath_expr = "//alto:String/@WC" - values = [] - for e in group: - # TODO need a smart way to always have the correct namespaces for a document - alto_namespace = ET.QName(e).namespace - r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace}) - values += r - values = np.array([float(v) for v in values]) - value['Page'][f'{xpath_expr}-mean'] = np.mean(values) + alto_namespace = ET.QName(group[0]).namespace + namespaces={"alto": alto_namespace} + + def xpath_statistics(xpath_expr, namespaces): + values = [] + for e in group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + values = np.array([float(v) for v in values]) + + statistics = {} + statistics[f'{xpath_expr}-mean'] = np.mean(values) + return statistics + + value['Page'].update(xpath_statistics(xpath_expr, namespaces)) + elif localname == 'Styles': pass