From 9246519162e2063f988b45098e7a06a30605cbe4 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 May 2022 19:33:54 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Extract=20a=20function?=
 =?UTF-8?q?=20to=20calculate=20statistics=20on=20xpath=20expressions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/modstool/altotool.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py
index 3381c74..2d83051 100755
--- a/qurator/modstool/altotool.py
+++ b/qurator/modstool/altotool.py
@@ -81,14 +81,22 @@ def alto_to_dict(alto, raise_errors=True):
             value['Page'].update(TagGroup(tag, group).subelement_counts())
 
             xpath_expr = "//alto:String/@WC"
-            values = []
-            for e in group:
-                # TODO need a smart way to always have the correct namespaces for a document
-                alto_namespace = ET.QName(e).namespace
-                r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
-                values += r
-            values = np.array([float(v) for v in values])
-            value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
+            alto_namespace = ET.QName(group[0]).namespace
+            namespaces={"alto": alto_namespace}
+
+            def xpath_statistics(xpath_expr, namespaces):  # closure: iterates `group` from the enclosing scope
+                values = []  # raw XPath results gathered over every element of `group`
+                for e in group:
+                    r = e.xpath(xpath_expr, namespaces=namespaces)  # NOTE(review): a leading `//` is document-absolute in XPath, so each element may yield the same matches - confirm
+                    values += r
+                values = np.array([float(v) for v in values])  # every result must be float-convertible
+
+                statistics = {}  # result dict, keyed by '<xpath_expr>-mean'
+                statistics[f'{xpath_expr}-mean'] = np.mean(values)  # NOTE(review): presumably nan when nothing matched - confirm np.mean on empty input
+                return statistics
+
+            value['Page'].update(xpath_statistics(xpath_expr, namespaces))
+
 
         elif localname == 'Styles':
             pass