1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-08 03:10:15 +02:00

🚧 ALTO: Calculate mean of String@WC

This commit is contained in:
Gerber, Mike 2022-05-23 19:12:39 +02:00
parent 9b3db1cd1d
commit e24a846ea2

View file

@ -5,6 +5,7 @@ import os
import re import re
import warnings import warnings
import sys import sys
from xml.dom.expatbuilder import Namespaces
from lxml import etree as ET from lxml import etree as ET
from itertools import groupby from itertools import groupby
from operator import attrgetter from operator import attrgetter
@ -13,6 +14,7 @@ from collections.abc import MutableMapping, Sequence
import click import click
import pandas as pd import pandas as pd
import numpy as np
from tqdm import tqdm from tqdm import tqdm
from .lib import TagGroup, sorted_groupby, flatten, ns from .lib import TagGroup, sorted_groupby, flatten, ns
@ -77,6 +79,17 @@ def alto_to_dict(alto, raise_errors=True):
value['Page'] = {} value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts()) value['Page'].update(TagGroup(tag, group).subelement_counts())
xpath_expr = "//alto:String/@WC"
values = []
for e in group:
# TODO: Find a reliable way to always use the correct namespaces for a document, instead of deriving the namespace from each element.
alto_namespace = ET.QName(e).namespace
r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
values += r
values = np.array([float(v) for v in values])
value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
elif localname == 'Styles': elif localname == 'Styles':
pass pass
elif localname == 'Tags': elif localname == 'Tags':
@ -158,7 +171,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
csvwriter.writerow([alto_file, caught_warning.message]) csvwriter.writerow([alto_file, caught_warning.message])
except Exception as e: except Exception as e:
logger.error('Exception in {}: {}'.format(alto_file, e)) logger.error('Exception in {}: {}'.format(alto_file, e))
#import traceback; traceback.print_exc() import traceback; traceback.print_exc()
# Convert the alto_info List[Dict] to a pandas DataFrame # Convert the alto_info List[Dict] to a pandas DataFrame
columns = [] columns = []