🚧 ALTO: Calculate mean of String@WC

master
Gerber, Mike 3 years ago
parent 9b3db1cd1d
commit e24a846ea2

@ -5,6 +5,7 @@ import os
import re import re
import warnings import warnings
import sys import sys
from xml.dom.expatbuilder import Namespaces
from lxml import etree as ET from lxml import etree as ET
from itertools import groupby from itertools import groupby
from operator import attrgetter from operator import attrgetter
@ -13,6 +14,7 @@ from collections.abc import MutableMapping, Sequence
import click import click
import pandas as pd import pandas as pd
import numpy as np
from tqdm import tqdm from tqdm import tqdm
from .lib import TagGroup, sorted_groupby, flatten, ns from .lib import TagGroup, sorted_groupby, flatten, ns
@ -77,6 +79,17 @@ def alto_to_dict(alto, raise_errors=True):
value['Page'] = {} value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts()) value['Page'].update(TagGroup(tag, group).subelement_counts())
xpath_expr = "//alto:String/@WC"
values = []
for e in group:
# TODO need a smart way to always have the correct namespaces for a document
alto_namespace = ET.QName(e).namespace
r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
values += r
values = np.array([float(v) for v in values])
value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
elif localname == 'Styles': elif localname == 'Styles':
pass pass
elif localname == 'Tags': elif localname == 'Tags':
@ -158,7 +171,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
csvwriter.writerow([alto_file, caught_warning.message]) csvwriter.writerow([alto_file, caught_warning.message])
except Exception as e: except Exception as e:
logger.error('Exception in {}: {}'.format(alto_file, e)) logger.error('Exception in {}: {}'.format(alto_file, e))
#import traceback; traceback.print_exc() import traceback; traceback.print_exc()
# Convert the alto_info List[Dict] to a pandas DataFrame # Convert the alto_info List[Dict] to a pandas DataFrame
columns = [] columns = []

Loading…
Cancel
Save