mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
🚧 ALTO: Calculate mean of String@WC
This commit is contained in:
parent
9b3db1cd1d
commit
e24a846ea2
1 changed file with 14 additions and 1 deletion
|
@ -5,6 +5,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
import sys
|
import sys
|
||||||
|
from xml.dom.expatbuilder import Namespaces
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
|
@ -13,6 +14,7 @@ from collections.abc import MutableMapping, Sequence
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .lib import TagGroup, sorted_groupby, flatten, ns
|
from .lib import TagGroup, sorted_groupby, flatten, ns
|
||||||
|
@ -77,6 +79,17 @@ def alto_to_dict(alto, raise_errors=True):
|
||||||
value['Page'] = {}
|
value['Page'] = {}
|
||||||
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
||||||
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
||||||
|
|
||||||
|
xpath_expr = "//alto:String/@WC"
|
||||||
|
values = []
|
||||||
|
for e in group:
|
||||||
|
# TODO need a smart way to always have the correct namespaces for a document
|
||||||
|
alto_namespace = ET.QName(e).namespace
|
||||||
|
r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace})
|
||||||
|
values += r
|
||||||
|
values = np.array([float(v) for v in values])
|
||||||
|
value['Page'][f'{xpath_expr}-mean'] = np.mean(values)
|
||||||
|
|
||||||
elif localname == 'Styles':
|
elif localname == 'Styles':
|
||||||
pass
|
pass
|
||||||
elif localname == 'Tags':
|
elif localname == 'Tags':
|
||||||
|
@ -158,7 +171,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
csvwriter.writerow([alto_file, caught_warning.message])
|
csvwriter.writerow([alto_file, caught_warning.message])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error('Exception in {}: {}'.format(alto_file, e))
|
logger.error('Exception in {}: {}'.format(alto_file, e))
|
||||||
#import traceback; traceback.print_exc()
|
import traceback; traceback.print_exc()
|
||||||
|
|
||||||
# Convert the alto_info List[Dict] to a pandas DataFrame
|
# Convert the alto_info List[Dict] to a pandas DataFrame
|
||||||
columns = []
|
columns = []
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue