Merge branch 'feat/alto'
commit a2fb3ee387

@@ -1 +0,0 @@
from .modstool import *

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
import csv
import logging
import os
import warnings
from operator import attrgetter
from typing import List

import click
import pandas as pd
from lxml import etree as ET
from tqdm import tqdm

from .lib import TagGroup, sorted_groupby, flatten, ns


logger = logging.getLogger('alto4pandas')


def alto_to_dict(alto, raise_errors=True):
    """Convert ALTO metadata to a nested dictionary"""

    value = {}

    # Iterate through each group of tags
    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
        group = list(group)

        localname = ET.QName(tag).localname

        if localname == 'Description':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
        elif localname == 'MeasurementUnit':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'OCRProcessing':
            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
        elif localname == 'Processing':
            # TODO This enumerated descent is used more than once, DRY!
            for n, e in enumerate(group):
                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
        elif localname == 'ocrProcessingStep':
            for n, e in enumerate(group):
                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
        elif localname == 'preProcessingStep':
            for n, e in enumerate(group):
                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
        elif localname == 'processingDateTime':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'processingSoftware':
            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
        elif localname == 'processingAgency':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'processingStepDescription':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'processingStepSettings':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'softwareCreator':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'softwareName':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
        elif localname == 'softwareVersion':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()

        elif localname == 'sourceImageInformation':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
        elif localname == 'fileName':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()

        elif localname == 'Layout':
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
        elif localname == 'Page':
            alto_namespace = ET.QName(group[0]).namespace
            namespaces = {"alto": alto_namespace}

            value[localname] = {}
            value[localname].update(TagGroup(tag, group).is_singleton().attributes())
            value[localname].update(TagGroup(tag, group).subelement_counts())
            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))

        elif localname == 'Styles':
            pass
        elif localname == 'Tags':
            pass
        else:
            if raise_errors:
                print(value)
                raise ValueError('Unknown tag "{}"'.format(tag))
            else:
                pass

    return value
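
# For illustration: given an ALTO root whose <Description> contains only
# <MeasurementUnit>pixel</MeasurementUnit>, alto_to_dict returns
# {'Description': {'MeasurementUnit': 'pixel'}}, which flatten() then turns into
# {'Description_MeasurementUnit': 'pixel'}.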


def walk(m):
    # XXX do this in modstool, too
    if os.path.isdir(m):
        tqdm.write(f'Scanning directory {m}')
        for f in tqdm(os.scandir(m), leave=False):
            if f.is_file() and not f.name.startswith('.'):
                yield f.path
            elif f.is_dir():
                try:
                    yield from walk(f.path)
                except PermissionError:
                    warnings.warn(f"Error walking {f.path}")
    else:
        yield m  # m is a plain path string here, not a DirEntry


@click.command()
@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
              default='alto_info_df.pkl', show_default=True)
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
    """
    A tool to convert the ALTO metadata in ALTO_FILES to a pandas DataFrame.

    Each ALTO_FILES argument is assumed to be an ALTO document. It may also be a directory,
    in which case the tool reads all files in the directory recursively.

    alto4pandas writes two output files: a pickled pandas DataFrame and a CSV file with all
    conversion warnings.
    """

    # Extend file list if directories are given
    alto_files_real = []
    for m in alto_files:
        for x in walk(m):
            alto_files_real.append(x)

    # Process ALTO files
    with open(output_file + '.warnings.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        alto_info = []
        logger.info('Processing ALTO files')
        for alto_file in tqdm(alto_files_real, leave=False):
            try:
                root = ET.parse(alto_file).getroot()
                alto = root  # XXX .find('alto:alto', ns) does not work here

                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter('always')  # do NOT filter double occurrences

                    # ALTO
                    d = flatten(alto_to_dict(alto, raise_errors=True))
                    # "meta"
                    d['alto_file'] = alto_file
                    d['alto_xmlns'] = ET.QName(alto).namespace

                    alto_info.append(d)

                if caught_warnings:
                    # PyCharm thinks caught_warnings is not Iterable:
                    # noinspection PyTypeChecker
                    for caught_warning in caught_warnings:
                        csvwriter.writerow([alto_file, caught_warning.message])
            except Exception as e:
                logger.error('Exception in {}: {}'.format(alto_file, e))
                import traceback
                traceback.print_exc()

    # Convert the alto_info List[Dict] to a pandas DataFrame
    columns = []
    for m in alto_info:
        for c in m.keys():
            if c not in columns:
                columns.append(c)
    data = [[m.get(c) for c in columns] for m in alto_info]
    index = [m['alto_file'] for m in alto_info]  # TODO use ppn + page?
    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
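
    # (Essentially pd.DataFrame(alto_info, index=index); the explicit loop keeps the
    # column order at first appearance and missing values as None rather than NaN.)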

    # Pickle the DataFrame
    logger.info('Writing DataFrame to {}'.format(output_file))
    alto_info_df.to_pickle(output_file)
    if output_csv:
        logger.info('Writing CSV to {}'.format(output_csv))
        alto_info_df.to_csv(output_csv)
    if output_xlsx:
        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
        alto_info_df.to_excel(output_xlsx)


def main():
    logging.basicConfig(level=logging.INFO)

    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)

    process()


if __name__ == '__main__':
    main()


@@ -0,0 +1,285 @@
from itertools import groupby
import re
import warnings
from typing import List, Sequence, MutableMapping

import numpy as np
from lxml import etree as ET


__all__ = ["ns"]


ns = {
    'mets': 'http://www.loc.gov/METS/',
    'mods': 'http://www.loc.gov/mods/v3',
    "alto": "http://www.loc.gov/standards/alto/ns-v2"
}


class TagGroup:
    """Helper class to simplify the parsing and checking of MODS and ALTO metadata"""

    def __init__(self, tag, group: List[ET.Element]):
        self.tag = tag
        self.group = group

    def __str__(self):
        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)

    def is_singleton(self):
        if len(self.group) != 1:
            raise ValueError('More than one instance: {}'.format(self))
        return self

    def has_no_attributes(self):
        return self.has_attributes({})

    def has_attributes(self, attrib):
        if not isinstance(attrib, Sequence):
            attrib = [attrib]
        if not all(e.attrib in attrib for e in self.group):
            raise ValueError('One or more elements have unexpected attributes: {}'.format(self))
        return self

    def ignore_attributes(self):
        # This serves as documentation for now.
        return self

    def sort(self, key=None, reverse=False):
        self.group = sorted(self.group, key=key, reverse=reverse)
        return self

    def text(self, separator='\n'):
        t = ''
        for e in self.group:
            if t != '':
                t += separator
            if e.text:
                t += e.text
        return t

    def text_set(self):
        return {e.text for e in self.group}

    def descend(self, raise_errors):
        return _to_dict(self.is_singleton().group[0], raise_errors)

    def filter(self, cond, warn=None):
        new_group = []
        for e in self.group:
            if cond(e):
                new_group.append(e)
            else:
                if warn:
                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
        return TagGroup(self.tag, new_group)

    def force_singleton(self, warn=True):
        if len(self.group) == 1:
            return self
        else:
            if warn:
                warnings.warn('Forced single instance of {}'.format(self.tag))
            return TagGroup(self.tag, self.group[:1])

    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'

    def fix_date(self):

        for e in self.group:
            if e.attrib.get('encoding') == 'w3cdtf':
                # This should be 'iso8601' according to MODS-AP 2.3.1
                warnings.warn('Changed w3cdtf encoding to iso8601')
                e.attrib['encoding'] = 'iso8601'

        new_group = []
        for e in self.group:
            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
                new_group.append(e)
            elif re.match(self.RE_ISO8601_DATE, e.text):
                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
                e.attrib['encoding'] = 'iso8601'
                new_group.append(e)
            elif re.match(self.RE_GERMAN_DATE, e.text):
                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
                m = re.match(self.RE_GERMAN_DATE, e.text)
                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
                e.attrib['encoding'] = 'iso8601'
                new_group.append(e)
            else:
                warnings.warn('Not an iso8601 date: "{}"'.format(e.text))
                new_group.append(e)
        self.group = new_group

        # Notes:
        # - There are dates with the misspelled qualifier 'aproximate'
        # - Rough periods are sometimes given either by:
        #   - years like '19xx'
        #   - or 'approximate' date ranges with point="start"/"end" attributes set
        #     (this could be correct according to MODS-AP 2.3.1)
        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"

        return self

    def fix_event_type(self):
        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
        # Fix this for special cases.

        for e in self.group:
            if e.attrib.get('eventType') is None:
                try:
                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
                            e.find('mods:edition', ns).text == '[Electronic ed.]':
                        e.attrib['eventType'] = 'digitization'
                        warnings.warn('Fixed eventType for electronic ed.')
                        continue
                except AttributeError:
                    pass
                try:
                    if e.find('mods:dateIssued', ns) is not None:
                        e.attrib['eventType'] = 'publication'
                        warnings.warn('Fixed eventType for an issued origin')
                        continue
                except AttributeError:
                    pass
                try:
                    if e.find('mods:dateCreated', ns) is not None:
                        e.attrib['eventType'] = 'production'
                        warnings.warn('Fixed eventType for a created origin')
                        continue
                except AttributeError:
                    pass
        return self

    def fix_script_term(self):
        for e in self.group:
            # MODS-AP 2.3.1 is not clear about this, but it looks like this should be lower case.
            if e.attrib['authority'] == 'ISO15924':
                e.attrib['authority'] = 'iso15924'
                warnings.warn('Changed scriptTerm authority to lower case')
        return self
    def merge_sub_tags_to_set(self):
        from .modstool import mods_to_dict
        value = {}

        sub_dicts = [mods_to_dict(e) for e in self.group]
        sub_tags = {k for d in sub_dicts for k in d.keys()}
        for sub_tag in sub_tags:
            s = set()
            for d in sub_dicts:
                v = d.get(sub_tag)
                if v:
                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
                    # German language document.
                    if isinstance(v, set):
                        s.update(v)
                    else:
                        s.add(v)
            value[sub_tag] = s
        return value

    def attributes(self):
        """
        Return a merged dict of all attributes of the tag group.

        Probably most useful if used on a singleton, for example:

            value['Page'] = TagGroup(tag, group).is_singleton().attributes()
        """
        attrib = {}
        for e in self.group:
            for a, v in e.attrib.items():
                a_localname = ET.QName(a).localname
                attrib[a_localname] = v
        return attrib

    def subelement_counts(self):
        counts = {}
        for e in self.group:
            for x in e.iter():
                tag = ET.QName(x.tag).localname
                key = f"{tag}-count"
                counts[key] = counts.get(key, 0) + 1
        return counts
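
    # Note: e.iter() yields the element itself first, so the counts include an entry
    # for the group's own tag (e.g. 'Page-count' for a Page group).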

    def xpath_statistics(self, xpath_expr, namespaces):
        """
        Extract values and calculate statistics.

        Extract values using the given XPath expression, convert them to float and return descriptive
        statistics on the values.
        """
        values = []
        for e in self.group:
            r = e.xpath(xpath_expr, namespaces=namespaces)
            values += r
        values = np.array([float(v) for v in values])

        statistics = {}
        if values.size > 0:
            statistics[f'{xpath_expr}-mean'] = np.mean(values)
            statistics[f'{xpath_expr}-median'] = np.median(values)
            statistics[f'{xpath_expr}-std'] = np.std(values)
            statistics[f'{xpath_expr}-min'] = np.min(values)
            statistics[f'{xpath_expr}-max'] = np.max(values)
        return statistics
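
    # For example, the Page handling in alto_to_dict calls
    # xpath_statistics("//alto:String/@WC", namespaces), which produces keys such as
    # '//alto:String/@WC-mean' and '//alto:String/@WC-max' over all word confidences.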


def sorted_groupby(iterable, key=None):
    """
    Sort iterable by key and then group by the same key.

    itertools.groupby() assumes that the iterable is already sorted. This function
    conveniently sorts the iterable first, and then groups its elements.
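
    A doctest-style sketch:

    >>> [(k, list(g)) for k, g in sorted_groupby([2, 1, 2])]
    [(1, [1]), (2, [2, 2])]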
    """
    return groupby(sorted(iterable, key=key), key=key)


def _to_dict(root, raise_errors):
    from .modstool import mods_to_dict, mets_to_dict
    from .alto4pandas import alto_to_dict

    root_name = ET.QName(root.tag)
    if root_name.namespace == "http://www.loc.gov/mods/v3":
        return mods_to_dict(root, raise_errors)
    elif root_name.namespace == "http://www.loc.gov/METS/":
        return mets_to_dict(root, raise_errors)
    elif root_name.namespace in [
        "http://schema.ccs-gmbh.com/ALTO",
        "http://www.loc.gov/standards/alto/",
        "http://www.loc.gov/standards/alto/ns-v2#",
        "http://www.loc.gov/standards/alto/ns-v4#",
    ]:
        return alto_to_dict(root, raise_errors)
    else:
        raise ValueError(f"Unknown namespace {root_name.namespace}")


def flatten(d: MutableMapping, parent='', separator='_'):
    """
    Flatten the given nested dict.

    It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
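
    A doctest-style sketch:

    >>> flatten({'Layout': {'Page': {'ID': 'Page1'}}})
    {'Layout_Page_ID': 'Page1'}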
    """
    items = []

    for k, v in d.items():
        if parent:
            new_key = parent + separator + k
        else:
            new_key = k

        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, separator=separator).items())
        else:
            items.append((new_key, v))

    return dict(items)

@@ -0,0 +1,39 @@
# lxml is required here: alto_to_dict uses element.xpath() for the Page statistics
from lxml import etree as ET

from qurator.modstool.alto4pandas import alto_to_dict
from qurator.modstool.lib import flatten


def dict_fromstring(x):
    return flatten(alto_to_dict(ET.fromstring(x)))


def test_Page_counts():
    """
    Elements below Layout/Page should be counted
    """
    d = dict_fromstring("""
    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
        <Layout>
            <Page ID="Page1" PHYSICAL_IMG_NR="1">
                <TextBlock ID="Page1_Block1">
                    <TextLine>
                        <String STYLE="bold" WC="0.8937500119" CONTENT="Staatsbibliothek" />
                    </TextLine>
                    <TextLine>
                        <String STYLE="bold" WC="0.8899999857" CONTENT="zu" />
                        <String STYLE="bold" WC="0.9866666794" CONTENT="Berlin" />
                    </TextLine>
                    <TextLine>
                        <String STYLE="bold" WC="1." CONTENT="WM" />
                        <String STYLE="bold" WC="0.8927272558" CONTENT="Preußischer" />
                        <String STYLE="bold" WC="0.9058333039" CONTENT="Kulturbesitz" />
                    </TextLine>
                </TextBlock>
            </Page>
        </Layout>
    </alto>
    """)
    assert d['Layout_Page_TextBlock-count'] == 1
    assert d['Layout_Page_TextLine-count'] == 3
    assert d['Layout_Page_String-count'] == 6

@@ -0,0 +1 @@
pytest