From e86369e76d62eac26a3daf2e6757348451f06b9b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 4 May 2022 20:02:27 +0200 Subject: [PATCH 01/19] =?UTF-8?q?=F0=9F=9A=A7=20Add=20support=20for=20ALTO?= =?UTF-8?q?=20Description?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/__init__.py | 1 - qurator/modstool/altotool.py | 170 +++++++++++++++++++++ qurator/modstool/lib.py | 229 ++++++++++++++++++++++++++++ qurator/modstool/modstool.py | 211 +------------------------ qurator/modstool/tests/test_mets.py | 1 - requirements-test.txt | 1 + setup.py | 5 +- 7 files changed, 406 insertions(+), 212 deletions(-) create mode 100755 qurator/modstool/altotool.py create mode 100644 qurator/modstool/lib.py create mode 100644 requirements-test.txt diff --git a/qurator/modstool/__init__.py b/qurator/modstool/__init__.py index eabaacd..e69de29 100644 --- a/qurator/modstool/__init__.py +++ b/qurator/modstool/__init__.py @@ -1 +0,0 @@ -from .modstool import * diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py new file mode 100755 index 0000000..9006601 --- /dev/null +++ b/qurator/modstool/altotool.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +import csv +import logging +import os +import re +import warnings +import sys +from lxml import etree as ET +from itertools import groupby +from operator import attrgetter +from typing import List +from collections.abc import MutableMapping, Sequence + +import click +import pandas as pd +from tqdm import tqdm + +from .lib import TagGroup, sorted_groupby, flatten, ns + + +logger = logging.getLogger('altotool') + + + +def alto_to_dict(alto, raise_errors=True): + """Convert ALTO metadata to a nested dictionary""" + + value = {} + + # Iterate through each group of tags + for tag, group in sorted_groupby(alto, key=attrgetter('tag')): + group = list(group) + + # XXX Namespaces seem to use a trailing / sometimes, sometimes not. + # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS}) + if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description': + value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit': + value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing': + value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep': + for n, e in enumerate(group): + value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime': + value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware': + value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator': + value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName': + value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion': + value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': + pass # TODO + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles': + pass + else: + if raise_errors: + print(value) + raise ValueError('Unknown tag "{}"'.format(tag)) + else: + pass + + return value + + + +def walk(m): + # XXX do this in modstool, too + if os.path.isdir(m): + logger.info('Scanning directory {}'.format(m)) + for f in tqdm(os.scandir(m), leave=False): + if f.is_file() and not f.name.startswith('.'): + yield f.path + elif f.is_dir(): + yield from walk(f.path) + else: + yield m.path + + + +@click.command() +@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1) +@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', + default='alto_info_df.pkl', show_default=True) +@click.option('--output-csv', type=click.Path(), help='Output CSV file') +@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') +def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str): + """ + A tool to convert the ALTO metadata in INPUT to a pandas DataFrame. + + INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads + all files in the directory. + + altotool writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. + """ + + # Extend file list if directories are given + alto_files_real = [] + for m in alto_files: + for x in walk(m): + alto_files_real.append(x) + + # Process ALTO files + with open(output_file + '.warnings.csv', 'w') as csvfile: + csvwriter = csv.writer(csvfile) + alto_info = [] + logger.info('Processing ALTO files') + for alto_file in tqdm(alto_files_real, leave=False): + try: + root = ET.parse(alto_file).getroot() + alto = root # XXX .find('alto:alto', ns) does not work here + + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.simplefilter('always') # do NOT filter double occurrences + + # MODS + d = flatten(alto_to_dict(alto, raise_errors=True)) + # METS + d_alto = flatten(alto_to_dict(alto, raise_errors=True)) + for k, v in d_alto.items(): + d[f"alto_{k}"] = v + # "meta" + d['alto_file'] = alto_file + + alto_info.append(d) + + if caught_warnings: + # PyCharm thinks caught_warnings is not Iterable: + # noinspection PyTypeChecker + for caught_warning in caught_warnings: + csvwriter.writerow([alto_file, caught_warning.message]) + except Exception as e: + logger.error('Exception in {}: {}'.format(alto_file, e)) + #import traceback; traceback.print_exc() + + # Convert the alto_info List[Dict] to a pandas DataFrame + columns = [] + for m in alto_info: + for c in m.keys(): + if c not in columns: + columns.append(c) + data = [[m.get(c) for c in columns] for m in alto_info] + index = [m['alto_file'] for m in alto_info] # TODO use ppn + page? + alto_info_df = pd.DataFrame(data=data, index=index, columns=columns) + + # Pickle the DataFrame + logger.info('Writing DataFrame to {}'.format(output_file)) + alto_info_df.to_pickle(output_file) + if output_csv: + logger.info('Writing CSV to {}'.format(output_csv)) + alto_info_df.to_csv(output_csv) + if output_xlsx: + logger.info('Writing Excel .xlsx to {}'.format(output_xlsx)) + alto_info_df.to_excel(output_xlsx) + + +def main(): + logging.basicConfig(level=logging.INFO) + + for prefix, uri in ns.items(): + ET.register_namespace(prefix, uri) + + process() + + +if __name__ == '__main__': + main() diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py new file mode 100644 index 0000000..c22689e --- /dev/null +++ b/qurator/modstool/lib.py @@ -0,0 +1,229 @@ +from itertools import groupby +import re +import warnings +from typing import List, Sequence, MutableMapping + +from lxml import etree as ET + + +__all__ = ["ns"] + + +ns = { + 'mets': 'http://www.loc.gov/METS/', + 'mods': 'http://www.loc.gov/mods/v3', + "alto": "http://www.loc.gov/standards/alto/ns-v2" +} + + + +class TagGroup: + """Helper class to simplify the parsing and checking of MODS metadata""" + + def __init__(self, tag, group: List[ET.Element]): + self.tag = tag + self.group = group + + def __str__(self): + return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group) + + def is_singleton(self): + if len(self.group) != 1: + raise ValueError('More than one instance: {}'.format(self)) + return self + + def has_no_attributes(self): + return self.has_attributes({}) + + def has_attributes(self, attrib): + if not isinstance(attrib, Sequence): + attrib = [attrib] + if not all(e.attrib in attrib for e in self.group): + raise ValueError('One or more element has unexpected attributes: {}'.format(self)) + return self + + def ignore_attributes(self): + # This serves as documentation for now. + return self + + def sort(self, key=None, reverse=False): + self.group = sorted(self.group, key=key, reverse=reverse) + return self + + def text(self, separator='\n'): + t = '' + for e in self.group: + if t != '': + t += separator + t += e.text + return t + + def text_set(self): + return {e.text for e in self.group} + + def descend(self, raise_errors): + return _to_dict(self.is_singleton().group[0], raise_errors) + + def filter(self, cond, warn=None): + new_group = [] + for e in self.group: + if cond(e): + new_group.append(e) + else: + if warn: + warnings.warn('Filtered {} element ({})'.format(self.tag, warn)) + return TagGroup(self.tag, new_group) + + def force_singleton(self, warn=True): + if len(self.group) == 1: + return self + else: + if warn: + warnings.warn('Forced single instance of {}'.format(self.tag)) + return TagGroup(self.tag, self.group[:1]) + + RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX' + RE_GERMAN_DATE = r'^(?P
\d{2})\.(?P\d{2})\.(?P\d{4})$' + + def fix_date(self): + + for e in self.group: + if e.attrib.get('encoding') == 'w3cdtf': + # This should be 'iso8601' according to MODS-AP 2.3.1 + warnings.warn('Changed w3cdtf encoding to iso8601') + e.attrib['encoding'] = 'iso8601' + + new_group = [] + for e in self.group: + if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text): + new_group.append(e) + elif re.match(self.RE_ISO8601_DATE, e.text): + warnings.warn('Added iso8601 encoding to date {}'.format(e.text)) + e.attrib['encoding'] = 'iso8601' + new_group.append(e) + elif re.match(self.RE_GERMAN_DATE, e.text): + warnings.warn('Converted date {} to iso8601 encoding'.format(e.text)) + m = re.match(self.RE_GERMAN_DATE, e.text) + e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd')) + e.attrib['encoding'] = 'iso8601' + new_group.append(e) + else: + warnings.warn('Not a iso8601 date: "{}"'.format(e.text)) + new_group.append(e) + self.group = new_group + + # Notes: + # - There are dates with the misspelled qualifier 'aproximate' + # - Rough periods are sometimes given either by: + # - years like '19xx' + # - or 'approximate' date ranges with point="start"/"end" attributes set + # (this could be correct according to MODS-AP 2.3.1) + # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier + # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)" + + return self + + def fix_event_type(self): + # According to MODS-AP 2.3.1, every originInfo should have its eventType set. + # Fix this for special cases. + + for e in self.group: + if e.attrib.get('eventType') is None: + try: + if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \ + e.find('mods:edition', ns).text == '[Electronic ed.]': + e.attrib['eventType'] = 'digitization' + warnings.warn('Fixed eventType for electronic ed.') + continue + except AttributeError: + pass + try: + if e.find('mods:dateIssued', ns) is not None: + e.attrib['eventType'] = 'publication' + warnings.warn('Fixed eventType for an issued origin') + continue + except AttributeError: + pass + try: + if e.find('mods:dateCreated', ns) is not None: + e.attrib['eventType'] = 'production' + warnings.warn('Fixed eventType for a created origin') + continue + except AttributeError: + pass + return self + + def fix_script_term(self): + for e in self.group: + # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case. + if e.attrib['authority'] == 'ISO15924': + e.attrib['authority'] = 'iso15924' + warnings.warn('Changed scriptTerm authority to lower case') + return self + + def merge_sub_tags_to_set(self): + value = {} + + sub_dicts = [mods_to_dict(e) for e in self.group] + sub_tags = {k for d in sub_dicts for k in d.keys()} + for sub_tag in sub_tags: + s = set() + for d in sub_dicts: + v = d.get(sub_tag) + if v: + # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a + # German language document. + if isinstance(v, set): + s.update(v) + else: + s.add(v) + value[sub_tag] = s + return value + + +def sorted_groupby(iterable, key=None): + """ + Sort iterable by key and then group by the same key. + + itertools.groupby() assumes that the iterable is already sorted. This function + conveniently sorts the iterable first, and then groups its elements. + """ + return groupby(sorted(iterable, key=key), key=key) + + +def _to_dict(root, raise_errors): + from .modstool import mods_to_dict, mets_to_dict + from .altotool import alto_to_dict + + root_name = ET.QName(root.tag) + if root_name.namespace == "http://www.loc.gov/mods/v3": + return mods_to_dict(root, raise_errors) + elif root_name.namespace == "http://www.loc.gov/METS/": + return mets_to_dict(root, raise_errors) + elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#": + return alto_to_dict(root, raise_errors) + else: + raise ValueError(f"Unknown namespace {root_name.namespace}") + + +def flatten(d: MutableMapping, parent='', separator='_'): + """ + Flatten the given nested dict. + + It is assumed that d maps strings to either another dictionary (similarly structured) or some other value. + """ + items = [] + + for k, v in d.items(): + if parent: + new_key = parent + separator + k + else: + new_key = k + + if isinstance(v, MutableMapping): + items.extend(flatten(v, new_key, separator=separator).items()) + else: + items.append((new_key, v)) + + return dict(items) + diff --git a/qurator/modstool/modstool.py b/qurator/modstool/modstool.py index 4b035ac..bc7429b 100755 --- a/qurator/modstool/modstool.py +++ b/qurator/modstool/modstool.py @@ -14,196 +14,11 @@ import click import pandas as pd from tqdm import tqdm +from .lib import sorted_groupby, TagGroup, ns -ns = { - 'mets': 'http://www.loc.gov/METS/', - 'mods': 'http://www.loc.gov/mods/v3' -} -logger = logging.getLogger('modstool') - -class TagGroup: - """Helper class to simplify the parsing and checking of MODS metadata""" - - def __init__(self, tag, group: List[ET.Element]): - self.tag = tag - self.group = group - - def __str__(self): - return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group) - - def is_singleton(self): - if len(self.group) != 1: - raise ValueError('More than one instance: {}'.format(self)) - return self - - def has_no_attributes(self): - return self.has_attributes({}) - - def has_attributes(self, attrib): - if not isinstance(attrib, Sequence): - attrib = [attrib] - if not all(e.attrib in attrib for e in self.group): - raise ValueError('One or more element has unexpected attributes: {}'.format(self)) - return self - - def ignore_attributes(self): - # This serves as documentation for now. - return self - - def sort(self, key=None, reverse=False): - self.group = sorted(self.group, key=key, reverse=reverse) - return self - - def text(self, separator='\n'): - t = '' - for e in self.group: - if t != '': - t += separator - t += e.text - return t - - def text_set(self): - return {e.text for e in self.group} - - def descend(self, raise_errors): - return _to_dict(self.is_singleton().group[0], raise_errors) - - def filter(self, cond, warn=None): - new_group = [] - for e in self.group: - if cond(e): - new_group.append(e) - else: - if warn: - warnings.warn('Filtered {} element ({})'.format(self.tag, warn)) - return TagGroup(self.tag, new_group) - def force_singleton(self, warn=True): - if len(self.group) == 1: - return self - else: - if warn: - warnings.warn('Forced single instance of {}'.format(self.tag)) - return TagGroup(self.tag, self.group[:1]) - - RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX' - RE_GERMAN_DATE = r'^(?P
\d{2})\.(?P\d{2})\.(?P\d{4})$' - - def fix_date(self): - - for e in self.group: - if e.attrib.get('encoding') == 'w3cdtf': - # This should be 'iso8601' according to MODS-AP 2.3.1 - warnings.warn('Changed w3cdtf encoding to iso8601') - e.attrib['encoding'] = 'iso8601' - - new_group = [] - for e in self.group: - if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text): - new_group.append(e) - elif re.match(self.RE_ISO8601_DATE, e.text): - warnings.warn('Added iso8601 encoding to date {}'.format(e.text)) - e.attrib['encoding'] = 'iso8601' - new_group.append(e) - elif re.match(self.RE_GERMAN_DATE, e.text): - warnings.warn('Converted date {} to iso8601 encoding'.format(e.text)) - m = re.match(self.RE_GERMAN_DATE, e.text) - e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd')) - e.attrib['encoding'] = 'iso8601' - new_group.append(e) - else: - warnings.warn('Not a iso8601 date: "{}"'.format(e.text)) - new_group.append(e) - self.group = new_group - - # Notes: - # - There are dates with the misspelled qualifier 'aproximate' - # - Rough periods are sometimes given either by: - # - years like '19xx' - # - or 'approximate' date ranges with point="start"/"end" attributes set - # (this could be correct according to MODS-AP 2.3.1) - # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier - # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)" - - return self - - def fix_event_type(self): - # According to MODS-AP 2.3.1, every originInfo should have its eventType set. - # Fix this for special cases. - - for e in self.group: - if e.attrib.get('eventType') is None: - try: - if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \ - e.find('mods:edition', ns).text == '[Electronic ed.]': - e.attrib['eventType'] = 'digitization' - warnings.warn('Fixed eventType for electronic ed.') - continue - except AttributeError: - pass - try: - if e.find('mods:dateIssued', ns) is not None: - e.attrib['eventType'] = 'publication' - warnings.warn('Fixed eventType for an issued origin') - continue - except AttributeError: - pass - try: - if e.find('mods:dateCreated', ns) is not None: - e.attrib['eventType'] = 'production' - warnings.warn('Fixed eventType for a created origin') - continue - except AttributeError: - pass - return self - - def fix_script_term(self): - for e in self.group: - # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case. - if e.attrib['authority'] == 'ISO15924': - e.attrib['authority'] = 'iso15924' - warnings.warn('Changed scriptTerm authority to lower case') - return self - - def merge_sub_tags_to_set(self): - value = {} - - sub_dicts = [mods_to_dict(e) for e in self.group] - sub_tags = {k for d in sub_dicts for k in d.keys()} - for sub_tag in sub_tags: - s = set() - for d in sub_dicts: - v = d.get(sub_tag) - if v: - # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a - # German language document. - if isinstance(v, set): - s.update(v) - else: - s.add(v) - value[sub_tag] = s - return value - - -def sorted_groupby(iterable, key=None): - """ - Sort iterable by key and then group by the same key. - - itertools.groupby() assumes that the iterable is already sorted. This function - conveniently sorts the iterable first, and then groups its elements. - """ - return groupby(sorted(iterable, key=key), key=key) - -def _to_dict(root, raise_errors): - - root_name = ET.QName(root.tag) - if root_name.namespace == "http://www.loc.gov/mods/v3": - return mods_to_dict(root, raise_errors) - elif root_name.namespace == "http://www.loc.gov/METS/": - return mets_to_dict(root, raise_errors) - else: - raise ValueError(f"Unknown namespace {root_name.namespace}") +logger = logging.getLogger('modstool') def mods_to_dict(mods, raise_errors=True): """Convert MODS metadata to a nested dictionary""" @@ -427,28 +242,6 @@ def mets_to_dict(mets, raise_errors=True): return value -def flatten(d: MutableMapping, parent='', separator='_'): - """ - Flatten the given nested dict. - - It is assumed that d maps strings to either another dictionary (similarly structured) or some other value. - """ - items = [] - - for k, v in d.items(): - if parent: - new_key = parent + separator + k - else: - new_key = k - - if isinstance(v, MutableMapping): - items.extend(flatten(v, new_key, separator=separator).items()) - else: - items.append((new_key, v)) - - return dict(items) - - @click.command() @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1) @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', diff --git a/qurator/modstool/tests/test_mets.py b/qurator/modstool/tests/test_mets.py index 6ca22fc..76aa73f 100644 --- a/qurator/modstool/tests/test_mets.py +++ b/qurator/modstool/tests/test_mets.py @@ -1,4 +1,3 @@ -import pytest import xml.etree.ElementTree as ET diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1 @@ +pytest diff --git a/setup.py b/setup.py index d03bc94..cd19801 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,8 @@ from setuptools import find_packages, setup with open('requirements.txt') as fp: install_requires = fp.read() +with open('requirements-test.txt') as fp: + tests_requires = fp.read() setup( name='modstool', @@ -19,8 +21,9 @@ setup( entry_points={ 'console_scripts': [ 'modstool=qurator.modstool.modstool:main', + 'altotool=qurator.modstool.altotool:main', ] }, python_requires='>=3.0.0', - tests_require=['pytest'], + tests_requires=tests_requires, ) From 6e2e0bd67ac75ef96de41eac806ec1b980505c54 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 5 May 2022 11:10:59 +0200 Subject: [PATCH 02/19] =?UTF-8?q?=F0=9F=90=9B=20Fix=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/lib.py | 1 + qurator/modstool/modstool.py | 2 +- qurator/modstool/tests/test_mets.py | 3 ++- qurator/modstool/tests/test_modstool.py | 4 +++- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index c22689e..b24b698 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -162,6 +162,7 @@ class TagGroup: return self def merge_sub_tags_to_set(self): + from .modstool import mods_to_dict value = {} sub_dicts = [mods_to_dict(e) for e in self.group] diff --git a/qurator/modstool/modstool.py b/qurator/modstool/modstool.py index bc7429b..a6fe164 100755 --- a/qurator/modstool/modstool.py +++ b/qurator/modstool/modstool.py @@ -14,7 +14,7 @@ import click import pandas as pd from tqdm import tqdm -from .lib import sorted_groupby, TagGroup, ns +from .lib import sorted_groupby, TagGroup, ns, flatten diff --git a/qurator/modstool/tests/test_mets.py b/qurator/modstool/tests/test_mets.py index 76aa73f..315c7b6 100644 --- a/qurator/modstool/tests/test_mets.py +++ b/qurator/modstool/tests/test_mets.py @@ -1,7 +1,8 @@ import xml.etree.ElementTree as ET -from .. import mets_to_dict, flatten +from qurator.modstool.modstool import mets_to_dict +from qurator.modstool.lib import flatten def dict_fromstring(x): diff --git a/qurator/modstool/tests/test_modstool.py b/qurator/modstool/tests/test_modstool.py index b13586a..c31e3e8 100644 --- a/qurator/modstool/tests/test_modstool.py +++ b/qurator/modstool/tests/test_modstool.py @@ -1,8 +1,10 @@ +from tkinter import W import pytest import xml.etree.ElementTree as ET -from .. import mods_to_dict, flatten +from qurator.modstool.modstool import mods_to_dict +from qurator.modstool.lib import flatten def dict_fromstring(x): From 102b15ffa997bb440759dfee49976cbabbffb694 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 6 May 2022 19:36:50 +0200 Subject: [PATCH 03/19] =?UTF-8?q?=F0=9F=A7=B9=20Do=20not=20duplicate=20ALT?= =?UTF-8?q?O=20metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 9006601..b38dddf 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -116,12 +116,8 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter('always') # do NOT filter double occurrences - # MODS + # ALTO d = flatten(alto_to_dict(alto, raise_errors=True)) - # METS - d_alto = flatten(alto_to_dict(alto, raise_errors=True)) - for k, v in d_alto.items(): - d[f"alto_{k}"] = v # "meta" d['alto_file'] = alto_file From c9737683b130835019a5f8aef88f9d6b173c9d74 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 6 May 2022 19:59:19 +0200 Subject: [PATCH 04/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Add=20Layout/Page's?= =?UTF-8?q?=20attribute=20values?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 5 ++++- qurator/modstool/lib.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index b38dddf..fee1f73 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -53,7 +53,10 @@ def alto_to_dict(alto, raise_errors=True): elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion': value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': - pass # TODO + value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page': + value['Page'] = TagGroup(tag, group).is_singleton().attributes() + # TODO subelements elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles': pass else: diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index b24b698..c4ff8b0 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -181,6 +181,19 @@ class TagGroup: value[sub_tag] = s return value + def attributes(self): + """ + Return a merged dict of all attributes of the tag group. + + Probably most useful if used on a singleton, for example: + + value['Page'] = TagGroup(tag, group).is_singleton().attributes() + """ + attrib = {} + for e in self.group: + attrib.update(e.attrib) + return attrib + def sorted_groupby(iterable, key=None): """ From 1c620856129d46f9b41f5dea440fffd10f702f68 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 6 May 2022 20:28:55 +0200 Subject: [PATCH 05/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Count=20Layout/Page/*?= =?UTF-8?q?=20elements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 5 +++-- qurator/modstool/lib.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index fee1f73..2bd73c1 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -55,8 +55,9 @@ def alto_to_dict(alto, raise_errors=True): elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page': - value['Page'] = TagGroup(tag, group).is_singleton().attributes() - # TODO subelements + value['Page'] = {} + value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) + value['Page'].update(TagGroup(tag, group).subelement_counts()) elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles': pass else: diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index c4ff8b0..9f01be8 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -194,6 +194,16 @@ class TagGroup: attrib.update(e.attrib) return attrib + def subelement_counts(self): + counts = {} + for e in self.group: + for x in e.iter(): + tag = ET.QName(x).localname + key = f"{tag}-count" + counts[key] = counts.get(key, 0) + 1 + return counts + + def sorted_groupby(iterable, key=None): """ From 10b8023dd6e009a7a4623555607e26eaefdfb03d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 6 May 2022 20:59:51 +0200 Subject: [PATCH 06/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Count=20Layout/Page/*?= =?UTF-8?q?=20elements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/lib.py | 2 +- qurator/modstool/tests/test_alto.py | 39 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 qurator/modstool/tests/test_alto.py diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 9f01be8..5ebf0ac 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -198,7 +198,7 @@ class TagGroup: counts = {} for e in self.group: for x in e.iter(): - tag = ET.QName(x).localname + tag = ET.QName(x.tag).localname key = f"{tag}-count" counts[key] = counts.get(key, 0) + 1 return counts diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py new file mode 100644 index 0000000..bc79d1d --- /dev/null +++ b/qurator/modstool/tests/test_alto.py @@ -0,0 +1,39 @@ +import xml.etree.ElementTree as ET + + +from qurator.modstool.altotool import alto_to_dict +from qurator.modstool.lib import flatten + + +def dict_fromstring(x): + return flatten(alto_to_dict(ET.fromstring(x))) + +def test_Page_counts(): + """ + Elements below Layout/Page should be counted + """ + d = dict_fromstring(""" + + + + + + + + + + + + + + + + + + + + + """) + assert d['Layout_Page_TextBlock-count'] == 1 + assert d['Layout_Page_TextLine-count'] == 3 + assert d['Layout_Page_String-count'] == 6 From 01326050d3237fd1339a04637174bfd2bf35dfa5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 9 May 2022 18:28:31 +0200 Subject: [PATCH 07/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Handle=20PermissionEr?= =?UTF-8?q?rors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 2bd73c1..66fac8c 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -79,7 +79,10 @@ def walk(m): if f.is_file() and not f.name.startswith('.'): yield f.path elif f.is_dir(): - yield from walk(f.path) + try: + yield from walk(f.path) + except PermissionError: + warnings.warn(f"Error walking {f.path}") else: yield m.path From c91c9b171416e07bc3bcb28e9928ab24aa1b6cbb Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 14:27:39 +0200 Subject: [PATCH 08/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20preProcessingStep/pro?= =?UTF-8?q?cessingAgency/sourceImageInformation=20etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 66fac8c..431aa1e 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -42,16 +42,32 @@ def alto_to_dict(alto, raise_errors=True): elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep': for n, e in enumerate(group): value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep': + # TODO This enumerated descent is used more than once, DRY! + for n, e in enumerate(group): + value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime': value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware': value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency': + value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription': + value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings': + value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator': value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName': value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion': value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation': + value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName': + value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page': From c85356bd23bd3eded686210e1e11adb030af7404 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 17:46:50 +0200 Subject: [PATCH 09/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Support=20more=20ALTO?= =?UTF-8?q?=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 76 ++++++++++++++++++------------------ qurator/modstool/lib.py | 9 ++++- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 431aa1e..8fd3635 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True): for tag, group in sorted_groupby(alto, key=attrgetter('tag')): group = list(group) - # XXX Namespaces seem to use a trailing / sometimes, sometimes not. - # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS}) - if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description': - value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit': - value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing': - value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep': + localname = ET.QName(tag).localname + + if localname == 'Description': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'MeasurementUnit': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'OCRProcessing': + value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'ocrProcessingStep': for n, e in enumerate(group): - value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep': + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) + elif localname == 'preProcessingStep': # TODO This enumerated descent is used more than once, DRY! for n, e in enumerate(group): - value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime': - value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware': - value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency': - value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription': - value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings': - value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator': - value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName': - value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion': - value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation': - value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName': - value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': - value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page': + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) + elif localname == 'processingDateTime': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingSoftware': + value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'processingAgency': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingStepDescription': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingStepSettings': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareCreator': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareName': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareVersion': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + + elif localname == 'sourceImageInformation': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'fileName': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + + elif localname == 'Layout': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'Page': value['Page'] = {} value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) value['Page'].update(TagGroup(tag, group).subelement_counts()) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles': + elif localname == 'Styles': pass else: if raise_errors: diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 5ebf0ac..4d00510 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -191,7 +191,9 @@ class TagGroup: """ attrib = {} for e in self.group: - attrib.update(e.attrib) + for a, v in e.attrib.items(): + a_localname = ET.QName(a).localname + attrib[a_localname] = v return attrib def subelement_counts(self): @@ -224,7 +226,10 @@ def _to_dict(root, raise_errors): return mods_to_dict(root, raise_errors) elif root_name.namespace == "http://www.loc.gov/METS/": return mets_to_dict(root, raise_errors) - elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#": + elif root_name.namespace in [ + "http://www.loc.gov/standards/alto/ns-v2#", + "http://www.loc.gov/standards/alto/", + ]: return alto_to_dict(root, raise_errors) else: raise ValueError(f"Unknown namespace {root_name.namespace}") From 6a549968b5cda04b5f1e2e3436de0819e23bce20 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 17:47:38 +0200 Subject: [PATCH 10/19] =?UTF-8?q?=F0=9F=90=9B=20Produce=20a=20text=20attri?= =?UTF-8?q?bute=20even=20if=20the=20attribute=20has=20no=20value?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 4d00510..9a158e1 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -55,7 +55,8 @@ class TagGroup: for e in self.group: if t != '': t += separator - t += e.text + if e.text: + t += e.text return t def text_set(self): From 4bb3379ab1ff69f818dde8299cde386b8dce05d0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 17:57:36 +0200 Subject: [PATCH 11/19] =?UTF-8?q?=F0=9F=90=9B=20Use=20tqdm's=20write()=20i?= =?UTF-8?q?nstead=20of=20logging=20during=20scanning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 8fd3635..3115f67 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -90,7 +90,7 @@ def alto_to_dict(alto, raise_errors=True): def walk(m): # XXX do this in modstool, too if os.path.isdir(m): - logger.info('Scanning directory {}'.format(m)) + tqdm.write(f'Scanning directory {m}') for f in tqdm(os.scandir(m), leave=False): if f.is_file() and not f.name.startswith('.'): yield f.path From 937e7d74eb3f42d4e1c5e39000c3bc666d59d0de Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 18:15:35 +0200 Subject: [PATCH 12/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Support=20more=20ALTO?= =?UTF-8?q?=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 9a158e1..1e8b560 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -228,8 +228,9 @@ def _to_dict(root, raise_errors): elif root_name.namespace == "http://www.loc.gov/METS/": return mets_to_dict(root, raise_errors) elif root_name.namespace in [ - "http://www.loc.gov/standards/alto/ns-v2#", + "http://schema.ccs-gmbh.com/ALTO", "http://www.loc.gov/standards/alto/", + "http://www.loc.gov/standards/alto/ns-v2#", ]: return alto_to_dict(root, raise_errors) else: From 9b3db1cd1d1cf8b595d53e1aff2703d9fee742a0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 19:32:26 +0200 Subject: [PATCH 13/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Support=20more=20ALTO?= =?UTF-8?q?=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 7 ++++++- qurator/modstool/lib.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 3115f67..1cecc03 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -39,11 +39,14 @@ def alto_to_dict(alto, raise_errors=True): value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif localname == 'OCRProcessing': value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'Processing': + # TODO This enumerated descent is used more than once, DRY! + for n, e in enumerate(group): + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'ocrProcessingStep': for n, e in enumerate(group): value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'preProcessingStep': - # TODO This enumerated descent is used more than once, DRY! for n, e in enumerate(group): value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'processingDateTime': @@ -76,6 +79,8 @@ def alto_to_dict(alto, raise_errors=True): value['Page'].update(TagGroup(tag, group).subelement_counts()) elif localname == 'Styles': pass + elif localname == 'Tags': + pass else: if raise_errors: print(value) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 1e8b560..75d0f86 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -231,6 +231,7 @@ def _to_dict(root, raise_errors): "http://schema.ccs-gmbh.com/ALTO", "http://www.loc.gov/standards/alto/", "http://www.loc.gov/standards/alto/ns-v2#", + "http://www.loc.gov/standards/alto/ns-v4#", ]: return alto_to_dict(root, raise_errors) else: From e24a846ea2df58b9f17e088e34a6a885289c7896 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:12:39 +0200 Subject: [PATCH 14/19] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Calculate=20mean?= =?UTF-8?q?=20of=20String@WC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 1cecc03..3381c74 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -5,6 +5,7 @@ import os import re import warnings import sys +from xml.dom.expatbuilder import Namespaces from lxml import etree as ET from itertools import groupby from operator import attrgetter @@ -13,6 +14,7 @@ from collections.abc import MutableMapping, Sequence import click import pandas as pd +import numpy as np from tqdm import tqdm from .lib import TagGroup, sorted_groupby, flatten, ns @@ -77,6 +79,17 @@ def alto_to_dict(alto, raise_errors=True): value['Page'] = {} value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) value['Page'].update(TagGroup(tag, group).subelement_counts()) + + xpath_expr = "//alto:String/@WC" + values = [] + for e in group: + # TODO need a smart way to always have the correct namespaces for a document + alto_namespace = ET.QName(e).namespace + r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace}) + values += r + values = np.array([float(v) for v in values]) + value['Page'][f'{xpath_expr}-mean'] = np.mean(values) + elif localname == 'Styles': pass elif localname == 'Tags': @@ -158,7 +171,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls csvwriter.writerow([alto_file, caught_warning.message]) except Exception as e: logger.error('Exception in {}: {}'.format(alto_file, e)) - #import traceback; traceback.print_exc() + import traceback; traceback.print_exc() # Convert the alto_info List[Dict] to a pandas DataFrame columns = [] From 9246519162e2063f988b45098e7a06a30605cbe4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:33:54 +0200 Subject: [PATCH 15/19] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Extract=20a=20func?= =?UTF-8?q?tion=20to=20calculate=20statistics=20on=20xpath=20expressions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 3381c74..2d83051 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -81,14 +81,22 @@ def alto_to_dict(alto, raise_errors=True): value['Page'].update(TagGroup(tag, group).subelement_counts()) xpath_expr = "//alto:String/@WC" - values = [] - for e in group: - # TODO need a smart way to always have the correct namespaces for a document - alto_namespace = ET.QName(e).namespace - r = e.xpath(xpath_expr, namespaces={"alto": alto_namespace}) - values += r - values = np.array([float(v) for v in values]) - value['Page'][f'{xpath_expr}-mean'] = np.mean(values) + alto_namespace = ET.QName(group[0]).namespace + namespaces={"alto": alto_namespace} + + def xpath_statistics(xpath_expr, namespaces): + values = [] + for e in group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + values = np.array([float(v) for v in values]) + + statistics = {} + statistics[f'{xpath_expr}-mean'] = np.mean(values) + return statistics + + value['Page'].update(xpath_statistics(xpath_expr, namespaces)) + elif localname == 'Styles': pass From aa4e8e290dd1d2f1e1f382ebfa799eafac8f6795 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:39:21 +0200 Subject: [PATCH 16/19] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Move=20xpath=5Fsta?= =?UTF-8?q?tistics=20to=20TagGroup=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 22 ++++------------------ qurator/modstool/lib.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 2d83051..218e448 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True): elif localname == 'Layout': value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) elif localname == 'Page': - value['Page'] = {} - value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) - value['Page'].update(TagGroup(tag, group).subelement_counts()) - - xpath_expr = "//alto:String/@WC" alto_namespace = ET.QName(group[0]).namespace namespaces={"alto": alto_namespace} - def xpath_statistics(xpath_expr, namespaces): - values = [] - for e in group: - r = e.xpath(xpath_expr, namespaces=namespaces) - values += r - values = np.array([float(v) for v in values]) - - statistics = {} - statistics[f'{xpath_expr}-mean'] = np.mean(values) - return statistics - - value['Page'].update(xpath_statistics(xpath_expr, namespaces)) - + value[localname] = {} + value[localname].update(TagGroup(tag, group).is_singleton().attributes()) + value[localname].update(TagGroup(tag, group).subelement_counts()) + value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)) elif localname == 'Styles': pass diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 75d0f86..383ba8f 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -3,6 +3,7 @@ import re import warnings from typing import List, Sequence, MutableMapping +import numpy as np from lxml import etree as ET @@ -206,6 +207,24 @@ class TagGroup: counts[key] = counts.get(key, 0) + 1 return counts + def xpath_statistics(self, xpath_expr, namespaces): + """ + Extract values and calculate statistics + + Extract values using the given XPath expression, convert them to float and return descriptive + statistics on the values. + """ + values = [] + for e in self.group: + r = e.xpath(xpath_expr, namespaces=namespaces) + values += r + values = np.array([float(v) for v in values]) + + statistics = {} + statistics[f'{xpath_expr}-mean'] = np.mean(values) + return statistics + + def sorted_groupby(iterable, key=None): From 8285bdb423f583447aeadfd4aefe5a31ad59a0a9 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 23 May 2022 19:45:44 +0200 Subject: [PATCH 17/19] =?UTF-8?q?=F0=9F=9A=A7=20ALTO:=20Calculate=20more?= =?UTF-8?q?=20descriptive=20statistics=20for=20String@WC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/lib.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 383ba8f..b3ce44f 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -221,7 +221,12 @@ class TagGroup: values = np.array([float(v) for v in values]) statistics = {} - statistics[f'{xpath_expr}-mean'] = np.mean(values) + if values.size > 0: + statistics[f'{xpath_expr}-mean'] = np.mean(values) + statistics[f'{xpath_expr}-median'] = np.median(values) + statistics[f'{xpath_expr}-std'] = np.std(values) + statistics[f'{xpath_expr}-min'] = np.min(values) + statistics[f'{xpath_expr}-max'] = np.max(values) return statistics From 3d2e53f7396b75d27eda27ec9b60a9c50ef402ad Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 8 Jun 2022 18:25:33 +0200 Subject: [PATCH 18/19] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Extract=20namespace?= =?UTF-8?q?=20=3D=3D=20ALTO=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 218e448..a66defa 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -155,6 +155,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls d = flatten(alto_to_dict(alto, raise_errors=True)) # "meta" d['alto_file'] = alto_file + d['alto_xmlns'] = ET.QName(alto).namespace alto_info.append(d) From 21f906ec7d50ab03f16ecbaf28a23e0ac9dab356 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 16 Jun 2022 19:27:54 +0200 Subject: [PATCH 19/19] =?UTF-8?q?=E2=9C=A8=20Rename=20altotool=20to=20alto?= =?UTF-8?q?4pandas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See gh-15. --- qurator/modstool/{altotool.py => alto4pandas.py} | 4 ++-- qurator/modstool/lib.py | 2 +- qurator/modstool/tests/test_alto.py | 2 +- setup.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename qurator/modstool/{altotool.py => alto4pandas.py} (98%) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/alto4pandas.py similarity index 98% rename from qurator/modstool/altotool.py rename to qurator/modstool/alto4pandas.py index a66defa..ceb498a 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/alto4pandas.py @@ -20,7 +20,7 @@ from tqdm import tqdm from .lib import TagGroup, sorted_groupby, flatten, ns -logger = logging.getLogger('altotool') +logger = logging.getLogger('alto4pandas') @@ -129,7 +129,7 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads all files in the directory. - altotool writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. + alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. """ # Extend file list if directories are given diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index b3ce44f..a6be479 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -244,7 +244,7 @@ def sorted_groupby(iterable, key=None): def _to_dict(root, raise_errors): from .modstool import mods_to_dict, mets_to_dict - from .altotool import alto_to_dict + from .alto4pandas import alto_to_dict root_name = ET.QName(root.tag) if root_name.namespace == "http://www.loc.gov/mods/v3": diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py index bc79d1d..154c848 100644 --- a/qurator/modstool/tests/test_alto.py +++ b/qurator/modstool/tests/test_alto.py @@ -1,7 +1,7 @@ import xml.etree.ElementTree as ET -from qurator.modstool.altotool import alto_to_dict +from qurator.modstool.alto4pandas import alto_to_dict from qurator.modstool.lib import flatten diff --git a/setup.py b/setup.py index cd19801..a35c92d 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( entry_points={ 'console_scripts': [ 'modstool=qurator.modstool.modstool:main', - 'altotool=qurator.modstool.altotool:main', + 'alto4pandas=qurator.modstool.alto4pandas:main', ] }, python_requires='>=3.0.0',