🚧 Add support for ALTO Description

2026-03-06 07:11:54 +01:00 · 2022-05-04 20:02:27 +02:00 · 2022-05-04 20:02:27 +02:00 · e86369e76d
commit e86369e76d
parent 08082d5fe8
7 changed files with 406 additions and 212 deletions
--- a/qurator/modstool/init.py
+++ b/qurator/modstool/init.py
@ -1 +0,0 @@
-from .modstool import *
--- a/qurator/modstool/altotool.py
+++ b/qurator/modstool/altotool.py
@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+import csv
+import logging
+import os
+import re
+import warnings
+import sys
+from lxml import etree as ET
+from itertools import groupby
+from operator import attrgetter
+from typing import List
+from collections.abc import MutableMapping, Sequence
+
+import click
+import pandas as pd
+from tqdm import tqdm
+
+from .lib import TagGroup, sorted_groupby, flatten, ns
+
+
+logger = logging.getLogger('altotool')
+
+
+
+def alto_to_dict(alto, raise_errors=True):
+    """Convert ALTO metadata to a nested dictionary"""
+
+    value = {}
+
+    # Iterate through each group of tags
+    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
+        group = list(group)
+
+        # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
+        #     (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
+        if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description':
+            value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit':
+            value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing':
+            value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep':
+            for n, e in enumerate(group):
+                value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime':
+            value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware':
+            value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator':
+            value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName':
+            value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion':
+            value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
+            pass  # TODO
+        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
+            pass
+        else:
+            if raise_errors:
+                print(value)
+                raise ValueError('Unknown tag "{}"'.format(tag))
+            else:
+                pass
+
+    return value
+
+
+
+def walk(m):
+    """Yield m itself if it is not a directory, else each non-dot file below it."""  # XXX do this in modstool, too
+    if os.path.isdir(m):
+        logger.info('Scanning directory {}'.format(m))
+        for f in tqdm(os.scandir(m), leave=False):
+            if f.is_file() and not f.name.startswith('.'):
+                yield f.path
+            elif f.is_dir():
+                yield from walk(f.path)
+    else:
+        yield m  # m is a path string, not an os.DirEntry, so there is no .path to read
+
+
+
+@click.command()
+@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
+@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
+              default='alto_info_df.pkl', show_default=True)
+@click.option('--output-csv', type=click.Path(), help='Output CSV file')
+@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
+def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
+    """
+    A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
+
+    INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
+    all files in the directory.
+
+    altotool writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
+    """
+
+    # Extend file list if directories are given
+    alto_files_real = []
+    for m in alto_files:
+        for x in walk(m):
+            alto_files_real.append(x)
+
+    # Process ALTO files; newline='' lets the csv module control line endings (no blank rows on Windows)
+    with open(output_file + '.warnings.csv', 'w', newline='') as csvfile:
+        csvwriter = csv.writer(csvfile)
+        alto_info = []
+        logger.info('Processing ALTO files')
+        for alto_file in tqdm(alto_files_real, leave=False):
+            try:
+                root = ET.parse(alto_file).getroot()
+                alto = root # XXX .find('alto:alto', ns) does not work here
+
+                with warnings.catch_warnings(record=True) as caught_warnings:
+                    warnings.simplefilter('always')  # do NOT filter double occurrences
+
+                    # ALTO metadata, flattened, under unprefixed keys
+                    d = flatten(alto_to_dict(alto, raise_errors=True))
+                    # The same ALTO metadata again, stored under "alto_"-prefixed keys
+                    d_alto = flatten(alto_to_dict(alto, raise_errors=True))
+                    for k, v in d_alto.items():
+                        d[f"alto_{k}"] = v
+                    # "meta"
+                    d['alto_file'] = alto_file
+
+                    alto_info.append(d)
+
+                    if caught_warnings:
+                        # PyCharm thinks caught_warnings is not Iterable:
+                        # noinspection PyTypeChecker
+                        for caught_warning in caught_warnings:
+                            csvwriter.writerow([alto_file, caught_warning.message])
+            except Exception as e:
+                logger.error('Exception in {}: {}'.format(alto_file, e))
+                #import traceback; traceback.print_exc()
+
+    # Convert the alto_info List[Dict] to a pandas DataFrame
+    columns = []
+    for m in alto_info:
+        for c in m.keys():
+            if c not in columns:
+                columns.append(c)
+    data = [[m.get(c) for c in columns] for m in alto_info]
+    index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?
+    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
+
+    # Pickle the DataFrame
+    logger.info('Writing DataFrame to {}'.format(output_file))
+    alto_info_df.to_pickle(output_file)
+    if output_csv:
+        logger.info('Writing CSV to {}'.format(output_csv))
+        alto_info_df.to_csv(output_csv)
+    if output_xlsx:
+        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
+        alto_info_df.to_excel(output_xlsx)
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    for prefix, uri in ns.items():
+        ET.register_namespace(prefix, uri)
+
+    process()
+
+
+if __name__ == '__main__':
+    main()
--- a/qurator/modstool/lib.py
+++ b/qurator/modstool/lib.py
@ -0,0 +1,229 @@
+from itertools import groupby
+import re
+import warnings
+from typing import List, Sequence, MutableMapping
+
+from lxml import etree as ET
+
+
+__all__ = ["ns"]
+
+
+ns = {
+    'mets': 'http://www.loc.gov/METS/',
+    'mods': 'http://www.loc.gov/mods/v3',
+    "alto": "http://www.loc.gov/standards/alto/ns-v2#"  # trailing '#' is part of the ALTO v2 namespace URI
+}
+
+
+
+class TagGroup:
+    """Helper class to simplify the parsing and checking of MODS metadata"""
+
+    def __init__(self, tag, group: List[ET.Element]):
+        self.tag = tag
+        self.group = group
+
+    def __str__(self):
+        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+
+    def is_singleton(self):
+        if len(self.group) != 1:
+            raise ValueError('More than one instance: {}'.format(self))
+        return self
+
+    def has_no_attributes(self):
+        return self.has_attributes({})
+
+    def has_attributes(self, attrib):
+        if not isinstance(attrib, Sequence):
+            attrib = [attrib]
+        if not all(e.attrib in attrib for e in self.group):
+            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+        return self
+
+    def ignore_attributes(self):
+        # This serves as documentation for now.
+        return self
+
+    def sort(self, key=None, reverse=False):
+        self.group = sorted(self.group, key=key, reverse=reverse)
+        return self
+
+    def text(self, separator='\n'):
+        t = ''
+        for e in self.group:
+            if t != '':
+                t += separator
+            t += e.text
+        return t
+
+    def text_set(self):
+        return {e.text for e in self.group}
+
+    def descend(self, raise_errors):
+        return _to_dict(self.is_singleton().group[0], raise_errors)
+
+    def filter(self, cond, warn=None):
+        new_group = []
+        for e in self.group:
+            if cond(e):
+                new_group.append(e)
+            else:
+                if warn:
+                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+        return TagGroup(self.tag, new_group)
+
+    def force_singleton(self, warn=True):
+        if len(self.group) == 1:
+            return self
+        else:
+            if warn:
+                warnings.warn('Forced single instance of {}'.format(self.tag))
+            return TagGroup(self.tag, self.group[:1])
+
+    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+
+    def fix_date(self):
+
+        for e in self.group:
+            if e.attrib.get('encoding') == 'w3cdtf':
+                # This should be 'iso8601' according to MODS-AP 2.3.1
+                warnings.warn('Changed w3cdtf encoding to iso8601')
+                e.attrib['encoding'] = 'iso8601'
+
+        new_group = []
+        for e in self.group:
+            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
+                new_group.append(e)
+            elif re.match(self.RE_ISO8601_DATE, e.text):
+                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
+                e.attrib['encoding'] = 'iso8601'
+                new_group.append(e)
+            elif re.match(self.RE_GERMAN_DATE, e.text):
+                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
+                m = re.match(self.RE_GERMAN_DATE, e.text)
+                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
+                e.attrib['encoding'] = 'iso8601'
+                new_group.append(e)
+            else:
+                warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
+                new_group.append(e)
+        self.group = new_group
+
+        # Notes:
+        # - There are dates with the misspelled qualifier 'aproximate'
+        # - Rough periods are sometimes given either by:
+        #   - years like '19xx'
+        #   - or 'approximate' date ranges with point="start"/"end" attributes set
+        #     (this could be correct according to MODS-AP 2.3.1)
+        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
+        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
+
+        return self
+
+    def fix_event_type(self):
+        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
+        # Fix this for special cases.
+
+        for e in self.group:
+            if e.attrib.get('eventType') is None:
+                try:
+                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
+                            e.find('mods:edition', ns).text == '[Electronic ed.]':
+                        e.attrib['eventType'] = 'digitization'
+                        warnings.warn('Fixed eventType for electronic ed.')
+                        continue
+                except AttributeError:
+                    pass
+                try:
+                    if e.find('mods:dateIssued', ns) is not None:
+                        e.attrib['eventType'] = 'publication'
+                        warnings.warn('Fixed eventType for an issued origin')
+                        continue
+                except AttributeError:
+                    pass
+                try:
+                    if e.find('mods:dateCreated', ns) is not None:
+                        e.attrib['eventType'] = 'production'
+                        warnings.warn('Fixed eventType for a created origin')
+                        continue
+                except AttributeError:
+                    pass
+        return self
+
+    def fix_script_term(self):
+        for e in self.group:
+            # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
+            if e.attrib['authority'] == 'ISO15924':
+                e.attrib['authority'] = 'iso15924'
+                warnings.warn('Changed scriptTerm authority to lower case')
+        return self
+
+    def merge_sub_tags_to_set(self):  # returns {sub tag: set of values merged across all elements of the group}
+        value = {}
+        from .modstool import mods_to_dict  # imported here, not at module level: modstool itself imports this module
+        sub_dicts = [mods_to_dict(e) for e in self.group]
+        sub_tags = {k for d in sub_dicts for k in d.keys()}
+        for sub_tag in sub_tags:
+            s = set()
+            for d in sub_dicts:
+                v = d.get(sub_tag)
+                if v:
+                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
+                    # German language document.
+                    if isinstance(v, set):
+                        s.update(v)
+                    else:
+                        s.add(v)
+            value[sub_tag] = s
+        return value
+
+
+def sorted_groupby(iterable, key=None):
+    """
+    Sort iterable by key and then group by the same key.
+
+    itertools.groupby() assumes that the iterable is already sorted. This function
+    conveniently sorts the iterable first, and then groups its elements.
+    """
+    return groupby(sorted(iterable, key=key), key=key)
+
+
+def _to_dict(root, raise_errors):
+    from .modstool import mods_to_dict, mets_to_dict
+    from .altotool import alto_to_dict
+
+    root_name = ET.QName(root.tag)
+    if root_name.namespace == "http://www.loc.gov/mods/v3":
+        return mods_to_dict(root, raise_errors)
+    elif root_name.namespace == "http://www.loc.gov/METS/":
+        return mets_to_dict(root, raise_errors)
+    elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#":
+        return alto_to_dict(root, raise_errors)
+    else:
+        raise ValueError(f"Unknown namespace {root_name.namespace}")
+
+
+def flatten(d: MutableMapping, parent='', separator='_'):
+    """
+    Flatten the given nested dict.
+
+    It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
+    """
+    items = []
+
+    for k, v in d.items():
+        if parent:
+            new_key = parent + separator + k
+        else:
+            new_key = k
+
+        if isinstance(v, MutableMapping):
+            items.extend(flatten(v, new_key, separator=separator).items())
+        else:
+            items.append((new_key, v))
+
+    return dict(items)
+
--- a/qurator/modstool/modstool.py
+++ b/qurator/modstool/modstool.py
@ -14,197 +14,12 @@ import click
 import pandas as pd
 from tqdm import tqdm

+from .lib import sorted_groupby, TagGroup, ns, flatten
+

-ns = {
-    'mets': 'http://www.loc.gov/METS/',
-    'mods': 'http://www.loc.gov/mods/v3'
-}

 logger = logging.getLogger('modstool')

-class TagGroup:
-    """Helper class to simplify the parsing and checking of MODS metadata"""
-
-    def __init__(self, tag, group: List[ET.Element]):
-        self.tag = tag
-        self.group = group
-
-    def __str__(self):
-        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
-
-    def is_singleton(self):
-        if len(self.group) != 1:
-            raise ValueError('More than one instance: {}'.format(self))
-        return self
-
-    def has_no_attributes(self):
-        return self.has_attributes({})
-
-    def has_attributes(self, attrib):
-        if not isinstance(attrib, Sequence):
-            attrib = [attrib]
-        if not all(e.attrib in attrib for e in self.group):
-            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
-        return self
-
-    def ignore_attributes(self):
-        # This serves as documentation for now.
-        return self
-
-    def sort(self, key=None, reverse=False):
-        self.group = sorted(self.group, key=key, reverse=reverse)
-        return self
-
-    def text(self, separator='\n'):
-        t = ''
-        for e in self.group:
-            if t != '':
-                t += separator
-            t += e.text
-        return t
-
-    def text_set(self):
-        return {e.text for e in self.group}
-
-    def descend(self, raise_errors):
-        return _to_dict(self.is_singleton().group[0], raise_errors)
-
-    def filter(self, cond, warn=None):
-        new_group = []
-        for e in self.group:
-            if cond(e):
-                new_group.append(e)
-            else:
-                if warn:
-                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
-        return TagGroup(self.tag, new_group)
-
-    def force_singleton(self, warn=True):
-        if len(self.group) == 1:
-            return self
-        else:
-            if warn:
-                warnings.warn('Forced single instance of {}'.format(self.tag))
-            return TagGroup(self.tag, self.group[:1])
-
-    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
-    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
-
-    def fix_date(self):
-
-        for e in self.group:
-            if e.attrib.get('encoding') == 'w3cdtf':
-                # This should be 'iso8601' according to MODS-AP 2.3.1
-                warnings.warn('Changed w3cdtf encoding to iso8601')
-                e.attrib['encoding'] = 'iso8601'
-
-        new_group = []
-        for e in self.group:
-            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
-                new_group.append(e)
-            elif re.match(self.RE_ISO8601_DATE, e.text):
-                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
-                e.attrib['encoding'] = 'iso8601'
-                new_group.append(e)
-            elif re.match(self.RE_GERMAN_DATE, e.text):
-                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                m = re.match(self.RE_GERMAN_DATE, e.text)
-                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
-                e.attrib['encoding'] = 'iso8601'
-                new_group.append(e)
-            else:
-                warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
-                new_group.append(e)
-        self.group = new_group
-
-        # Notes:
-        # - There are dates with the misspelled qualifier 'aproximate'
-        # - Rough periods are sometimes given either by:
-        #   - years like '19xx'
-        #   - or 'approximate' date ranges with point="start"/"end" attributes set
-        #     (this could be correct according to MODS-AP 2.3.1)
-        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
-        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
-
-        return self
-
-    def fix_event_type(self):
-        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
-        # Fix this for special cases.
-
-        for e in self.group:
-            if e.attrib.get('eventType') is None:
-                try:
-                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
-                            e.find('mods:edition', ns).text == '[Electronic ed.]':
-                        e.attrib['eventType'] = 'digitization'
-                        warnings.warn('Fixed eventType for electronic ed.')
-                        continue
-                except AttributeError:
-                    pass
-                try:
-                    if e.find('mods:dateIssued', ns) is not None:
-                        e.attrib['eventType'] = 'publication'
-                        warnings.warn('Fixed eventType for an issued origin')
-                        continue
-                except AttributeError:
-                    pass
-                try:
-                    if e.find('mods:dateCreated', ns) is not None:
-                        e.attrib['eventType'] = 'production'
-                        warnings.warn('Fixed eventType for a created origin')
-                        continue
-                except AttributeError:
-                    pass
-        return self
-
-    def fix_script_term(self):
-        for e in self.group:
-            # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
-            if e.attrib['authority'] == 'ISO15924':
-                e.attrib['authority'] = 'iso15924'
-                warnings.warn('Changed scriptTerm authority to lower case')
-        return self
-
-    def merge_sub_tags_to_set(self):
-        value = {}
-
-        sub_dicts = [mods_to_dict(e) for e in self.group]
-        sub_tags = {k for d in sub_dicts for k in d.keys()}
-        for sub_tag in sub_tags:
-            s = set()
-            for d in sub_dicts:
-                v = d.get(sub_tag)
-                if v:
-                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
-                    # German language document.
-                    if isinstance(v, set):
-                        s.update(v)
-                    else:
-                        s.add(v)
-            value[sub_tag] = s
-        return value
-
-
-def sorted_groupby(iterable, key=None):
-    """
-    Sort iterable by key and then group by the same key.
-
-    itertools.groupby() assumes that the iterable is already sorted. This function
-    conveniently sorts the iterable first, and then groups its elements.
-    """
-    return groupby(sorted(iterable, key=key), key=key)
-
-def _to_dict(root, raise_errors):
-
-    root_name = ET.QName(root.tag)
-    if root_name.namespace == "http://www.loc.gov/mods/v3":
-        return mods_to_dict(root, raise_errors)
-    elif root_name.namespace == "http://www.loc.gov/METS/":
-        return mets_to_dict(root, raise_errors)
-    else:
-        raise ValueError(f"Unknown namespace {root_name.namespace}")
-
 def mods_to_dict(mods, raise_errors=True):
    """Convert MODS metadata to a nested dictionary"""

@ -427,28 +242,6 @@ def mets_to_dict(mets, raise_errors=True):
    return value


-def flatten(d: MutableMapping, parent='', separator='_'):
-    """
-    Flatten the given nested dict.
-
-    It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
-    """
-    items = []
-
-    for k, v in d.items():
-        if parent:
-            new_key = parent + separator + k
-        else:
-            new_key = k
-
-        if isinstance(v, MutableMapping):
-            items.extend(flatten(v, new_key, separator=separator).items())
-        else:
-            items.append((new_key, v))
-
-    return dict(items)
-
-
@click.command()
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
--- a/qurator/modstool/tests/test_mets.py
+++ b/qurator/modstool/tests/test_mets.py
@ -1,4 +1,3 @@
-import pytest
 import xml.etree.ElementTree as ET