modstool/qurator/mods4pandas/alto4pandas.py

#!/usr/bin/env python3
import csv
import logging
import os
import re
import warnings
import sys
from xml.dom.expatbuilder import Namespaces
from lxml import etree as ET
from itertools import groupby
from operator import attrgetter
from typing import List
from collections.abc import MutableMapping, Sequence

import click
import pandas as pd
import numpy as np
from tqdm import tqdm

from .lib import TagGroup, sorted_groupby, flatten, ns


# Module-level logger; uses the fixed name 'alto4pandas' rather than __name__.
logger = logging.getLogger('alto4pandas')


def alto_to_dict(alto, raise_errors=True):
    """Convert ALTO metadata to a nested dictionary.

    The items obtained by iterating ``alto`` are grouped by their ``tag`` and
    every group is handled according to the local (namespace-free) part of
    that tag; the tables below list which tag names get which treatment.

    :param alto: iterable of elements (each exposing a ``tag`` attribute),
        e.g. the root element of a parsed ALTO document
    :param raise_errors: if true, an unknown tag raises ``ValueError``;
        otherwise unknown tags are skipped. The flag is handed down to all
        nested conversions.
    :return: dictionary keyed by local tag name (repeatable tags get an
        occurrence index appended to the key)
    :raises ValueError: for an unknown tag when ``raise_errors`` is true
    """

    # Exactly one attribute-less element expected; converted recursively.
    plain_containers = ('Description', 'sourceImageInformation', 'Layout')
    # Exactly one element expected (attributes are not checked); converted recursively.
    containers = ('OCRProcessing', 'processingSoftware')
    # Exactly one attribute-less element expected; only its text is kept.
    text_leaves = (
        'MeasurementUnit',
        'processingDateTime',
        'processingAgency',
        'processingStepDescription',
        'processingStepSettings',
        'softwareCreator',
        'softwareName',
        'softwareVersion',
        'fileName',
    )
    # May occur several times; every occurrence is stored as "<name><index>".
    # TODO This enumerated descent is used more than once, DRY!
    repeatable = ('Processing', 'ocrProcessingStep', 'preProcessingStep')

    result = {}

    # Handle the elements one tag group at a time
    for tag, elements in sorted_groupby(alto, key=attrgetter('tag')):
        elements = list(elements)

        qname = ET.QName(tag)
        name = qname.localname
        # Map the "alto" prefix used in the XPath expressions below to the
        # namespace of the current tag.
        xpath_ns = {"alto": qname.namespace}

        if name in plain_containers:
            result[name] = TagGroup(tag, elements).is_singleton().has_no_attributes().descend(raise_errors)
        elif name in text_leaves:
            result[name] = TagGroup(tag, elements).is_singleton().has_no_attributes().text()
        elif name in containers:
            result[name] = TagGroup(tag, elements).is_singleton().descend(raise_errors)
        elif name in repeatable:
            for idx, element in enumerate(elements):
                result[f'{name}{idx}'] = alto_to_dict(element, raise_errors)
        elif name == 'Page':
            page = {}
            page.update(TagGroup(tag, elements).is_singleton().attributes())
            page.update(TagGroup(tag, elements).subelement_counts())
            page.update(TagGroup(tag, elements).xpath_statistics("//alto:String/@WC", xpath_ns))
            # Count all alto:String elements with TAGREFS attribute
            page.update(TagGroup(tag, elements).xpath_count("//alto:String[@TAGREFS]", xpath_ns))
            result[name] = page
        elif name == 'Styles':
            # Deliberately ignored
            continue
        elif name == 'Tags':
            tags = {}
            tags.update(TagGroup(tag, elements).subelement_counts())
            result[name] = tags
        elif raise_errors:
            # Show what has been collected so far before bailing out
            print(result)
            raise ValueError('Unknown tag "{}"'.format(tag))

    return result


def walk(m):
    """Yield the paths of all files found at or below ``m``.

    If ``m`` is a directory it is scanned recursively: regular files whose
    name does not start with a dot are yielded, sub-directories are walked in
    turn. A ``PermissionError`` raised while walking a sub-directory is turned
    into a warning and that sub-directory is skipped. If ``m`` is not a
    directory, ``m`` itself is yielded.

    :param m: path of a file or directory (``str`` or ``os.PathLike``)
    :return: generator of file paths
    """
    # XXX do this in modstool, too
    if os.path.isdir(m):
        tqdm.write(f'Scanning directory {m}')
        # Use scandir as a context manager so the underlying directory handle
        # is released even if the consumer stops iterating early.
        with os.scandir(m) as entries:
            for f in tqdm(entries, leave=False):
                if f.is_file() and not f.name.startswith('.'):
                    yield f.path
                elif f.is_dir():
                    try:
                        yield from walk(f.path)
                    except PermissionError:
                        warnings.warn(f"Error walking {f.path}")
    else:
        # m is a plain path here (a str from the command line or from
        # DirEntry.path), which has no ``.path`` attribute. os.fspath() returns
        # a str unchanged and also accepts os.PathLike objects such as
        # os.DirEntry.
        yield os.fspath(m)


@click.command()
@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
              default='alto_info_df.pkl', show_default=True)
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
    """
    A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.

    INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
    all files in the directory.

    alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
    """
    # NOTE: click renders the docstring above as the command's --help text,
    # so it is user-facing output and is kept as-is.
    #
    # Parameters (filled in by click):
    #   alto_files  - ALTO files and/or directories given on the command line
    #   output_file - path of the pickled DataFrame; the warnings CSV is
    #                 written next to it as "<output_file>.warnings.csv"
    #   output_csv  - optional path for an additional CSV export
    #   output_xlsx - optional path for an additional Excel export

    # Only needed to print the stack trace of per-file failures below.
    import traceback

    # Extend file list if directories are given
    alto_files_real = [path for m in alto_files for path in walk(m)]

    # Process ALTO files
    alto_info = []
    # newline='' is what the csv module asks for: without it, rows end in
    # "\r\r\n" on platforms that translate "\n" to "\r\n".
    with open(output_file + '.warnings.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        logger.info('Processing ALTO files')
        for alto_file in tqdm(alto_files_real, leave=False):
            try:
                root = ET.parse(alto_file).getroot()
                alto = root # XXX .find('alto:alto', ns) does not work here

                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter('always')  # do NOT filter double occurrences

                    # ALTO
                    d = flatten(alto_to_dict(alto, raise_errors=True))
                    # "meta"
                    d['alto_file'] = alto_file
                    d['alto_xmlns'] = ET.QName(alto).namespace

                    alto_info.append(d)

                    # One CSV row per warning recorded while converting this file
                    for caught_warning in caught_warnings:
                        csvwriter.writerow([alto_file, caught_warning.message])
            except Exception as e:
                # Best effort: a file that cannot be parsed/converted is
                # reported and skipped, the remaining files are still processed.
                logger.error('Exception in {}: {}'.format(alto_file, e))
                traceback.print_exc()

    # Convert the alto_info List[Dict] to a pandas DataFrame
    # dict.fromkeys() keeps first-seen order and de-duplicates in linear time.
    columns = list(dict.fromkeys(c for m in alto_info for c in m))
    data = [[m.get(c) for c in columns] for m in alto_info]
    index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?
    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)

    # Pickle the DataFrame
    logger.info('Writing DataFrame to {}'.format(output_file))
    alto_info_df.to_pickle(output_file)
    if output_csv:
        logger.info('Writing CSV to {}'.format(output_csv))
        alto_info_df.to_csv(output_csv)
    if output_xlsx:
        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
        alto_info_df.to_excel(output_xlsx)


def main():
    """Command line entry point.

    Sets up INFO-level logging, registers every prefix/URI pair of ``ns``
    with lxml and then hands over to the click command ``process``.
    """
    logging.basicConfig(level=logging.INFO)

    # Make the known namespace prefixes available to lxml
    for prefix_and_uri in ns.items():
        ET.register_namespace(*prefix_and_uri)

    process()


# Run the command line tool only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`#!/usr/bin/env python3`
			`import csv`
			`import logging`
			`import os`
			`import re`
			`import warnings`
			`import sys`
🚧 ALTO: Calculate mean of String@WC 2022-05-23 19:12:39 +02:00			`from xml.dom.expatbuilder import Namespaces`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`from lxml import etree as ET`
			`from itertools import groupby`
			`from operator import attrgetter`
			`from typing import List`
			`from collections.abc import MutableMapping, Sequence`

			`import click`
			`import pandas as pd`
🚧 ALTO: Calculate mean of String@WC 2022-05-23 19:12:39 +02:00			`import numpy as np`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`from tqdm import tqdm`

			`from .lib import TagGroup, sorted_groupby, flatten, ns`


✨ Rename altotool to alto4pandas See gh-15. 2022-06-16 19:27:54 +02:00			`logger = logging.getLogger('alto4pandas')`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00


			`def alto_to_dict(alto, raise_errors=True):`
			`"""Convert ALTO metadata to a nested dictionary"""`

			`value = {}`

			`# Iterate through each group of tags`
			`for tag, group in sorted_groupby(alto, key=attrgetter('tag')):`
			`group = list(group)`

✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00			`localname = ET.QName(tag).localname`
🚧 alto4pandas: Determine ALTO namespace for group 2022-06-17 17:01:07 +02:00			`alto_namespace = ET.QName(tag).namespace`
			`namespaces={"alto": alto_namespace}`
✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00
			`if localname == 'Description':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)`
			`elif localname == 'MeasurementUnit':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'OCRProcessing':`
			`value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)`
✨ ALTO: Support more ALTO versions 2022-05-10 19:32:26 +02:00			`elif localname == 'Processing':`
			`# TODO This enumerated descent is used more than once, DRY!`
			`for n, e in enumerate(group):`
			`value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)`
✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00			`elif localname == 'ocrProcessingStep':`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`for n, e in enumerate(group):`
✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00			`value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)`
			`elif localname == 'preProcessingStep':`
✨ ALTO: preProcessingStep/processingAgency/sourceImageInformation etc. 2022-05-10 14:27:39 +02:00			`for n, e in enumerate(group):`
✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00			`value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)`
			`elif localname == 'processingDateTime':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'processingSoftware':`
			`value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)`
			`elif localname == 'processingAgency':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'processingStepDescription':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'processingStepSettings':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'softwareCreator':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'softwareName':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`
			`elif localname == 'softwareVersion':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`

			`elif localname == 'sourceImageInformation':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)`
			`elif localname == 'fileName':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()`

			`elif localname == 'Layout':`
			`value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)`
			`elif localname == 'Page':`
🚧 ALTO: Move xpath_statistics to TagGroup class 2022-05-23 19:39:21 +02:00			`value[localname] = {}`
			`value[localname].update(TagGroup(tag, group).is_singleton().attributes())`
			`value[localname].update(TagGroup(tag, group).subelement_counts())`
			`value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))`
🚧 ALTO: Calculate mean of String@WC 2022-05-23 19:12:39 +02:00
✨ Count all alto:String elements with TAGREFS attribute 2022-06-17 17:59:34 +02:00			`# Count all alto:String elements with TAGREFS attribute`
			`value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))`

✨ ALTO: Support more ALTO versions 2022-05-10 17:46:50 +02:00			`elif localname == 'Styles':`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`pass`
✨ ALTO: Support more ALTO versions 2022-05-10 19:32:26 +02:00			`elif localname == 'Tags':`
✨ ALTO: Count alto:Tags 2022-06-17 17:32:17 +02:00			`value[localname] = {}`
			`value[localname].update(TagGroup(tag, group).subelement_counts())`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`else:`
			`if raise_errors:`
			`print(value)`
			`raise ValueError('Unknown tag "{}"'.format(tag))`
			`else:`
			`pass`

			`return value`



			`def walk(m):`
			`# XXX do this in modstool, too`
			`if os.path.isdir(m):`
🐛 Use tqdm's write() instead of logging during scanning 2022-05-10 17:57:36 +02:00			`tqdm.write(f'Scanning directory {m}')`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`for f in tqdm(os.scandir(m), leave=False):`
			`if f.is_file() and not f.name.startswith('.'):`
			`yield f.path`
			`elif f.is_dir():`
✨ ALTO: Handle PermissionErrors 2022-05-09 18:28:31 +02:00			`try:`
			`yield from walk(f.path)`
			`except PermissionError:`
			`warnings.warn(f"Error walking {f.path}")`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`else:`
			`yield m.path`



			`@click.command()`
			`@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)`
			`@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',`
			`default='alto_info_df.pkl', show_default=True)`
			`@click.option('--output-csv', type=click.Path(), help='Output CSV file')`
			`@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')`
			`def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):`
			`"""`
			`A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.`

			`INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads`
			`all files in the directory.`

✨ Rename altotool to alto4pandas See gh-15. 2022-06-16 19:27:54 +02:00			`alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`"""`

			`# Extend file list if directories are given`
			`alto_files_real = []`
			`for m in alto_files:`
			`for x in walk(m):`
			`alto_files_real.append(x)`

			`# Process ALTO files`
			`with open(output_file + '.warnings.csv', 'w') as csvfile:`
			`csvwriter = csv.writer(csvfile)`
			`alto_info = []`
			`logger.info('Processing ALTO files')`
			`for alto_file in tqdm(alto_files_real, leave=False):`
			`try:`
			`root = ET.parse(alto_file).getroot()`
			`alto = root # XXX .find('alto:alto', ns) does not work here`

			`with warnings.catch_warnings(record=True) as caught_warnings:`
			`warnings.simplefilter('always') # do NOT filter double occurrences`

🧹 Do not duplicate ALTO metadata 2022-05-06 19:36:50 +02:00			`# ALTO`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00			`d = flatten(alto_to_dict(alto, raise_errors=True))`
			`# "meta"`
			`d['alto_file'] = alto_file`
✨ ALTO: Extract namespace == ALTO version 2022-06-08 18:25:33 +02:00			`d['alto_xmlns'] = ET.QName(alto).namespace`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00
			`alto_info.append(d)`

			`if caught_warnings:`
			`# PyCharm thinks caught_warnings is not Iterable:`
			`# noinspection PyTypeChecker`
			`for caught_warning in caught_warnings:`
			`csvwriter.writerow([alto_file, caught_warning.message])`
			`except Exception as e:`
			`logger.error('Exception in {}: {}'.format(alto_file, e))`
🚧 ALTO: Calculate mean of String@WC 2022-05-23 19:12:39 +02:00			`import traceback; traceback.print_exc()`
🚧 Add support for ALTO Description 2022-05-04 20:02:27 +02:00
			`# Convert the alto_info List[Dict] to a pandas DataFrame`
			`columns = []`
			`for m in alto_info:`
			`for c in m.keys():`
			`if c not in columns:`
			`columns.append(c)`
			`data = [[m.get(c) for c in columns] for m in alto_info]`
			`index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?`
			`alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)`

			`# Pickle the DataFrame`
			`logger.info('Writing DataFrame to {}'.format(output_file))`
			`alto_info_df.to_pickle(output_file)`
			`if output_csv:`
			`logger.info('Writing CSV to {}'.format(output_csv))`
			`alto_info_df.to_csv(output_csv)`
			`if output_xlsx:`
			`logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))`
			`alto_info_df.to_excel(output_xlsx)`


			`def main():`
			`logging.basicConfig(level=logging.INFO)`

			`for prefix, uri in ns.items():`
			`ET.register_namespace(prefix, uri)`

			`process()`


			`if __name__ == '__main__':`
			`main()`