1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-08-13 11:39:53 +02:00
modstool/src/mods4pandas/alto4pandas.py

256 lines
8.6 KiB
Python
Raw Normal View History

2022-05-04 20:02:27 +02:00
#!/usr/bin/env python3
2025-06-13 19:20:48 +02:00
import contextlib
2022-05-04 20:02:27 +02:00
import csv
import os
import sqlite3
2025-06-13 19:20:48 +02:00
import warnings
2022-05-04 20:02:27 +02:00
from operator import attrgetter
from typing import List
import click
from loguru import logger
2025-06-13 19:20:48 +02:00
from lxml import etree as ET
2022-05-04 20:02:27 +02:00
from tqdm import tqdm
2025-06-12 09:51:02 +02:00
from .lib import (
TagGroup,
convert_db_to_parquet,
flatten,
insert_into_db,
2025-06-13 19:20:48 +02:00
ns,
sorted_groupby,
2025-06-12 09:51:02 +02:00
)
def alto_to_dict(alto, raise_errors=True):
    """Convert ALTO metadata to a nested dictionary.

    Args:
        alto: An lxml element (the ALTO root or any sub-element) whose
            children are converted recursively.
        raise_errors: If True, raise a ValueError when an unknown tag is
            encountered; if False, unknown tags are silently skipped.

    Returns:
        dict mapping element local names (or enumerated keys such as
        "Processing0", "Processing1", ...) to the converted values.

    Raises:
        ValueError: On an unknown tag, if raise_errors is True.
    """
    # Dispatch tables replacing the former long if/elif chain (see the old
    # "TODO This enumerated descent is used more than once, DRY!").
    # Local names are distinct across the sets, so branch order is irrelevant.

    # Occur exactly once, carry no attributes, represented by text content.
    text_tags = {
        "MeasurementUnit",
        "processingDateTime",
        "processingAgency",
        "processingStepDescription",
        "processingStepSettings",
        "softwareCreator",
        "softwareName",
        "softwareVersion",
        "fileName",
        "fileIdentifier",
    }
    # Occur exactly once, carry no attributes, converted by descending.
    plain_descend_tags = {"Description", "sourceImageInformation", "Layout"}
    # Occur exactly once (attributes allowed), converted by descending.
    descend_tags = {"OCRProcessing", "processingSoftware"}
    # May occur multiple times; each occurrence converted under an
    # enumerated key ("ocrProcessingStep0", "ocrProcessingStep1", ...).
    enumerated_tags = {"Processing", "ocrProcessingStep", "preProcessingStep"}

    value = {}
    # Iterate through each group of tags
    for tag, group in sorted_groupby(alto, key=attrgetter("tag")):
        group = list(group)
        localname = ET.QName(tag).localname
        alto_namespace = ET.QName(tag).namespace
        namespaces = {"alto": alto_namespace}

        if localname in text_tags:
            value[localname] = (
                TagGroup(tag, group).is_singleton().has_no_attributes().text()
            )
        elif localname in plain_descend_tags:
            value[localname] = (
                TagGroup(tag, group)
                .is_singleton()
                .has_no_attributes()
                .descend(raise_errors)
            )
        elif localname in descend_tags:
            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
        elif localname in enumerated_tags:
            for n, e in enumerate(group):
                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
        elif localname == "Page":
            value[localname] = {}
            value[localname].update(TagGroup(tag, group).is_singleton().attributes())
            # WIDTH/HEIGHT should be numeric; drop them when they are not.
            for attr in ("WIDTH", "HEIGHT"):
                if attr in value[localname]:
                    try:
                        value[localname][attr] = int(value[localname][attr])
                    except ValueError:
                        del value[localname][attr]
            value[localname].update(TagGroup(tag, group).subelement_counts())
            # Word-confidence statistics over all alto:String elements.
            value[localname].update(
                TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)
            )
            # Count all alto:String elements with TAGREFS attribute
            value[localname].update(
                TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces)
            )
        elif localname == "Styles":
            pass
        elif localname == "Tags":
            value[localname] = {}
            value[localname].update(TagGroup(tag, group).subelement_counts())
        else:
            if raise_errors:
                print(value)  # debugging aid: show what was collected so far
                raise ValueError('Unknown tag "{}"'.format(tag))

    return value
def walk(m):
    """Recursively yield the paths of all non-hidden files under *m*.

    If *m* is a regular file (or anything that is not a directory), it is
    yielded as-is. Unreadable subdirectories are skipped with a warning.

    Args:
        m: A path string (file or directory).

    Yields:
        Path strings of the files found.
    """
    # XXX do this in mods4pandas, too
    if os.path.isdir(m):
        logger.info(f"Scanning directory {m}")
        for f in os.scandir(m):
            if f.is_file() and not f.name.startswith("."):
                yield f.path
            elif f.is_dir():
                try:
                    yield from walk(f.path)
                except PermissionError:
                    warnings.warn(f"Error walking {f.path}")
    else:
        # BUGFIX: m is a plain path string here (both from the CLI caller
        # and from the recursive walk(f.path) call above), not an
        # os.DirEntry — the old `yield m.path` raised AttributeError.
        yield m
@click.command()
@click.argument("alto_files", type=click.Path(exists=True), required=True, nargs=-1)
@click.option(
    "--output",
    "-o",
    "output_file",
    type=click.Path(),
    help="Output Parquet file",
    default="alto_info_df.parquet",
    show_default=True,
)
def process_command(alto_files: List[str], output_file: str):
    """
    A tool to convert the ALTO metadata in ALTO_FILES to a pandas DataFrame.

    Each ALTO_FILES argument is assumed to be an ALTO document. It may
    optionally be a directory, in which case the tool reads all files in
    the directory (recursively).

    alto4pandas writes multiple output files:

    - A Parquet DataFrame
    - A SQLite database
    - and a CSV file with all conversion warnings.
    """
    # Thin CLI wrapper: all actual work happens in process().
    process(alto_files, output_file)
def process(alto_files: List[str], output_file: str):
    """Convert ALTO metadata from *alto_files* into a Parquet DataFrame.

    Besides *output_file* (Parquet), this writes:
    - output_file + ".sqlite3": SQLite DB with one "alto_info" row per file
    - output_file + ".warnings.csv": all conversion warnings

    Args:
        alto_files: ALTO file paths; directories are walked recursively.
        output_file: Path of the Parquet file to write.
    """
    # Extend file list if directories are given
    alto_files_real = [x for m in alto_files for x in walk(m)]

    # Prepare output files: remove stale results from a previous run.
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file)
    output_file_sqlite3 = output_file + ".sqlite3"
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file_sqlite3)

    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
    con = sqlite3.connect(output_file_sqlite3)
    try:
        # Process ALTO files
        with open(output_file + ".warnings.csv", "w") as csvfile:
            csvwriter = csv.writer(csvfile)
            logger.info("Processing ALTO files")
            for alto_file in tqdm(alto_files_real, leave=False):
                try:
                    root = ET.parse(alto_file).getroot()
                    alto = root  # XXX .find('alto:alto', ns) does not work here

                    with warnings.catch_warnings(record=True) as caught_warnings:
                        warnings.simplefilter("always")  # do NOT filter double occurrences

                        # ALTO
                        d = flatten(alto_to_dict(alto, raise_errors=True))

                        # "meta"
                        d["alto_file"] = alto_file
                        d["alto_xmlns"] = ET.QName(alto).namespace

                        # Save
                        insert_into_db(con, "alto_info", d)
                        con.commit()

                    if caught_warnings:
                        # PyCharm thinks caught_warnings is not Iterable:
                        # noinspection PyTypeChecker
                        for caught_warning in caught_warnings:
                            csvwriter.writerow([alto_file, caught_warning.message])
                except Exception as e:
                    # Best-effort: log and continue with the next file.
                    logger.error("Exception in {}: {}".format(alto_file, e))
                    import traceback

                    traceback.print_exc()

        # Convert the alto_info SQL to a pandas DataFrame
        logger.info("Writing DataFrame to {}".format(output_file))
        convert_db_to_parquet(con, "alto_info", "alto_file", output_file)
    finally:
        # BUGFIX: the connection was never closed before; release it even
        # when an exception escapes.
        con.close()
def main():
    """Entry point: register the ALTO/METS namespace prefixes, then run the CLI."""
    # Register prefixes globally so serialized XML uses conventional names.
    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)
    process_command()
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()