🎨 Reformat (Black)

2025-08-16 13:09:53 +02:00 · 2025-06-12 09:51:02 +02:00 · 2025-06-12 09:51:02 +02:00 · 212df99436
commit 212df99436
parent 5c9858a061
7 changed files with 639 additions and 355 deletions
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@ -17,7 +17,16 @@ from collections.abc import MutableMapping, Sequence
 import click
 from tqdm import tqdm

-from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types
+from .lib import (
+    convert_db_to_parquet,
+    sorted_groupby,
+    TagGroup,
+    ns,
+    flatten,
+    insert_into_db,
+    insert_into_db_multiple,
+    current_columns_types,
+)

 with warnings.catch_warnings():
    # Filter warnings on WSL
@ -26,7 +35,8 @@ with warnings.catch_warnings():
    import pandas as pd


-logger = logging.getLogger('mods4pandas')
+logger = logging.getLogger("mods4pandas")
+

 def mods_to_dict(mods, raise_errors=True):
    """Convert MODS metadata to a nested dictionary"""
@ -37,179 +47,290 @@ def mods_to_dict(mods, raise_errors=True):
    value = {}

    # Iterate through each group of tags
-    for tag, group in sorted_groupby(mods, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(mods, key=attrgetter("tag")):
        group = list(group)
-        if tag == '{http://www.loc.gov/mods/v3}location':
+        if tag == "{http://www.loc.gov/mods/v3}location":
+
            def only_current_location(location):
-                return location.get('type') != 'former'
-            value['location'] = TagGroup(tag, group) \
-                .filter(only_current_location) \
-                .has_attributes([{}, {'type': 'current'}]) \
-                .is_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}physicalLocation':
+                return location.get("type") != "former"
+
+            value["location"] = (
+                TagGroup(tag, group)
+                .filter(only_current_location)
+                .has_attributes([{}, {"type": "current"}])
+                .is_singleton()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}physicalLocation":
+
            def no_display_label(physical_location):
-                return physical_location.get('displayLabel') is None
-            value['physicalLocation'] = TagGroup(tag, group).filter(no_display_label).text()
-        elif tag == '{http://www.loc.gov/mods/v3}shelfLocator':
+                return physical_location.get("displayLabel") is None
+
+            value["physicalLocation"] = (
+                TagGroup(tag, group).filter(no_display_label).text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}shelfLocator":
            # This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain
            # a second element with empty text and a "displayLabel" attribute set.
            def no_display_label(shelf_locator):
-                return shelf_locator.get('displayLabel') is None
-            value['shelfLocator'] = TagGroup(tag, group) \
-                .filter(no_display_label) \
-                .force_singleton() \
-                .has_no_attributes() \
+                return shelf_locator.get("displayLabel") is None
+
+            value["shelfLocator"] = (
+                TagGroup(tag, group)
+                .filter(no_display_label)
+                .force_singleton()
+                .has_no_attributes()
                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}originInfo':
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}originInfo":
+
            def has_event_type(origin_info):
                # According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some
                # are empty and not fixable.
-                return origin_info.attrib.get('eventType') is not None
-            tag_group = TagGroup(tag, group).fix_event_type().filter(has_event_type, warn="has no eventType")
-            for event_type, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['eventType']):
+                return origin_info.attrib.get("eventType") is not None
+
+            tag_group = (
+                TagGroup(tag, group)
+                .fix_event_type()
+                .filter(has_event_type, warn="has no eventType")
+            )
+            for event_type, grouped_group in sorted_groupby(
+                tag_group.group, key=lambda g: g.attrib["eventType"]
+            ):
                for n, e in enumerate(grouped_group):
-                    value['originInfo-{}{}'.format(event_type, n)] = mods_to_dict(e, raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}place':
-            value['place'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}placeTerm':
-            value['placeTerm'] = TagGroup(tag, group).is_singleton().has_attributes({'type': 'text'}).text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateIssued':
-            value['dateIssued'] = TagGroup(tag, group) \
-                .fix_date() \
-                .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
-                .ignore_attributes() \
-                .force_singleton() \
+                    value["originInfo-{}{}".format(event_type, n)] = mods_to_dict(
+                        e, raise_errors
+                    )
+        elif tag == "{http://www.loc.gov/mods/v3}place":
+            value["place"] = (
+                TagGroup(tag, group)
+                .force_singleton(warn=False)
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}placeTerm":
+            value["placeTerm"] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_attributes({"type": "text"})
                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateCreated':
-            value['dateCreated'] = TagGroup(tag, group) \
-                .fix_date() \
-                .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
-                .ignore_attributes() \
-                .force_singleton() \
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateIssued":
+            value["dateIssued"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
+                .ignore_attributes()
+                .force_singleton()
                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateCaptured':
-            value['dateCaptured'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateOther':
-            value['dateOther'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
-        elif tag == '{http://www.loc.gov/mods/v3}publisher':
-            value['publisher'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}edition':
-            value['edition'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}classification':
-            authorities = {e.attrib['authority'] for e in group}
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateCreated":
+            value["dateCreated"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
+                .ignore_attributes()
+                .force_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateCaptured":
+            value["dateCaptured"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .ignore_attributes()
+                .is_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateOther":
+            value["dateOther"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .ignore_attributes()
+                .is_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}publisher":
+            value["publisher"] = (
+                TagGroup(tag, group)
+                .force_singleton(warn=False)
+                .has_no_attributes()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}edition":
+            value["edition"] = (
+                TagGroup(tag, group).force_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}classification":
+            authorities = {e.attrib["authority"] for e in group}
            for authority in authorities:
-                sub_group = [e for e in group if e.attrib.get('authority') == authority]
-                value['classification-{}'.format(authority)] = TagGroup(tag, sub_group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}recordInfo':
-            value['recordInfo'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}recordIdentifier':
+                sub_group = [e for e in group if e.attrib.get("authority") == authority]
+                value["classification-{}".format(authority)] = TagGroup(
+                    tag, sub_group
+                ).text_set()
+        elif tag == "{http://www.loc.gov/mods/v3}recordInfo":
+            value["recordInfo"] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}recordIdentifier":
            # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
            # however, in mods:relatedItems, there may be source="dnb-ppns",
            # which we need to distinguish by using a separate field name.
            try:
-                value['recordIdentifier'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'gbv-ppn'}).text()
+                value["recordIdentifier"] = (
+                    TagGroup(tag, group)
+                    .is_singleton()
+                    .has_attributes({"source": "gbv-ppn"})
+                    .text()
+                )
            except ValueError:
-                value['recordIdentifier-dnb-ppn'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'dnb-ppn'}).text()
-        elif tag == '{http://www.loc.gov/mods/v3}identifier':
+                value["recordIdentifier-dnb-ppn"] = (
+                    TagGroup(tag, group)
+                    .is_singleton()
+                    .has_attributes({"source": "dnb-ppn"})
+                    .text()
+                )
+        elif tag == "{http://www.loc.gov/mods/v3}identifier":
            for e in group:
                if len(e.attrib) != 1:
-                    raise ValueError('Unknown attributes for identifier {}'.format(e.attrib))
-                value['identifier-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}titleInfo':
+                    raise ValueError(
+                        "Unknown attributes for identifier {}".format(e.attrib)
+                    )
+                value["identifier-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}titleInfo":
+
            def only_standard_title(title_info):
-                return title_info.attrib.get('type') is None
-            value['titleInfo'] = TagGroup(tag, group) \
-                .filter(only_standard_title) \
-                .is_singleton().has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}title':
-            value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}partName':
-            value['partName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}subTitle':
-            value['subTitle'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}note':
+                return title_info.attrib.get("type") is None
+
+            value["titleInfo"] = (
+                TagGroup(tag, group)
+                .filter(only_standard_title)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}title":
+            value["title"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}partName":
+            value["partName"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}subTitle":
+            value["subTitle"] = (
+                TagGroup(tag, group).force_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}note":
            # This could be useful if distinguished by type attribute.
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}part':
+        elif tag == "{http://www.loc.gov/mods/v3}part":
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}abstract':
-            value['abstract'] = TagGroup(tag, group).has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}subject':
-            authorities = {e.attrib.get('authority') for e in group}
+        elif tag == "{http://www.loc.gov/mods/v3}abstract":
+            value["abstract"] = TagGroup(tag, group).has_no_attributes().text()
+        elif tag == "{http://www.loc.gov/mods/v3}subject":
+            authorities = {e.attrib.get("authority") for e in group}
            for authority in authorities:
-                k = 'subject-{}'.format(authority) if authority is not None else 'subject'
-                sub_group = [e for e in group if e.attrib.get('authority') == authority]
-                value[k] = TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}topic':
+                k = (
+                    "subject-{}".format(authority)
+                    if authority is not None
+                    else "subject"
+                )
+                sub_group = [e for e in group if e.attrib.get("authority") == authority]
+                value[k] = (
+                    TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
+                )
+        elif tag == "{http://www.loc.gov/mods/v3}topic":
            TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}cartographics':
+        elif tag == "{http://www.loc.gov/mods/v3}cartographics":
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}geographic':
+        elif tag == "{http://www.loc.gov/mods/v3}geographic":
            TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}temporal':
+        elif tag == "{http://www.loc.gov/mods/v3}temporal":
            TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}genre':
-            authorities = {e.attrib.get('authority') for e in group}
+        elif tag == "{http://www.loc.gov/mods/v3}genre":
+            authorities = {e.attrib.get("authority") for e in group}
            for authority in authorities:
-                k = 'genre-{}'.format(authority) if authority is not None else 'genre'
-                value[k] = {e.text for e in group if e.attrib.get('authority') == authority}
-        elif tag == '{http://www.loc.gov/mods/v3}language':
-            value["language"] = TagGroup(tag, group) \
-                .merge_sub_tags_to_set()
-        elif tag == '{http://www.loc.gov/mods/v3}languageTerm':
-            value['languageTerm'] = TagGroup(tag, group) \
-                .has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \
+                k = "genre-{}".format(authority) if authority is not None else "genre"
+                value[k] = {
+                    e.text for e in group if e.attrib.get("authority") == authority
+                }
+        elif tag == "{http://www.loc.gov/mods/v3}language":
+            value["language"] = TagGroup(tag, group).merge_sub_tags_to_set()
+        elif tag == "{http://www.loc.gov/mods/v3}languageTerm":
+            value["languageTerm"] = (
+                TagGroup(tag, group)
+                .has_attributes({"authority": "iso639-2b", "type": "code"})
                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}scriptTerm':
-            value['scriptTerm'] = TagGroup(tag, group) \
-                .fix_script_term() \
-                .has_attributes({'authority': 'iso15924', 'type': 'code'}) \
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}scriptTerm":
+            value["scriptTerm"] = (
+                TagGroup(tag, group)
+                .fix_script_term()
+                .has_attributes({"authority": "iso15924", "type": "code"})
                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}relatedItem':
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}relatedItem":
            tag_group = TagGroup(tag, group)
-            for type_, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['type']):
-                sub_tag = 'relatedItem-{}'.format(type_)
+            for type_, grouped_group in sorted_groupby(
+                tag_group.group, key=lambda g: g.attrib["type"]
+            ):
+                sub_tag = "relatedItem-{}".format(type_)
                grouped_group = list(grouped_group)
                if type_ in ["original", "host"]:
-                    value[sub_tag] = TagGroup(sub_tag, grouped_group).is_singleton().descend(raise_errors)
+                    value[sub_tag] = (
+                        TagGroup(sub_tag, grouped_group)
+                        .is_singleton()
+                        .descend(raise_errors)
+                    )
                else:
                    # TODO type="series"
                    pass
-        elif tag == '{http://www.loc.gov/mods/v3}name':
+        elif tag == "{http://www.loc.gov/mods/v3}name":
            for n, e in enumerate(group):
-                value['name{}'.format(n)] = mods_to_dict(e, raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}role':
-            value["role"] = TagGroup(tag, group) \
-                .has_no_attributes() \
-                .merge_sub_tags_to_set()
-        elif tag == '{http://www.loc.gov/mods/v3}roleTerm':
-            value['roleTerm'] = TagGroup(tag, group) \
-                .has_attributes({'authority': 'marcrelator', 'type': 'code'}) \
+                value["name{}".format(n)] = mods_to_dict(e, raise_errors)
+        elif tag == "{http://www.loc.gov/mods/v3}role":
+            value["role"] = (
+                TagGroup(tag, group).has_no_attributes().merge_sub_tags_to_set()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}roleTerm":
+            value["roleTerm"] = (
+                TagGroup(tag, group)
+                .has_attributes({"authority": "marcrelator", "type": "code"})
                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}namePart':
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}namePart":
            for e in group:
-                if not e.attrib.get('type'):
-                    value['namePart'] = e.text
+                if not e.attrib.get("type"):
+                    value["namePart"] = e.text
                else:
-                    value['namePart-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}nameIdentifier':
+                    value["namePart-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}nameIdentifier":
            # TODO Use this (e.g. <mods:nameIdentifier type="ppn">106168096</mods:nameIdentifier>) or the
            # mods:name@valueURI to disambiguate
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}displayForm':
-            value['displayForm'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}physicalDescription':
+        elif tag == "{http://www.loc.gov/mods/v3}displayForm":
+            value["displayForm"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}physicalDescription":
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}extension':
+        elif tag == "{http://www.loc.gov/mods/v3}extension":
            pass
-        elif tag == '{http://www.loc.gov/mods/v3}accessCondition':
+        elif tag == "{http://www.loc.gov/mods/v3}accessCondition":
            for e in group:
-                if not e.attrib.get('type'):
-                    raise ValueError('Unknown attributes for accessCondition {}'.format(e.attrib))
-                value['accessCondition-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}typeOfResource':
-            value['typeOfResource'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}mods':
+                if not e.attrib.get("type"):
+                    raise ValueError(
+                        "Unknown attributes for accessCondition {}".format(e.attrib)
+                    )
+                value["accessCondition-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}typeOfResource":
+            value["typeOfResource"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}mods":
            # XXX Ignore nested mods:mods for now (used in mods:subject)
            pass
        else:
@ -230,30 +351,29 @@ def mets_to_dict(mets, raise_errors=True):
    value = {}

    # Iterate through each group of tags
-    for tag, group in sorted_groupby(mets, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(mets, key=attrgetter("tag")):
        group = list(group)

        # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
        #     (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
-        if tag == '{http://www.loc.gov/METS/}amdSec':
+        if tag == "{http://www.loc.gov/METS/}amdSec":
            pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}dmdSec':
+        elif tag == "{http://www.loc.gov/METS/}dmdSec":
            pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}metsHdr':
+        elif tag == "{http://www.loc.gov/METS/}metsHdr":
            pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}structLink':
+        elif tag == "{http://www.loc.gov/METS/}structLink":
            pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}structMap':
+        elif tag == "{http://www.loc.gov/METS/}structMap":
            pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}fileSec':
-            value['fileSec'] = TagGroup(tag, group) \
-                .is_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/METS/}fileGrp':
+        elif tag == "{http://www.loc.gov/METS/}fileSec":
+            value["fileSec"] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif tag == "{http://www.loc.gov/METS/}fileGrp":
            for e in group:
-                use = e.attrib.get('USE')
+                use = e.attrib.get("USE")
                if not use:
-                    raise ValueError('No USE attribute for fileGrp {}'.format(e))
-                value[f'fileGrp-{use}-count'] = len(e)
+                    raise ValueError("No USE attribute for fileGrp {}".format(e))
+                value[f"fileGrp-{use}-count"] = len(e)
        else:
            if raise_errors:
                print(value)
@ -262,6 +382,7 @@ def mets_to_dict(mets, raise_errors=True):
                pass
    return value

+
 def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
    # TODO replace asserts by ValueError

@ -269,23 +390,36 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:

    # PPN
    def get_mets_recordIdentifier(*, source="gbv-ppn"):
-        return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
-                           namespaces=ns) or [None])[0].text
+        return (
+            mets.xpath(
+                f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
+                namespaces=ns,
+            )
+            or [None]
+        )[0].text
+
    ppn = get_mets_recordIdentifier()

    # Getting per-page/structure information is a bit different
    structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
    structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
-    fileSec = mets.find('./mets:fileSec', ns)
+    fileSec = mets.find("./mets:fileSec", ns)
    if structMap_PHYSICAL is None:
        # This is expected in a multivolume work or periodical!
        if any(
-                structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
-                for t in ["multivolume_work", "MultivolumeWork", "multivolume_manuscript", "periodical"]
+            structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
+            for t in [
+                "multivolume_work",
+                "MultivolumeWork",
+                "multivolume_manuscript",
+                "periodical",
+            ]
        ):
            return []
        else:
-            raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
+            raise ValueError(
+                "No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)"
+            )
    if structMap_LOGICAL is None:
        raise ValueError("No structMap[@TYPE='LOGICAL'] found")
    if fileSec is None:
@ -294,13 +428,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
    div_physSequence = structMap_PHYSICAL[0]
    assert div_physSequence.attrib.get("TYPE") == "physSequence"

-
    # Build a look-up table to get mets:file by @ID
    # This cuts retrieving the mets:file down to half the time.
    mets_file_by_ID = {}
+
    def _init_mets_file_by_ID():
-        for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns):
+        for f in fileSec.iterfind("./mets:fileGrp/mets:file", ns):
            mets_file_by_ID[f.attrib.get("ID")] = f
+
    _init_mets_file_by_ID()

    def get_mets_file(*, ID):
@ -312,7 +447,6 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
            return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)

    for page in div_physSequence:
-
        # TODO sort by ORDER?
        assert page.attrib.get("TYPE") == "page"
        page_dict = {}
@ -326,7 +460,9 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
            file_ = get_mets_file(ID=file_id)
            assert file_ is not None
            fileGrp_USE = file_.getparent().attrib.get("USE")
-            file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
+            file_FLocat_href = (
+                file_.xpath("mets:FLocat/@xlink:href", namespaces=ns) or [None]
+            )[0]
            if file_FLocat_href is not None:
                file_FLocat_href = str(file_FLocat_href)
            page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
@ -343,7 +479,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
            # it suffices to do this the old-fashioned way.

            sm_links = mets.findall(
-                    f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
+                f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
            )

            targets = []
@ -378,10 +514,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:


@click.command()
-@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
-              default='mods_info_df.parquet', show_default=True)
-@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
+@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1)
+@click.option(
+    "--output",
+    "-o",
+    "output_file",
+    type=click.Path(),
+    help="Output Parquet file",
+    default="mods_info_df.parquet",
+    show_default=True,
+)
+@click.option(
+    "--output-page-info", type=click.Path(), help="Output page info Parquet file"
+)
 def process_command(mets_files: list[str], output_file: str, output_page_info: str):
    """
    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@ -395,18 +540,21 @@ def process_command(mets_files: list[str], output_file: str, output_page_info: s
    """
    process(mets_files, output_file, output_page_info)

+
 def process(mets_files: list[str], output_file: str, output_page_info: str):
    # Extend file list if directories are given
    mets_files_real: list[str] = []
    for m in mets_files:
        if os.path.isdir(m):
-            logger.info('Scanning directory {}'.format(m))
-            mets_files_real.extend(f.path for f in tqdm(os.scandir(m), leave=False)
-                                   if f.is_file() and not f.name.startswith('.'))
+            logger.info("Scanning directory {}".format(m))
+            mets_files_real.extend(
+                f.path
+                for f in tqdm(os.scandir(m), leave=False)
+                if f.is_file() and not f.name.startswith(".")
+            )
        else:
            mets_files_real.append(m)

-
    # Prepare output files
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file)
@ -414,28 +562,28 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file_sqlite3)

-    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
    con = sqlite3.connect(output_file_sqlite3)

    if output_page_info:
        output_page_info_sqlite3 = output_page_info + ".sqlite3"
-        logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3))
+        logger.info("Writing SQLite DB to {}".format(output_page_info_sqlite3))
        with contextlib.suppress(FileNotFoundError):
            os.remove(output_page_info_sqlite3)
        con_page_info = sqlite3.connect(output_page_info_sqlite3)

    # Process METS files
-    with open(output_file + '.warnings.csv', 'w') as csvfile:
+    with open(output_file + ".warnings.csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile)
-        logger.info('Processing METS files')
+        logger.info("Processing METS files")
        for mets_file in tqdm(mets_files_real, leave=True):
            try:
                root = ET.parse(mets_file).getroot()
-                mets = root # XXX .find('mets:mets', ns) does not work here
-                mods = root.find('mets:dmdSec//mods:mods', ns)
+                mets = root  # XXX .find('mets:mets', ns) does not work here
+                mods = root.find("mets:dmdSec//mods:mods", ns)

                with warnings.catch_warnings(record=True) as caught_warnings:
-                    warnings.simplefilter('always')  # do NOT filter double occurrences
+                    warnings.simplefilter("always")  # do NOT filter double occurrences

                    # MODS
                    d = flatten(mods_to_dict(mods, raise_errors=True))
@ -445,7 +593,7 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                    for k, v in d_mets.items():
                        d[f"mets_{k}"] = v
                    # "meta"
-                    d['mets_file'] = mets_file
+                    d["mets_file"] = mets_file

                    # Save
                    insert_into_db(con, "mods_info", d)
@ -453,8 +601,12 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):

                    # METS - per-page
                    if output_page_info:
-                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
-                        insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
+                        page_info_doc: list[dict] = pages_to_dict(
+                            mets, raise_errors=True
+                        )
+                        insert_into_db_multiple(
+                            con_page_info, "page_info", page_info_doc
+                        )
                        con_page_info.commit()

                    if caught_warnings:
@ -463,13 +615,15 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                        for caught_warning in caught_warnings:
                            csvwriter.writerow([mets_file, caught_warning.message])
            except Exception as e:
-                logger.exception('Exception in {}'.format(mets_file))
+                logger.exception("Exception in {}".format(mets_file))

-    logger.info('Writing DataFrame to {}'.format(output_file))
+    logger.info("Writing DataFrame to {}".format(output_file))
    convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
    if output_page_info:
-          logger.info('Writing DataFrame to {}'.format(output_page_info))
-          convert_db_to_parquet(con_page_info, "page_info", ["ppn", "ID"], output_page_info)
+        logger.info("Writing DataFrame to {}".format(output_page_info))
+        convert_db_to_parquet(
+            con_page_info, "page_info", ["ppn", "ID"], output_page_info
+        )


 def main():
@ -481,5 +635,5 @@ def main():
    process_command()


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()