#!/usr/bin/env python3
import contextlib
import csv
import logging
import os
import sqlite3
import warnings
from operator import attrgetter
from typing import Dict, List

import click
from lxml import etree as ET
from tqdm import tqdm

from .lib import (
    TagGroup,
    convert_db_to_parquet,
    flatten,
    insert_into_db,
    insert_into_db_multiple,
    ns,
    sorted_groupby,
)

logger = logging.getLogger("mods4pandas")
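
# Example (illustrative sketch of programmatic use; the import path assumes
# the usual package layout and is not verified here):
#
#   from mods4pandas.mods4pandas import process
#   process(["/data/mets/"], "mods_info_df.parquet", "page_info_df.parquet")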


def mods_to_dict(mods, raise_errors=True):
    """Convert MODS metadata to a nested dictionary"""

    # The approach taken here is to handle each element explicitly. This also
    # means that ignored elements are ignored explicitly.

    value = {}

    # Iterate through each group of tags
    for tag, group in sorted_groupby(mods, key=attrgetter("tag")):
        group = list(group)
        if tag == "{http://www.loc.gov/mods/v3}location":

            def only_current_location(location):
                return location.get("type") != "former"

            value["location"] = (
                TagGroup(tag, group)
                .filter(only_current_location)
                .has_attributes([{}, {"type": "current"}])
                .is_singleton()
                .descend(raise_errors)
            )
        elif tag == "{http://www.loc.gov/mods/v3}physicalLocation":

            def no_display_label(physical_location):
                return physical_location.get("displayLabel") is None

            value["physicalLocation"] = (
                TagGroup(tag, group).filter(no_display_label).text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}shelfLocator":
            # This element should not be repeated according to MODS-AP 2.3.1;
            # however, a few of the files contain a second element with empty
            # text and a "displayLabel" attribute set.
            def no_display_label(shelf_locator):
                return shelf_locator.get("displayLabel") is None

            value["shelfLocator"] = (
                TagGroup(tag, group)
                .filter(no_display_label)
                .force_singleton()
                .has_no_attributes()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}originInfo":

            def has_event_type(origin_info):
                # According to MODS-AP 2.3.1, every originInfo should have its
                # eventType set. However, some are empty and not fixable.
                return origin_info.attrib.get("eventType") is not None

            tag_group = (
                TagGroup(tag, group)
                .fix_event_type()
                .filter(has_event_type, warn="has no eventType")
            )
            for event_type, grouped_group in sorted_groupby(
                tag_group.group, key=lambda g: g.attrib["eventType"]
            ):
                for n, e in enumerate(grouped_group):
                    value["originInfo-{}{}".format(event_type, n)] = mods_to_dict(
                        e, raise_errors
                    )
        elif tag == "{http://www.loc.gov/mods/v3}place":
            value["place"] = (
                TagGroup(tag, group)
                .force_singleton(warn=False)
                .has_no_attributes()
                .descend(raise_errors)
            )
        elif tag == "{http://www.loc.gov/mods/v3}placeTerm":
            value["placeTerm"] = (
                TagGroup(tag, group)
                .is_singleton()
                .has_attributes({"type": "text"})
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}dateIssued":
            value["dateIssued"] = (
                TagGroup(tag, group)
                .fix_date()
                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
                .ignore_attributes()
                .force_singleton()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}dateCreated":
            value["dateCreated"] = (
                TagGroup(tag, group)
                .fix_date()
                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
                .ignore_attributes()
                .force_singleton()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}dateCaptured":
            value["dateCaptured"] = (
                TagGroup(tag, group)
                .fix_date()
                .ignore_attributes()
                .is_singleton()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}dateOther":
            value["dateOther"] = (
                TagGroup(tag, group)
                .fix_date()
                .ignore_attributes()
                .is_singleton()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}publisher":
            value["publisher"] = (
                TagGroup(tag, group)
                .force_singleton(warn=False)
                .has_no_attributes()
                .text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}edition":
            value["edition"] = (
                TagGroup(tag, group).force_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}classification":
            authorities = {e.attrib["authority"] for e in group}
            for authority in authorities:
                sub_group = [e for e in group if e.attrib.get("authority") == authority]
                value["classification-{}".format(authority)] = TagGroup(
                    tag, sub_group
                ).text_set()
        elif tag == "{http://www.loc.gov/mods/v3}recordInfo":
            value["recordInfo"] = (
                TagGroup(tag, group)
                .is_singleton()
                .has_no_attributes()
                .descend(raise_errors)
            )
        elif tag == "{http://www.loc.gov/mods/v3}recordIdentifier":
            # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs);
            # however, in mods:relatedItem there may be source="dnb-ppn",
            # which we need to distinguish by using a separate field name.
            try:
                value["recordIdentifier"] = (
                    TagGroup(tag, group)
                    .is_singleton()
                    .has_attributes({"source": "gbv-ppn"})
                    .text()
                )
            except ValueError:
                value["recordIdentifier-dnb-ppn"] = (
                    TagGroup(tag, group)
                    .is_singleton()
                    .has_attributes({"source": "dnb-ppn"})
                    .text()
                )
        elif tag == "{http://www.loc.gov/mods/v3}identifier":
            for e in group:
                if len(e.attrib) != 1:
                    raise ValueError(
                        "Unknown attributes for identifier {}".format(e.attrib)
                    )
                value["identifier-{}".format(e.attrib["type"])] = e.text
        elif tag == "{http://www.loc.gov/mods/v3}titleInfo":

            def only_standard_title(title_info):
                return title_info.attrib.get("type") is None

            value["titleInfo"] = (
                TagGroup(tag, group)
                .filter(only_standard_title)
                .is_singleton()
                .has_no_attributes()
                .descend(raise_errors)
            )
        elif tag == "{http://www.loc.gov/mods/v3}title":
            value["title"] = (
                TagGroup(tag, group).is_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}partName":
            value["partName"] = (
                TagGroup(tag, group).is_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}subTitle":
            value["subTitle"] = (
                TagGroup(tag, group).force_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}note":
            # This could be useful if distinguished by type attribute.
            pass
        elif tag == "{http://www.loc.gov/mods/v3}part":
            pass
        elif tag == "{http://www.loc.gov/mods/v3}abstract":
            value["abstract"] = TagGroup(tag, group).has_no_attributes().text()
        elif tag == "{http://www.loc.gov/mods/v3}subject":
            authorities = {e.attrib.get("authority") for e in group}
            for authority in authorities:
                k = (
                    "subject-{}".format(authority)
                    if authority is not None
                    else "subject"
                )
                sub_group = [e for e in group if e.attrib.get("authority") == authority]
                value[k] = (
                    TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
                )
        elif tag == "{http://www.loc.gov/mods/v3}topic":
            TagGroup(tag, group).text_set()
        elif tag == "{http://www.loc.gov/mods/v3}cartographics":
            pass
        elif tag == "{http://www.loc.gov/mods/v3}geographic":
            TagGroup(tag, group).text_set()
        elif tag == "{http://www.loc.gov/mods/v3}temporal":
            TagGroup(tag, group).text_set()
        elif tag == "{http://www.loc.gov/mods/v3}genre":
            authorities = {e.attrib.get("authority") for e in group}
            for authority in authorities:
                k = "genre-{}".format(authority) if authority is not None else "genre"
                value[k] = {
                    e.text for e in group if e.attrib.get("authority") == authority
                }
        elif tag == "{http://www.loc.gov/mods/v3}language":
            value["language"] = TagGroup(tag, group).merge_sub_tags_to_set()
        elif tag == "{http://www.loc.gov/mods/v3}languageTerm":
            value["languageTerm"] = (
                TagGroup(tag, group)
                .has_attributes({"authority": "iso639-2b", "type": "code"})
                .text_set()
            )
        elif tag == "{http://www.loc.gov/mods/v3}scriptTerm":
            value["scriptTerm"] = (
                TagGroup(tag, group)
                .fix_script_term()
                .has_attributes({"authority": "iso15924", "type": "code"})
                .text_set()
            )
        elif tag == "{http://www.loc.gov/mods/v3}relatedItem":
            tag_group = TagGroup(tag, group)
            for type_, grouped_group in sorted_groupby(
                tag_group.group, key=lambda g: g.attrib["type"]
            ):
                sub_tag = "relatedItem-{}".format(type_)
                grouped_group = list(grouped_group)
                if type_ in ["original", "host"]:
                    value[sub_tag] = (
                        TagGroup(sub_tag, grouped_group)
                        .is_singleton()
                        .descend(raise_errors)
                    )
                else:
                    # TODO type="series"
                    pass
        elif tag == "{http://www.loc.gov/mods/v3}name":
            for n, e in enumerate(group):
                value["name{}".format(n)] = mods_to_dict(e, raise_errors)
        elif tag == "{http://www.loc.gov/mods/v3}role":
            value["role"] = (
                TagGroup(tag, group).has_no_attributes().merge_sub_tags_to_set()
            )
        elif tag == "{http://www.loc.gov/mods/v3}roleTerm":
            value["roleTerm"] = (
                TagGroup(tag, group)
                .has_attributes({"authority": "marcrelator", "type": "code"})
                .text_set()
            )
        elif tag == "{http://www.loc.gov/mods/v3}namePart":
            for e in group:
                if not e.attrib.get("type"):
                    value["namePart"] = e.text
                else:
                    value["namePart-{}".format(e.attrib["type"])] = e.text
        elif tag == "{http://www.loc.gov/mods/v3}nameIdentifier":
            # TODO Use this (e.g. <mods:nameIdentifier type="ppn">106168096</mods:nameIdentifier>)
            # or the mods:name@valueURI to disambiguate
            pass
        elif tag == "{http://www.loc.gov/mods/v3}displayForm":
            value["displayForm"] = (
                TagGroup(tag, group).is_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}physicalDescription":
            pass
        elif tag == "{http://www.loc.gov/mods/v3}extension":
            pass
        elif tag == "{http://www.loc.gov/mods/v3}accessCondition":
            for e in group:
                if not e.attrib.get("type"):
                    raise ValueError(
                        "Unknown attributes for accessCondition {}".format(e.attrib)
                    )
                value["accessCondition-{}".format(e.attrib["type"])] = e.text
        elif tag == "{http://www.loc.gov/mods/v3}typeOfResource":
            value["typeOfResource"] = (
                TagGroup(tag, group).is_singleton().has_no_attributes().text()
            )
        elif tag == "{http://www.loc.gov/mods/v3}mods":
            # XXX Ignore nested mods:mods for now (used in mods:subject)
            pass
        else:
            if raise_errors:
                raise ValueError('Unknown tag "{}"'.format(tag))
            else:
                pass

    return value
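
# Example (illustrative): for a mods:mods element containing
#   <mods:titleInfo><mods:title>Berlin</mods:title></mods:titleInfo>
# mods_to_dict() returns {"titleInfo": {"title": "Berlin"}}; flatten() from
# .lib later joins the nested keys into column names like "titleInfo_title".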


def mets_to_dict(mets, raise_errors=True):
    """Convert METS metadata to a nested dictionary"""

    # The approach taken here is to handle each element explicitly. This also
    # means that ignored elements are ignored explicitly.

    value = {}

    # Iterate through each group of tags
    for tag, group in sorted_groupby(mets, key=attrgetter("tag")):
        group = list(group)

        # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
        # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
        if tag == "{http://www.loc.gov/METS/}amdSec":
            pass  # TODO
        elif tag == "{http://www.loc.gov/METS/}dmdSec":
            pass  # TODO
        elif tag == "{http://www.loc.gov/METS/}metsHdr":
            pass  # TODO
        elif tag == "{http://www.loc.gov/METS/}structLink":
            pass  # TODO
        elif tag == "{http://www.loc.gov/METS/}structMap":
            pass  # TODO
        elif tag == "{http://www.loc.gov/METS/}fileSec":
            value["fileSec"] = TagGroup(tag, group).is_singleton().descend(raise_errors)
        elif tag == "{http://www.loc.gov/METS/}fileGrp":
            for e in group:
                use = e.attrib.get("USE")
                if not use:
                    raise ValueError("No USE attribute for fileGrp {}".format(e))
                value[f"fileGrp-{use}-count"] = len(e)
        else:
            if raise_errors:
                print(value)
                raise ValueError('Unknown tag "{}"'.format(tag))
            else:
                pass

    return value
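
# Example (illustrative): a mets:fileSec containing
#   <mets:fileGrp USE="DEFAULT"> ... 300 mets:file children ... </mets:fileGrp>
# yields {"fileSec": {"fileGrp-DEFAULT-count": 300}}, which process() later
# flattens and prefixes into the column "mets_fileSec_fileGrp-DEFAULT-count".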


def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
    # TODO replace asserts by ValueError

    result = []

    # PPN
    def get_mets_recordIdentifier(*, source="gbv-ppn"):
        return (
            mets.xpath(
                f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
                namespaces=ns,
            )
            or [None]
        )[0].text
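
    # Illustrative: for <mods:recordIdentifier source="gbv-ppn">PPN1234</mods:recordIdentifier>
    # this returns "PPN1234". If no matching element exists, the [None]
    # fallback makes the trailing .text raise AttributeError, which the
    # per-file exception handler in process() turns into a logged error.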

    ppn = get_mets_recordIdentifier()

    # Getting per-page/structure information is a bit different
    structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
    structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
    fileSec = mets.find("./mets:fileSec", ns)
    if structMap_PHYSICAL is None:
        # This is expected in a multivolume work or periodical!
        if any(
            structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
            for t in [
                "multivolume_work",
                "MultivolumeWork",
                "multivolume_manuscript",
                "periodical",
            ]
        ):
            return []
        else:
            raise ValueError(
                "No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)"
            )
    if structMap_LOGICAL is None:
        raise ValueError("No structMap[@TYPE='LOGICAL'] found")
    if fileSec is None:
        raise ValueError("No fileSec found")

    div_physSequence = structMap_PHYSICAL[0]
    assert div_physSequence.attrib.get("TYPE") == "physSequence"

    # Build a look-up table to get the mets:file elements by @ID.
    # This cuts retrieving the mets:file down to half the time.
    mets_file_by_ID = {}

    def _init_mets_file_by_ID():
        for f in fileSec.iterfind("./mets:fileGrp/mets:file", ns):
            mets_file_by_ID[f.attrib.get("ID")] = f

    _init_mets_file_by_ID()

    def get_mets_file(*, ID):
        if ID:
            return mets_file_by_ID[ID]

    def get_mets_div(*, ID):
        if ID:
            return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)

    for page in div_physSequence:
        # TODO sort by ORDER?
        assert page.attrib.get("TYPE") == "page"
        page_dict = {}
        page_dict["ppn"] = ppn
        page_dict["ID"] = page.attrib.get("ID")
        for fptr in page:
            assert fptr.tag == "{http://www.loc.gov/METS/}fptr"
            file_id = fptr.attrib.get("FILEID")
            assert file_id

            file_ = get_mets_file(ID=file_id)
            assert file_ is not None
            fileGrp_USE = file_.getparent().attrib.get("USE")
            file_FLocat_href = (
                file_.xpath("mets:FLocat/@xlink:href", namespaces=ns) or [None]
            )[0]
            if file_FLocat_href is not None:
                file_FLocat_href = str(file_FLocat_href)
            page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href

        def get_struct_log(*, to_phys):
            """
            Get the logical structMap elements that link to the given physical page.

            Keyword arguments:
            to_phys -- ID of the page, as per structMap[@TYPE="PHYSICAL"]
            """

            # This is all XLink, there might be a more generic way to traverse
            # the links. However, currently, it suffices to do this the
            # old-fashioned way.

            sm_links = mets.findall(
                f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
            )

            targets = []
            for sm_link in sm_links:
                xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
                targets.extend(get_mets_div(ID=xlink_from))
            return targets

        struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))

        # In our documents, there are already links to parent elements, but we
        # want to make sure and add them.
        def get_struct_log_parents(div):
            cursor = div
            while (cursor := cursor.getparent()).tag == f"{{{ns['mets']}}}div":
                yield cursor
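
        # Illustrative: for a page linked to a chapter div nested inside a
        # monograph div, this also yields the monograph div, stopping at the
        # first ancestor that is not a mets:div (the structMap element itself).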

        struct_divs_to_add = set()
        for struct_div in struct_divs:
            struct_divs_to_add.update(get_struct_log_parents(struct_div))
        struct_divs.update(struct_divs_to_add)

        # Populate structure type indicator variables
        for struct_div in struct_divs:
            type_ = struct_div.attrib.get("TYPE").lower()
            assert type_
            page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = True

        result.append(page_dict)

    return result
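
# Example (illustrative sketch) of one entry in the returned list:
#
#   {
#       "ppn": "PPN1234",
#       "ID": "PHYS_0001",
#       "fileGrp_DEFAULT_file_FLocat_href": "https://example.org/0001.jpg",
#       "structMap-LOGICAL_TYPE_monograph": True,
#       "structMap-LOGICAL_TYPE_title_page": True,
#   }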


@click.command()
@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1)
@click.option(
    "--output",
    "-o",
    "output_file",
    type=click.Path(),
    help="Output Parquet file",
    default="mods_info_df.parquet",
    show_default=True,
)
@click.option(
    "--output-page-info", type=click.Path(), help="Output page info Parquet file"
)
def process_command(mets_files: list[str], output_file: str, output_page_info: str):
    """
    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.

    INPUT is assumed to be a METS document with MODS metadata. INPUT may
    optionally be a directory; the tool then reads all files in the directory.

    mods4pandas writes two output files: a pandas DataFrame (as Parquet) and a
    CSV file with all conversion warnings.

    Per-page information (e.g. structure information) can be output to a
    separate Parquet file.
    """
    process(mets_files, output_file, output_page_info)


def process(mets_files: list[str], output_file: str, output_page_info: str):
    # Extend file list if directories are given
    mets_files_real: list[str] = []
    for m in mets_files:
        if os.path.isdir(m):
            logger.info("Scanning directory {}".format(m))
            mets_files_real.extend(
                f.path
                for f in tqdm(os.scandir(m), leave=False)
                if f.is_file() and not f.name.startswith(".")
            )
        else:
            mets_files_real.append(m)

    # Prepare output files
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file)
    output_file_sqlite3 = output_file + ".sqlite3"
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file_sqlite3)

    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
    con = sqlite3.connect(output_file_sqlite3)

    if output_page_info:
        output_page_info_sqlite3 = output_page_info + ".sqlite3"
        logger.info("Writing SQLite DB to {}".format(output_page_info_sqlite3))
        with contextlib.suppress(FileNotFoundError):
            os.remove(output_page_info_sqlite3)
        con_page_info = sqlite3.connect(output_page_info_sqlite3)

    # Process METS files
    with open(output_file + ".warnings.csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile)
        logger.info("Processing METS files")
        for mets_file in tqdm(mets_files_real, leave=True):
            try:
                root = ET.parse(mets_file).getroot()
                mets = root  # XXX .find('mets:mets', ns) does not work here
                mods = root.find("mets:dmdSec//mods:mods", ns)

                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter("always")  # do NOT filter double occurrences

                    # MODS
                    d = flatten(mods_to_dict(mods, raise_errors=True))
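
                    # Illustrative: flatten() joins nested keys with "_", so
                    # {"recordInfo": {"recordIdentifier": "PPN1234"}} becomes
                    # the column "recordInfo_recordIdentifier" used below.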

                    # METS
                    d_mets = flatten(mets_to_dict(mets, raise_errors=True))
                    for k, v in d_mets.items():
                        d[f"mets_{k}"] = v
                    # "meta"
                    d["mets_file"] = mets_file

                    # Save
                    insert_into_db(con, "mods_info", d)
                    con.commit()

                    # METS - per-page
                    if output_page_info:
                        page_info_doc: list[dict] = pages_to_dict(
                            mets, raise_errors=True
                        )
                        insert_into_db_multiple(
                            con_page_info, "page_info", page_info_doc
                        )
                        con_page_info.commit()

                if caught_warnings:
                    # PyCharm thinks caught_warnings is not Iterable:
                    # noinspection PyTypeChecker
                    for caught_warning in caught_warnings:
                        csvwriter.writerow([mets_file, caught_warning.message])
            except Exception:
                logger.exception("Exception in {}".format(mets_file))

    logger.info("Writing DataFrame to {}".format(output_file))
    convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
    if output_page_info:
        logger.info("Writing DataFrame to {}".format(output_page_info))
        convert_db_to_parquet(
            con_page_info, "page_info", ["ppn", "ID"], output_page_info
        )


def main():
    logging.basicConfig(level=logging.INFO)

    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)

    process_command()


if __name__ == "__main__":
    main()