mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-26 12:09:55 +02:00

🎨 Reformat (Black)

Mike Gerber 2025-06-12 09:51:02 +02:00
parent 5c9858a061
commit 212df99436
7 changed files with 639 additions and 355 deletions
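All changes in this commit are mechanical restyling by the Black code formatter; behavior is unchanged. Black normalizes string literals to double quotes, splits semicolon-joined statements, and wraps lines longer than its default 88-character limit, which is why the long chained TagGroup calls below become parenthesized multi-line expressions. As a minimal illustration taken directly from the diff, a line such as:

    value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)

becomes:

    value[localname] = (
        TagGroup(tag, group)
        .is_singleton()
        .has_no_attributes()
        .descend(raise_errors)
    )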

View file

@@ -18,7 +18,14 @@ import click
 import numpy as np
 from tqdm import tqdm
 
-from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db
+from .lib import (
+    TagGroup,
+    convert_db_to_parquet,
+    sorted_groupby,
+    flatten,
+    ns,
+    insert_into_db,
+)
 
 with warnings.catch_warnings():
     # Filter warnings on WSL
@@ -27,8 +34,7 @@ with warnings.catch_warnings():
     import pandas as pd
 
-logger = logging.getLogger('alto4pandas')
+logger = logging.getLogger("alto4pandas")
 
 
 def alto_to_dict(alto, raise_errors=True):
@@ -37,56 +43,91 @@ def alto_to_dict(alto, raise_errors=True):
     value = {}
 
     # Iterate through each group of tags
-    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(alto, key=attrgetter("tag")):
         group = list(group)
         localname = ET.QName(tag).localname
         alto_namespace = ET.QName(tag).namespace
-        namespaces={"alto": alto_namespace}
-        if localname == 'Description':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'MeasurementUnit':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'OCRProcessing':
+        namespaces = {"alto": alto_namespace}
+        if localname == "Description":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "MeasurementUnit":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "OCRProcessing":
             value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
-        elif localname == 'Processing':
+        elif localname == "Processing":
             # TODO This enumerated descent is used more than once, DRY!
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'ocrProcessingStep':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "ocrProcessingStep":
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'preProcessingStep':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "preProcessingStep":
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'processingDateTime':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingSoftware':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "processingDateTime":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingSoftware":
             value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
-        elif localname == 'processingAgency':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingStepDescription':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingStepSettings':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareCreator':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareName':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareVersion':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'sourceImageInformation':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'fileName':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'fileIdentifier':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'Layout':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'Page':
+        elif localname == "processingAgency":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingStepDescription":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingStepSettings":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareCreator":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareName":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareVersion":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "sourceImageInformation":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "fileName":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "fileIdentifier":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "Layout":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "Page":
             value[localname] = {}
             value[localname].update(TagGroup(tag, group).is_singleton().attributes())
             for attr in ("WIDTH", "HEIGHT"):
@@ -96,14 +137,18 @@ def alto_to_dict(alto, raise_errors=True):
                 except ValueError:
                     del value[localname][attr]
             value[localname].update(TagGroup(tag, group).subelement_counts())
-            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
+            value[localname].update(
+                TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)
+            )
 
             # Count all alto:String elements with TAGREFS attribute
-            value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
+            value[localname].update(
+                TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces)
+            )
-        elif localname == 'Styles':
+        elif localname == "Styles":
             pass
-        elif localname == 'Tags':
+        elif localname == "Tags":
             value[localname] = {}
             value[localname].update(TagGroup(tag, group).subelement_counts())
         else:
@@ -116,13 +161,12 @@ def alto_to_dict(alto, raise_errors=True):
     return value
 
 
 def walk(m):
     # XXX do this in mods4pandas, too
     if os.path.isdir(m):
-        tqdm.write(f'Scanning directory {m}')
+        tqdm.write(f"Scanning directory {m}")
         for f in tqdm(os.scandir(m), leave=False):
-            if f.is_file() and not f.name.startswith('.'):
+            if f.is_file() and not f.name.startswith("."):
                 yield f.path
             elif f.is_dir():
                 try:
@@ -133,11 +177,17 @@ def walk(m):
         yield m.path
 
 
 @click.command()
-@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
-              default='alto_info_df.parquet', show_default=True)
+@click.argument("alto_files", type=click.Path(exists=True), required=True, nargs=-1)
+@click.option(
+    "--output",
+    "-o",
+    "output_file",
+    type=click.Path(),
+    help="Output Parquet file",
+    default="alto_info_df.parquet",
+    show_default=True,
+)
 def process_command(alto_files: List[str], output_file: str):
     """
     A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
@@ -153,6 +203,7 @@ def process_command(alto_files: List[str], output_file: str):
     process(alto_files, output_file)
 
 
 def process(alto_files: List[str], output_file: str):
    # Extend file list if directories are given
    alto_files_real = []
@@ -167,26 +218,26 @@ def process(alto_files: List[str], output_file: str):
     with contextlib.suppress(FileNotFoundError):
         os.remove(output_file_sqlite3)
-    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
     con = sqlite3.connect(output_file_sqlite3)
 
     # Process ALTO files
-    with open(output_file + '.warnings.csv', 'w') as csvfile:
+    with open(output_file + ".warnings.csv", "w") as csvfile:
         csvwriter = csv.writer(csvfile)
-        logger.info('Processing ALTO files')
+        logger.info("Processing ALTO files")
         for alto_file in tqdm(alto_files_real, leave=False):
             try:
                 root = ET.parse(alto_file).getroot()
                 alto = root  # XXX .find('alto:alto', ns) does not work here
 
                 with warnings.catch_warnings(record=True) as caught_warnings:
-                    warnings.simplefilter('always')  # do NOT filter double occurrences
+                    warnings.simplefilter("always")  # do NOT filter double occurrences
 
                     # ALTO
                     d = flatten(alto_to_dict(alto, raise_errors=True))
 
                     # "meta"
-                    d['alto_file'] = alto_file
-                    d['alto_xmlns'] = ET.QName(alto).namespace
+                    d["alto_file"] = alto_file
+                    d["alto_xmlns"] = ET.QName(alto).namespace
 
                     # Save
                     insert_into_db(con, "alto_info", d)
@@ -198,11 +249,13 @@ def process(alto_files: List[str], output_file: str):
                         for caught_warning in caught_warnings:
                             csvwriter.writerow([alto_file, caught_warning.message])
             except Exception as e:
-                logger.error('Exception in {}: {}'.format(alto_file, e))
-                import traceback; traceback.print_exc()
+                logger.error("Exception in {}: {}".format(alto_file, e))
+                import traceback
+                traceback.print_exc()
 
     # Convert the alto_info SQL to a pandas DataFrame
-    logger.info('Writing DataFrame to {}'.format(output_file))
+    logger.info("Writing DataFrame to {}".format(output_file))
     convert_db_to_parquet(con, "alto_info", "alto_file", output_file)
@@ -215,5 +268,5 @@ def main():
     process()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
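For context, a hypothetical invocation of the process() function shown above (the module path mods4pandas.alto4pandas and the input path are illustrative assumptions, not taken from this diff):

    from mods4pandas.alto4pandas import process

    # Directories in the argument list are expanded by walk(); hidden files are skipped.
    # Writes alto_info_df.parquet plus .sqlite3 and .warnings.csv side files.
    process(["/data/alto/"], "alto_info_df.parquet")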

View file

@@ -21,14 +21,13 @@ __all__ = ["ns"]
 ns = {
-    'mets': 'http://www.loc.gov/METS/',
-    'mods': 'http://www.loc.gov/mods/v3',
+    "mets": "http://www.loc.gov/METS/",
+    "mods": "http://www.loc.gov/mods/v3",
     "alto": "http://www.loc.gov/standards/alto/ns-v2",
     "xlink": "http://www.w3.org/1999/xlink",
 }
 
 
 class TagGroup:
     """Helper class to simplify the parsing and checking of MODS metadata"""
@@ -37,14 +36,14 @@ class TagGroup:
         self.group = group
 
     def to_xml(self) -> str:
-        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+        return "\n".join(str(ET.tostring(e), "utf-8").strip() for e in self.group)
 
     def __str__(self) -> str:
         return f"TagGroup with content:\n{self.to_xml()}"
 
     def is_singleton(self) -> TagGroup:
         if len(self.group) != 1:
-            raise ValueError('More than one instance: {}'.format(self))
+            raise ValueError("More than one instance: {}".format(self))
         return self
 
     def has_no_attributes(self) -> TagGroup:
@@ -54,7 +53,9 @@ class TagGroup:
         if not isinstance(attrib, Sequence):
             attrib = [attrib]
         if not all(e.attrib in attrib for e in self.group):
-            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+            raise ValueError(
+                "One or more element has unexpected attributes: {}".format(self)
+            )
         return self
 
     def ignore_attributes(self) -> TagGroup:
@@ -65,10 +66,10 @@ class TagGroup:
         self.group = sorted(self.group, key=key, reverse=reverse)
         return self
 
-    def text(self, separator='\n') -> str:
-        t = ''
+    def text(self, separator="\n") -> str:
+        t = ""
         for e in self.group:
-            if t != '':
+            if t != "":
                 t += separator
             if e.text:
                 t += e.text
@@ -87,7 +88,7 @@ class TagGroup:
                 new_group.append(e)
             else:
                 if warn:
-                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+                    warnings.warn("Filtered {} element ({})".format(self.tag, warn))
         return TagGroup(self.tag, new_group)
 
     def force_singleton(self, warn=True) -> TagGroup:
@@ -95,35 +96,38 @@ class TagGroup:
             return self
         else:
             if warn:
-                warnings.warn('Forced single instance of {}'.format(self.tag))
+                warnings.warn("Forced single instance of {}".format(self.tag))
             return TagGroup(self.tag, self.group[:1])
 
-    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
-    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+    RE_ISO8601_DATE = r"^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$"  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r"^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$"
 
     def fix_date(self) -> TagGroup:
         for e in self.group:
-            if e.attrib.get('encoding') == 'w3cdtf':
+            if e.attrib.get("encoding") == "w3cdtf":
                 # This should be 'iso8601' according to MODS-AP 2.3.1
-                warnings.warn('Changed w3cdtf encoding to iso8601')
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Changed w3cdtf encoding to iso8601")
+                e.attrib["encoding"] = "iso8601"
 
         new_group = []
         for e in self.group:
             if e.text is None:
-                warnings.warn('Empty date')
+                warnings.warn("Empty date")
                 continue
-            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
+            if e.attrib.get("encoding") == "iso8601" and re.match(
+                self.RE_ISO8601_DATE, e.text
+            ):
                 new_group.append(e)
             elif re.match(self.RE_ISO8601_DATE, e.text):
-                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Added iso8601 encoding to date {}".format(e.text))
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             elif m := re.match(self.RE_GERMAN_DATE, e.text):
-                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Converted date {} to iso8601 encoding".format(e.text))
+                e.text = "{}-{}-{}".format(
+                    m.group("yyyy"), m.group("mm"), m.group("dd")
+                )
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             else:
                 warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
@@ -146,26 +150,30 @@ class TagGroup:
         # Fix this for special cases.
         for e in self.group:
-            if e.attrib.get('eventType') is None:
+            if e.attrib.get("eventType") is None:
                 try:
-                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
-                       e.find('mods:edition', ns).text == '[Electronic ed.]':
-                        e.attrib['eventType'] = 'digitization'
-                        warnings.warn('Fixed eventType for electronic ed.')
+                    if (
+                        e.find("mods:publisher", ns).text.startswith(
+                            "Staatsbibliothek zu Berlin"
+                        )
+                        and e.find("mods:edition", ns).text == "[Electronic ed.]"
+                    ):
+                        e.attrib["eventType"] = "digitization"
+                        warnings.warn("Fixed eventType for electronic ed.")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateIssued', ns) is not None:
-                        e.attrib['eventType'] = 'publication'
-                        warnings.warn('Fixed eventType for an issued origin')
+                    if e.find("mods:dateIssued", ns) is not None:
+                        e.attrib["eventType"] = "publication"
+                        warnings.warn("Fixed eventType for an issued origin")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateCreated', ns) is not None:
-                        e.attrib['eventType'] = 'production'
-                        warnings.warn('Fixed eventType for a created origin')
+                    if e.find("mods:dateCreated", ns) is not None:
+                        e.attrib["eventType"] = "production"
+                        warnings.warn("Fixed eventType for a created origin")
                         continue
                 except AttributeError:
                     pass
@@ -174,13 +182,14 @@ class TagGroup:
     def fix_script_term(self) -> TagGroup:
         for e in self.group:
             # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
-            if e.attrib['authority'] == 'ISO15924':
-                e.attrib['authority'] = 'iso15924'
-                warnings.warn('Changed scriptTerm authority to lower case')
+            if e.attrib["authority"] == "ISO15924":
+                e.attrib["authority"] = "iso15924"
+                warnings.warn("Changed scriptTerm authority to lower case")
         return self
 
     def merge_sub_tags_to_set(self) -> dict:
         from .mods4pandas import mods_to_dict
 
         value = {}
         sub_dicts = [mods_to_dict(e) for e in self.group]
@@ -230,6 +239,7 @@ class TagGroup:
         Extract values using the given XPath expression, convert them to float and return descriptive
         statistics on the values.
         """
+
         def xpath_values():
             values = []
             for e in self.group:
@@ -240,11 +250,11 @@ class TagGroup:
         values = xpath_values()
         statistics = {}
         if values.size > 0:
-            statistics[f'{xpath_expr}-mean'] = np.mean(values)
-            statistics[f'{xpath_expr}-median'] = np.median(values)
-            statistics[f'{xpath_expr}-std'] = np.std(values)
-            statistics[f'{xpath_expr}-min'] = np.min(values)
-            statistics[f'{xpath_expr}-max'] = np.max(values)
+            statistics[f"{xpath_expr}-mean"] = np.mean(values)
+            statistics[f"{xpath_expr}-median"] = np.median(values)
+            statistics[f"{xpath_expr}-std"] = np.std(values)
+            statistics[f"{xpath_expr}-min"] = np.min(values)
+            statistics[f"{xpath_expr}-max"] = np.max(values)
         return statistics
 
     def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]:
@@ -256,11 +266,10 @@ class TagGroup:
             r = e.xpath(xpath_expr, namespaces=namespaces)
             values += r
 
-        counts = {f'{xpath_expr}-count': len(values)}
+        counts = {f"{xpath_expr}-count": len(values)}
         return counts
 
 
 def sorted_groupby(iterable, key=None):
     """
     Sort iterable by key and then group by the same key.
@@ -291,7 +300,7 @@ def _to_dict(root, raise_errors):
         raise ValueError(f"Unknown namespace {root_name.namespace}")
 
 
-def flatten(d: MutableMapping, parent='', separator='_') -> dict:
+def flatten(d: MutableMapping, parent="", separator="_") -> dict:
     """
     Flatten the given nested dict.
@@ -314,11 +323,12 @@ def flatten(d: MutableMapping, parent='', separator='_') -> dict:
 
 def valid_column_key(k) -> bool:
-    if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k):
+    if re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k):
        return True
     else:
        return False
 
 
 def column_names_csv(columns) -> str:
     """
     Format Column names (identifiers) as a comma-separated list.
@@ -327,9 +337,11 @@ def column_names_csv(columns) -> str:
     """
     return ",".join('"' + c + '"' for c in columns)
 
 
 current_columns: dict[str, list] = defaultdict(list)
 current_columns_types: dict[str, dict] = defaultdict(dict)
 
 
 def insert_into_db(con, table, d: Dict):
     """Insert the values from the dict into the table, creating columns if necessary"""
@@ -338,7 +350,9 @@ def insert_into_db(con, table, d: Dict):
         for k in d.keys():
             assert valid_column_key(k), f'"{k}" is not a valid column name'
             current_columns[table].append(k)
-        con.execute(f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})")
+        con.execute(
+            f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})"
+        )
 
     # Add columns if necessary
     for k in d.keys():
@@ -361,13 +375,15 @@ def insert_into_db(con, table, d: Dict):
         f"( {column_names_csv(columns)} )"
         "VALUES"
         f"( {','.join('?' for c in columns)} )",
-        [str(d[c]) for c in columns]
+        [str(d[c]) for c in columns],
     )
 
 
 def insert_into_db_multiple(con, table, ld: List[Dict]):
     for d in ld:
         insert_into_db(con, table, d)
 
 
 def convert_db_to_parquet(con, table, index_col, output_file):
     df = pd.read_sql_query(f"SELECT * FROM {table}", con, index_col)
@@ -386,6 +402,8 @@ def convert_db_to_parquet(con, table, index_col, output_file):
         elif column_type == "set":
             df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
         else:
-            raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
+            raise NotImplementedError(
+                f"Column {c}: type {column_type} not implemented yet."
+            )
 
     df.to_parquet(output_file)
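The date handling in fix_date() above rests on the two class-level regular expressions. A standalone sketch of the German-date branch (regexes copied verbatim from the diff; the TagGroup machinery and warnings are omitted):

    import re

    RE_GERMAN_DATE = r"^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$"

    m = re.match(RE_GERMAN_DATE, "24.12.1900")
    if m:
        # Same reordering as fix_date(): dd.mm.yyyy -> yyyy-mm-dd
        print("{}-{}-{}".format(m.group("yyyy"), m.group("mm"), m.group("dd")))  # 1900-12-24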

View file

@@ -17,7 +17,16 @@ from collections.abc import MutableMapping, Sequence
 import click
 from tqdm import tqdm
 
-from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types
+from .lib import (
+    convert_db_to_parquet,
+    sorted_groupby,
+    TagGroup,
+    ns,
+    flatten,
+    insert_into_db,
+    insert_into_db_multiple,
+    current_columns_types,
+)
 
 with warnings.catch_warnings():
     # Filter warnings on WSL
@@ -26,7 +35,8 @@ with warnings.catch_warnings():
     import pandas as pd
 
-logger = logging.getLogger('mods4pandas')
+logger = logging.getLogger("mods4pandas")
 
 
 def mods_to_dict(mods, raise_errors=True):
     """Convert MODS metadata to a nested dictionary"""
@@ -37,179 +47,290 @@ def mods_to_dict(mods, raise_errors=True):
     value = {}
 
     # Iterate through each group of tags
-    for tag, group in sorted_groupby(mods, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(mods, key=attrgetter("tag")):
         group = list(group)
-        if tag == '{http://www.loc.gov/mods/v3}location':
+        if tag == "{http://www.loc.gov/mods/v3}location":
             def only_current_location(location):
-                return location.get('type') != 'former'
-            value['location'] = TagGroup(tag, group) \
-                .filter(only_current_location) \
-                .has_attributes([{}, {'type': 'current'}]) \
-                .is_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}physicalLocation':
+                return location.get("type") != "former"
+
+            value["location"] = (
+                TagGroup(tag, group)
+                .filter(only_current_location)
+                .has_attributes([{}, {"type": "current"}])
+                .is_singleton()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}physicalLocation":
             def no_display_label(physical_location):
-                return physical_location.get('displayLabel') is None
-            value['physicalLocation'] = TagGroup(tag, group).filter(no_display_label).text()
-        elif tag == '{http://www.loc.gov/mods/v3}shelfLocator':
+                return physical_location.get("displayLabel") is None
+
+            value["physicalLocation"] = (
+                TagGroup(tag, group).filter(no_display_label).text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}shelfLocator":
             # This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain
             # a second element with empty text and a "displayLabel" attribute set.
             def no_display_label(shelf_locator):
-                return shelf_locator.get('displayLabel') is None
-            value['shelfLocator'] = TagGroup(tag, group) \
-                .filter(no_display_label) \
-                .force_singleton() \
-                .has_no_attributes() \
-                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}originInfo':
+                return shelf_locator.get("displayLabel") is None
+
+            value["shelfLocator"] = (
+                TagGroup(tag, group)
+                .filter(no_display_label)
+                .force_singleton()
+                .has_no_attributes()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}originInfo":
             def has_event_type(origin_info):
                 # According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some
                 # are empty and not fixable.
-                return origin_info.attrib.get('eventType') is not None
-            tag_group = TagGroup(tag, group).fix_event_type().filter(has_event_type, warn="has no eventType")
-            for event_type, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['eventType']):
+                return origin_info.attrib.get("eventType") is not None
+
+            tag_group = (
+                TagGroup(tag, group)
+                .fix_event_type()
+                .filter(has_event_type, warn="has no eventType")
+            )
+            for event_type, grouped_group in sorted_groupby(
+                tag_group.group, key=lambda g: g.attrib["eventType"]
+            ):
                 for n, e in enumerate(grouped_group):
-                    value['originInfo-{}{}'.format(event_type, n)] = mods_to_dict(e, raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}place':
-            value['place'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}placeTerm':
-            value['placeTerm'] = TagGroup(tag, group).is_singleton().has_attributes({'type': 'text'}).text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateIssued':
-            value['dateIssued'] = TagGroup(tag, group) \
-                .fix_date() \
-                .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
-                .ignore_attributes() \
-                .force_singleton() \
-                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateCreated':
-            value['dateCreated'] = TagGroup(tag, group) \
-                .fix_date() \
-                .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
-                .ignore_attributes() \
-                .force_singleton() \
-                .text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateCaptured':
-            value['dateCaptured'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
-        elif tag == '{http://www.loc.gov/mods/v3}dateOther':
-            value['dateOther'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
-        elif tag == '{http://www.loc.gov/mods/v3}publisher':
-            value['publisher'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}edition':
-            value['edition'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}classification':
-            authorities = {e.attrib['authority'] for e in group}
+                    value["originInfo-{}{}".format(event_type, n)] = mods_to_dict(
+                        e, raise_errors
+                    )
+        elif tag == "{http://www.loc.gov/mods/v3}place":
+            value["place"] = (
+                TagGroup(tag, group)
+                .force_singleton(warn=False)
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}placeTerm":
+            value["placeTerm"] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_attributes({"type": "text"})
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateIssued":
+            value["dateIssued"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
+                .ignore_attributes()
+                .force_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateCreated":
+            value["dateCreated"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
+                .ignore_attributes()
+                .force_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateCaptured":
+            value["dateCaptured"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .ignore_attributes()
+                .is_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}dateOther":
+            value["dateOther"] = (
+                TagGroup(tag, group)
+                .fix_date()
+                .ignore_attributes()
+                .is_singleton()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}publisher":
+            value["publisher"] = (
+                TagGroup(tag, group)
+                .force_singleton(warn=False)
+                .has_no_attributes()
+                .text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}edition":
+            value["edition"] = (
+                TagGroup(tag, group).force_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}classification":
+            authorities = {e.attrib["authority"] for e in group}
             for authority in authorities:
-                sub_group = [e for e in group if e.attrib.get('authority') == authority]
-                value['classification-{}'.format(authority)] = TagGroup(tag, sub_group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}recordInfo':
-            value['recordInfo'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}recordIdentifier':
+                sub_group = [e for e in group if e.attrib.get("authority") == authority]
+                value["classification-{}".format(authority)] = TagGroup(
+                    tag, sub_group
+                ).text_set()
+        elif tag == "{http://www.loc.gov/mods/v3}recordInfo":
+            value["recordInfo"] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}recordIdentifier":
             # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
             # however, in mods:relatedItems, there may be source="dnb-ppns",
             # which we need to distinguish by using a separate field name.
             try:
-                value['recordIdentifier'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'gbv-ppn'}).text()
+                value["recordIdentifier"] = (
+                    TagGroup(tag, group)
+                    .is_singleton()
+                    .has_attributes({"source": "gbv-ppn"})
+                    .text()
+                )
             except ValueError:
-                value['recordIdentifier-dnb-ppn'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'dnb-ppn'}).text()
-        elif tag == '{http://www.loc.gov/mods/v3}identifier':
+                value["recordIdentifier-dnb-ppn"] = (
+                    TagGroup(tag, group)
+                    .is_singleton()
+                    .has_attributes({"source": "dnb-ppn"})
+                    .text()
+                )
+        elif tag == "{http://www.loc.gov/mods/v3}identifier":
             for e in group:
                 if len(e.attrib) != 1:
-                    raise ValueError('Unknown attributes for identifier {}'.format(e.attrib))
-                value['identifier-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}titleInfo':
+                    raise ValueError(
+                        "Unknown attributes for identifier {}".format(e.attrib)
+                    )
+                value["identifier-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}titleInfo":
             def only_standard_title(title_info):
-                return title_info.attrib.get('type') is None
-            value['titleInfo'] = TagGroup(tag, group) \
-                .filter(only_standard_title) \
-                .is_singleton().has_no_attributes().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}title':
-            value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}partName':
-            value['partName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}subTitle':
-            value['subTitle'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}note':
+                return title_info.attrib.get("type") is None
+
+            value["titleInfo"] = (
+                TagGroup(tag, group)
+                .filter(only_standard_title)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}title":
+            value["title"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}partName":
+            value["partName"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}subTitle":
+            value["subTitle"] = (
+                TagGroup(tag, group).force_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}note":
             # This could be useful if distinguished by type attribute.
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}part':
+        elif tag == "{http://www.loc.gov/mods/v3}part":
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}abstract':
-            value['abstract'] = TagGroup(tag, group).has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}subject':
-            authorities = {e.attrib.get('authority') for e in group}
+        elif tag == "{http://www.loc.gov/mods/v3}abstract":
+            value["abstract"] = TagGroup(tag, group).has_no_attributes().text()
+        elif tag == "{http://www.loc.gov/mods/v3}subject":
+            authorities = {e.attrib.get("authority") for e in group}
             for authority in authorities:
-                k = 'subject-{}'.format(authority) if authority is not None else 'subject'
-                sub_group = [e for e in group if e.attrib.get('authority') == authority]
-                value[k] = TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}topic':
+                k = (
+                    "subject-{}".format(authority)
+                    if authority is not None
+                    else "subject"
+                )
+                sub_group = [e for e in group if e.attrib.get("authority") == authority]
+                value[k] = (
+                    TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
+                )
+        elif tag == "{http://www.loc.gov/mods/v3}topic":
             TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}cartographics':
+        elif tag == "{http://www.loc.gov/mods/v3}cartographics":
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}geographic':
+        elif tag == "{http://www.loc.gov/mods/v3}geographic":
             TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}temporal':
+        elif tag == "{http://www.loc.gov/mods/v3}temporal":
             TagGroup(tag, group).text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}genre':
-            authorities = {e.attrib.get('authority') for e in group}
+        elif tag == "{http://www.loc.gov/mods/v3}genre":
+            authorities = {e.attrib.get("authority") for e in group}
             for authority in authorities:
-                k = 'genre-{}'.format(authority) if authority is not None else 'genre'
-                value[k] = {e.text for e in group if e.attrib.get('authority') == authority}
-        elif tag == '{http://www.loc.gov/mods/v3}language':
-            value["language"] = TagGroup(tag, group) \
-                .merge_sub_tags_to_set()
-        elif tag == '{http://www.loc.gov/mods/v3}languageTerm':
-            value['languageTerm'] = TagGroup(tag, group) \
-                .has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \
-                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}scriptTerm':
-            value['scriptTerm'] = TagGroup(tag, group) \
-                .fix_script_term() \
-                .has_attributes({'authority': 'iso15924', 'type': 'code'}) \
-                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}relatedItem':
+                k = "genre-{}".format(authority) if authority is not None else "genre"
+                value[k] = {
+                    e.text for e in group if e.attrib.get("authority") == authority
+                }
+        elif tag == "{http://www.loc.gov/mods/v3}language":
+            value["language"] = TagGroup(tag, group).merge_sub_tags_to_set()
+        elif tag == "{http://www.loc.gov/mods/v3}languageTerm":
+            value["languageTerm"] = (
+                TagGroup(tag, group)
+                .has_attributes({"authority": "iso639-2b", "type": "code"})
+                .text_set()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}scriptTerm":
+            value["scriptTerm"] = (
+                TagGroup(tag, group)
+                .fix_script_term()
+                .has_attributes({"authority": "iso15924", "type": "code"})
+                .text_set()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}relatedItem":
             tag_group = TagGroup(tag, group)
-            for type_, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['type']):
-                sub_tag = 'relatedItem-{}'.format(type_)
+            for type_, grouped_group in sorted_groupby(
+                tag_group.group, key=lambda g: g.attrib["type"]
+            ):
+                sub_tag = "relatedItem-{}".format(type_)
                 grouped_group = list(grouped_group)
                 if type_ in ["original", "host"]:
-                    value[sub_tag] = TagGroup(sub_tag, grouped_group).is_singleton().descend(raise_errors)
+                    value[sub_tag] = (
+                        TagGroup(sub_tag, grouped_group)
+                        .is_singleton()
+                        .descend(raise_errors)
+                    )
                 else:
                     # TODO type="series"
                     pass
-        elif tag == '{http://www.loc.gov/mods/v3}name':
+        elif tag == "{http://www.loc.gov/mods/v3}name":
             for n, e in enumerate(group):
-                value['name{}'.format(n)] = mods_to_dict(e, raise_errors)
-        elif tag == '{http://www.loc.gov/mods/v3}role':
-            value["role"] = TagGroup(tag, group) \
-                .has_no_attributes() \
-                .merge_sub_tags_to_set()
-        elif tag == '{http://www.loc.gov/mods/v3}roleTerm':
-            value['roleTerm'] = TagGroup(tag, group) \
-                .has_attributes({'authority': 'marcrelator', 'type': 'code'}) \
-                .text_set()
-        elif tag == '{http://www.loc.gov/mods/v3}namePart':
+                value["name{}".format(n)] = mods_to_dict(e, raise_errors)
+        elif tag == "{http://www.loc.gov/mods/v3}role":
+            value["role"] = (
+                TagGroup(tag, group).has_no_attributes().merge_sub_tags_to_set()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}roleTerm":
+            value["roleTerm"] = (
+                TagGroup(tag, group)
+                .has_attributes({"authority": "marcrelator", "type": "code"})
+                .text_set()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}namePart":
             for e in group:
-                if not e.attrib.get('type'):
-                    value['namePart'] = e.text
+                if not e.attrib.get("type"):
+                    value["namePart"] = e.text
                 else:
-                    value['namePart-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}nameIdentifier':
+                    value["namePart-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}nameIdentifier":
             # TODO Use this (e.g. <mods:nameIdentifier type="ppn">106168096</mods:nameIdentifier>) or the
             # mods:name@valueURI to disambiguate
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}displayForm':
-            value['displayForm'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}physicalDescription':
+        elif tag == "{http://www.loc.gov/mods/v3}displayForm":
+            value["displayForm"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}physicalDescription":
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}extension':
+        elif tag == "{http://www.loc.gov/mods/v3}extension":
             pass
-        elif tag == '{http://www.loc.gov/mods/v3}accessCondition':
+        elif tag == "{http://www.loc.gov/mods/v3}accessCondition":
             for e in group:
-                if not e.attrib.get('type'):
-                    raise ValueError('Unknown attributes for accessCondition {}'.format(e.attrib))
-                value['accessCondition-{}'.format(e.attrib['type'])] = e.text
-        elif tag == '{http://www.loc.gov/mods/v3}typeOfResource':
-            value['typeOfResource'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif tag == '{http://www.loc.gov/mods/v3}mods':
+                if not e.attrib.get("type"):
+                    raise ValueError(
+                        "Unknown attributes for accessCondition {}".format(e.attrib)
+                    )
+                value["accessCondition-{}".format(e.attrib["type"])] = e.text
+        elif tag == "{http://www.loc.gov/mods/v3}typeOfResource":
+            value["typeOfResource"] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif tag == "{http://www.loc.gov/mods/v3}mods":
             # XXX Ignore nested mods:mods for now (used in mods:subject)
             pass
         else:
@@ -230,30 +351,29 @@ def mets_to_dict(mets, raise_errors=True):
     value = {}
 
     # Iterate through each group of tags
-    for tag, group in sorted_groupby(mets, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(mets, key=attrgetter("tag")):
         group = list(group)
 
         # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
         # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
-        if tag == '{http://www.loc.gov/METS/}amdSec':
+        if tag == "{http://www.loc.gov/METS/}amdSec":
             pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}dmdSec':
+        elif tag == "{http://www.loc.gov/METS/}dmdSec":
             pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}metsHdr':
+        elif tag == "{http://www.loc.gov/METS/}metsHdr":
             pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}structLink':
+        elif tag == "{http://www.loc.gov/METS/}structLink":
             pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}structMap':
+        elif tag == "{http://www.loc.gov/METS/}structMap":
             pass  # TODO
-        elif tag == '{http://www.loc.gov/METS/}fileSec':
-            value['fileSec'] = TagGroup(tag, group) \
-                .is_singleton().descend(raise_errors)
-        elif tag == '{http://www.loc.gov/METS/}fileGrp':
+        elif tag == "{http://www.loc.gov/METS/}fileSec":
+            value["fileSec"] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif tag == "{http://www.loc.gov/METS/}fileGrp":
             for e in group:
-                use = e.attrib.get('USE')
+                use = e.attrib.get("USE")
                 if not use:
-                    raise ValueError('No USE attribute for fileGrp {}'.format(e))
-                value[f'fileGrp-{use}-count'] = len(e)
+                    raise ValueError("No USE attribute for fileGrp {}".format(e))
+                value[f"fileGrp-{use}-count"] = len(e)
         else:
             if raise_errors:
                 print(value)
@@ -262,6 +382,7 @@ def mets_to_dict(mets, raise_errors=True):
             pass
 
     return value
 
 
 def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
     # TODO replace asserts by ValueError
@@ -269,23 +390,36 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 
     # PPN
     def get_mets_recordIdentifier(*, source="gbv-ppn"):
-        return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
-                           namespaces=ns) or [None])[0].text
+        return (
+            mets.xpath(
+                f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
+                namespaces=ns,
+            )
+            or [None]
+        )[0].text
 
     ppn = get_mets_recordIdentifier()
 
     # Getting per-page/structure information is a bit different
     structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
     structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
-    fileSec = mets.find('./mets:fileSec', ns)
+    fileSec = mets.find("./mets:fileSec", ns)
     if structMap_PHYSICAL is None:
         # This is expected in a multivolume work or periodical!
         if any(
             structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
-            for t in ["multivolume_work", "MultivolumeWork", "multivolume_manuscript", "periodical"]
+            for t in [
+                "multivolume_work",
+                "MultivolumeWork",
+                "multivolume_manuscript",
+                "periodical",
+            ]
         ):
             return []
         else:
-            raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
+            raise ValueError(
+                "No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)"
+            )
     if structMap_LOGICAL is None:
         raise ValueError("No structMap[@TYPE='LOGICAL'] found")
     if fileSec is None:
@@ -294,13 +428,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
     div_physSequence = structMap_PHYSICAL[0]
     assert div_physSequence.attrib.get("TYPE") == "physSequence"
 
     # Build a look-up table to get mets:file by @ID
     # This cuts retrieving the mets:file down to half the time.
     mets_file_by_ID = {}
 
     def _init_mets_file_by_ID():
-        for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns):
+        for f in fileSec.iterfind("./mets:fileGrp/mets:file", ns):
             mets_file_by_ID[f.attrib.get("ID")] = f
 
     _init_mets_file_by_ID()
 
     def get_mets_file(*, ID):
@@ -312,7 +447,6 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
         return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)
 
     for page in div_physSequence:
-
         # TODO sort by ORDER?
         assert page.attrib.get("TYPE") == "page"
         page_dict = {}
@@ -326,7 +460,9 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
             file_ = get_mets_file(ID=file_id)
             assert file_ is not None
             fileGrp_USE = file_.getparent().attrib.get("USE")
-            file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
+            file_FLocat_href = (
+                file_.xpath("mets:FLocat/@xlink:href", namespaces=ns) or [None]
+            )[0]
             if file_FLocat_href is not None:
                 file_FLocat_href = str(file_FLocat_href)
             page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
@@ -343,7 +479,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
         # it suffices to do this the old-fashioned way.
         sm_links = mets.findall(
             f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
         )
 
         targets = []
@@ -378,10 +514,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 
 @click.command()
-@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
-              default='mods_info_df.parquet', show_default=True)
-@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
+@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1)
+@click.option(
+    "--output",
+    "-o",
+    "output_file",
+    type=click.Path(),
+    help="Output Parquet file",
+    default="mods_info_df.parquet",
+    show_default=True,
+)
+@click.option(
+    "--output-page-info", type=click.Path(), help="Output page info Parquet file"
+)
 def process_command(mets_files: list[str], output_file: str, output_page_info: str):
     """
     A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@@ -395,18 +540,21 @@ def process_command(mets_files: list[str], output_file: str, output_page_info: str):
     """
     process(mets_files, output_file, output_page_info)
 
 
 def process(mets_files: list[str], output_file: str, output_page_info: str):
     # Extend file list if directories are given
     mets_files_real: list[str] = []
     for m in mets_files:
         if os.path.isdir(m):
-            logger.info('Scanning directory {}'.format(m))
-            mets_files_real.extend(f.path for f in tqdm(os.scandir(m), leave=False)
-                                   if f.is_file() and not f.name.startswith('.'))
+            logger.info("Scanning directory {}".format(m))
+            mets_files_real.extend(
+                f.path
+                for f in tqdm(os.scandir(m), leave=False)
+                if f.is_file() and not f.name.startswith(".")
+            )
         else:
             mets_files_real.append(m)
 
     # Prepare output files
     with contextlib.suppress(FileNotFoundError):
         os.remove(output_file)
@@ -414,28 +562,28 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
     with contextlib.suppress(FileNotFoundError):
         os.remove(output_file_sqlite3)
-    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
     con = sqlite3.connect(output_file_sqlite3)
     if output_page_info:
         output_page_info_sqlite3 = output_page_info + ".sqlite3"
-        logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3))
+        logger.info("Writing SQLite DB to {}".format(output_page_info_sqlite3))
         with contextlib.suppress(FileNotFoundError):
             os.remove(output_page_info_sqlite3)
        con_page_info = sqlite3.connect(output_page_info_sqlite3)
 
     # Process METS files
-    with open(output_file + '.warnings.csv', 'w') as csvfile:
+    with open(output_file + ".warnings.csv", "w") as csvfile:
         csvwriter = csv.writer(csvfile)
-        logger.info('Processing METS files')
+        logger.info("Processing METS files")
        for mets_file in tqdm(mets_files_real, leave=True):
             try:
                 root = ET.parse(mets_file).getroot()
                 mets = root  # XXX .find('mets:mets', ns) does not work here
-                mods = root.find('mets:dmdSec//mods:mods', ns)
+                mods = root.find("mets:dmdSec//mods:mods", ns)
 
                 with warnings.catch_warnings(record=True) as caught_warnings:
-                    warnings.simplefilter('always')  # do NOT filter double occurrences
+                    warnings.simplefilter("always")  # do NOT filter double occurrences
 
                     # MODS
                     d = flatten(mods_to_dict(mods, raise_errors=True))
@@ -445,7 +593,7 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                     for k, v in d_mets.items():
                         d[f"mets_{k}"] = v
                     # "meta"
-                    d['mets_file'] = mets_file
+                    d["mets_file"] = mets_file
 
                     # Save
                     insert_into_db(con, "mods_info", d)
@@ -453,8 +601,12 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                     # METS - per-page
                     if output_page_info:
-                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
-                        insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
+                        page_info_doc: list[dict] = pages_to_dict(
+                            mets, raise_errors=True
+                        )
+                        insert_into_db_multiple(
+                            con_page_info, "page_info", page_info_doc
+                        )
                         con_page_info.commit()
 
                 if caught_warnings:
@@ -463,13 +615,15 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                         for caught_warning in caught_warnings:
                             csvwriter.writerow([mets_file, caught_warning.message])
             except Exception as e:
-                logger.exception('Exception in {}'.format(mets_file))
+                logger.exception("Exception in {}".format(mets_file))
 
-    logger.info('Writing DataFrame to {}'.format(output_file))
+    logger.info("Writing DataFrame to {}".format(output_file))
     convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
     if output_page_info:
-        logger.info('Writing DataFrame to {}'.format(output_page_info))
-        convert_db_to_parquet(con_page_info, "page_info", ["ppn", "ID"], output_page_info)
+        logger.info("Writing DataFrame to {}".format(output_page_info))
+        convert_db_to_parquet(
+            con_page_info, "page_info", ["ppn", "ID"], output_page_info
+        )
 
 
 def main():
@@ -481,5 +635,5 @@ def main():
     process_command()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
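A hypothetical invocation of the process() function defined above (the module path mods4pandas.mods4pandas and the file locations are illustrative assumptions):

    from mods4pandas.mods4pandas import process

    # Scans the directory for METS files and writes mods_info_df.parquet;
    # the third argument additionally produces a per-page Parquet file.
    process(["/data/mets/"], "mods_info_df.parquet", "page_info_df.parquet")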

View file

@ -9,14 +9,17 @@ from mods4pandas.lib import flatten
TESTS_DATA_DIR = Path(__file__).parent / "data" TESTS_DATA_DIR = Path(__file__).parent / "data"
def dict_fromstring(x): def dict_fromstring(x):
return flatten(alto_to_dict(ET.fromstring(x))) return flatten(alto_to_dict(ET.fromstring(x)))
def test_Page_counts(): def test_Page_counts():
""" """
Elements below Layout/Page should be counted Elements below Layout/Page should be counted
""" """
d = dict_fromstring(""" d = dict_fromstring(
"""
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#"> <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Layout> <Layout>
<Page ID="Page1" PHYSICAL_IMG_NR="1"> <Page ID="Page1" PHYSICAL_IMG_NR="1">
@ -37,13 +40,16 @@ def test_Page_counts():
</Page> </Page>
</Layout> </Layout>
</alto> </alto>
""") """
assert d['Layout_Page_TextBlock-count'] == 1 )
assert d['Layout_Page_TextLine-count'] == 3 assert d["Layout_Page_TextBlock-count"] == 1
assert d['Layout_Page_String-count'] == 6 assert d["Layout_Page_TextLine-count"] == 3
assert d["Layout_Page_String-count"] == 6
def test_Tags_counts(): def test_Tags_counts():
d = dict_fromstring(""" d = dict_fromstring(
"""
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#"> <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Tags> <Tags>
<NamedEntityTag ID="PER0" LABEL="Pentlings"/> <NamedEntityTag ID="PER0" LABEL="Pentlings"/>
@@ -57,11 +63,14 @@ def test_Tags_counts():
<NamedEntityTag ID="PER10" LABEL="Jhesu Christi"/> <NamedEntityTag ID="PER10" LABEL="Jhesu Christi"/>
</Tags> </Tags>
</alto> </alto>
""") """
assert d['Tags_NamedEntityTag-count'] == 9 )
assert d["Tags_NamedEntityTag-count"] == 9
def test_String_TAGREF_counts(): def test_String_TAGREF_counts():
d = dict_fromstring(""" d = dict_fromstring(
"""
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#"> <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Layout> <Layout>
<Page> <Page>
@@ -80,9 +89,10 @@ def test_String_TAGREF_counts():
</Page> </Page>
</Layout> </Layout>
</alto> </alto>
""") """
assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3 )
assert d['Layout_Page_String-count'] == 4 assert d["Layout_Page_//alto:String[@TAGREFS]-count"] == 3
assert d["Layout_Page_String-count"] == 4
def test_dtypes(tmp_path): def test_dtypes(tmp_path):
@@ -100,9 +110,9 @@ def test_dtypes(tmp_path):
r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
r".*-count": ("Int64", None), r".*-count": ("Int64", None),
r"alto_xmlns": ("object", ["str", "NoneType"]), r"alto_xmlns": ("object", ["str", "NoneType"]),
r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
} }
def expected_types(c): def expected_types(c):
"""Return the expected types for column c.""" """Return the expected types for column c."""
for r, types in EXPECTED_TYPES.items(): for r, types in EXPECTED_TYPES.items():
@@ -126,7 +136,8 @@ def test_dtypes(tmp_path):
if edt == "object": if edt == "object":
inner_types = set(type(v).__name__ for v in df[c]) inner_types = set(type(v).__name__ for v in df[c])
assert all(it in einner_types for it in inner_types), \ assert all(
f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})" it in einner_types for it in inner_types
), f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
check_types(alto_info_df) check_types(alto_info_df)
@@ -6,15 +6,17 @@ from mods4pandas.lib import flatten
def dict_fromstring(x): def dict_fromstring(x):
"""Helper function to parse a METS/MODS XML string to a flattened dict""" """Helper function to parse a METS/MODS XML string to a flattened dict"""
return flatten(mets_to_dict(ET.fromstring(x))) return flatten(mets_to_dict(ET.fromstring(x)))
# XXX move to test lib # XXX move to test lib
def test_fileGrp(): def test_fileGrp():
""" """
Elements of mets:fileGrp should be counted Elements of mets:fileGrp should be counted
""" """
d = dict_fromstring(""" d = dict_fromstring(
"""
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> <mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:fileSec> <mets:fileSec>
@@ -31,5 +33,6 @@ def test_fileGrp():
</mets:fileGrp> </mets:fileGrp>
</mets:fileSec> </mets:fileSec>
</mets:mets> </mets:mets>
""") """
assert d['fileSec_fileGrp-PRESENTATION-count'] == 3 )
assert d["fileSec_fileGrp-PRESENTATION-count"] == 3
@@ -10,36 +10,45 @@ from mods4pandas.lib import flatten
TESTS_DATA_DIR = Path(__file__).parent / "data" TESTS_DATA_DIR = Path(__file__).parent / "data"
def dict_fromstring(x): def dict_fromstring(x):
"""Helper function to parse a MODS XML string to a flattened dict""" """Helper function to parse a MODS XML string to a flattened dict"""
return flatten(mods_to_dict(ET.fromstring(x))) return flatten(mods_to_dict(ET.fromstring(x)))
def test_single_language_languageTerm(): def test_single_language_languageTerm():
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:language> <mods:language>
<mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm> <mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm>
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm> <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
</mods:language> </mods:language>
</mods:mods> </mods:mods>
""") """
assert d['language_languageTerm'] == {'ger', 'lat'} )
assert d["language_languageTerm"] == {"ger", "lat"}
def test_multitple_language_languageTerm(): def test_multitple_language_languageTerm():
""" """
Different languages MAY have multiple mods:language elements. Different languages MAY have multiple mods:language elements.
See MODS-AP 2.3.1 See MODS-AP 2.3.1
""" """
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:language><mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm></mods:language> <mods:language><mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm></mods:language>
<mods:language><mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm></mods:language> <mods:language><mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm></mods:language>
</mods:mods> </mods:mods>
""") """
assert d['language_languageTerm'] == {'ger', 'lat'} )
assert d["language_languageTerm"] == {"ger", "lat"}
def test_role_roleTerm(): def test_role_roleTerm():
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669"> <mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669">
<mods:displayForm>Wurm, Mary</mods:displayForm> <mods:displayForm>Wurm, Mary</mods:displayForm>
@@ -51,14 +60,17 @@ def test_role_roleTerm():
</mods:role> </mods:role>
</mods:name> </mods:name>
</mods:mods> </mods:mods>
""") """
assert d['name0_role_roleTerm'] == {'cmp'} )
assert d["name0_role_roleTerm"] == {"cmp"}
def test_multiple_role_roleTerm(): def test_multiple_role_roleTerm():
""" """
Multiple mods:role/mods:roleTerm should be merged into one column. Multiple mods:role/mods:roleTerm should be merged into one column.
""" """
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669"> <mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669">
<mods:displayForm>Wurm, Mary</mods:displayForm> <mods:displayForm>Wurm, Mary</mods:displayForm>
@@ -73,8 +85,10 @@ def test_multiple_role_roleTerm():
</mods:role> </mods:role>
</mods:name> </mods:name>
</mods:mods> </mods:mods>
""") """
assert d['name0_role_roleTerm'] == {'cmp', 'aut'} )
assert d["name0_role_roleTerm"] == {"cmp", "aut"}
def test_scriptTerm(): def test_scriptTerm():
""" """
@@ -82,7 +96,8 @@ def test_scriptTerm():
See MODS-AP 2.3.1. See MODS-AP 2.3.1.
""" """
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:language> <mods:language>
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm> <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
@@ -94,44 +109,59 @@ def test_scriptTerm():
<mods:scriptTerm authority="iso15924" type="code">216</mods:scriptTerm> <mods:scriptTerm authority="iso15924" type="code">216</mods:scriptTerm>
</mods:language> </mods:language>
</mods:mods> </mods:mods>
""") """
assert d['language_scriptTerm'] == {'215', '216', '217'} )
assert d["language_scriptTerm"] == {"215", "216", "217"}
def test_recordInfo(): def test_recordInfo():
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:recordInfo> <mods:recordInfo>
<mods:recordIdentifier source="gbv-ppn">PPN610714341</mods:recordIdentifier> <mods:recordIdentifier source="gbv-ppn">PPN610714341</mods:recordIdentifier>
</mods:recordInfo> </mods:recordInfo>
</mods:mods> </mods:mods>
""") """
assert d['recordInfo_recordIdentifier'] == 'PPN610714341' )
assert d["recordInfo_recordIdentifier"] == "PPN610714341"
def test_accessCondition(): def test_accessCondition():
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition> <mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition>
</mods:mods> </mods:mods>
""") """
assert d['accessCondition-use and reproduction'] == 'UNKNOWN' )
assert d["accessCondition-use and reproduction"] == "UNKNOWN"
def test_originInfo_no_event_type(): def test_originInfo_no_event_type():
with pytest.warns(UserWarning) as ws: with pytest.warns(UserWarning) as ws:
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:originInfo> <mods:originInfo>
<mods:place><mods:placeTerm type="text">Berlin</mods:placeTerm></mods:place> <mods:place><mods:placeTerm type="text">Berlin</mods:placeTerm></mods:place>
</mods:originInfo> </mods:originInfo>
</mods:mods> </mods:mods>
""") """
)
assert d == {} # empty assert d == {} # empty
assert len(ws) == 1 assert len(ws) == 1
assert ws[0].message.args[0] == 'Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)' assert (
ws[0].message.args[0]
== "Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)"
)
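The same warning surfaces outside pytest through the catch_warnings block that process() uses; a minimal sketch, assuming a MODS string without an eventType:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # do NOT filter double occurrences
    mods_to_dict(ET.fromstring(mods_xml_without_event_type))  # hypothetical input
for w in caught:
    print(w.message)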
def test_relatedItem(): def test_relatedItem():
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:relatedItem type="original"> <mods:relatedItem type="original">
<mods:recordInfo> <mods:recordInfo>
@@ -139,12 +169,14 @@ def test_relatedItem():
</mods:recordInfo> </mods:recordInfo>
</mods:relatedItem> </mods:relatedItem>
</mods:mods> </mods:mods>
""") """
)
assert d['relatedItem-original_recordInfo_recordIdentifier'] == 'PPN167755803' assert d["relatedItem-original_recordInfo_recordIdentifier"] == "PPN167755803"
# mods:relatedItem may also have source="dnb-ppn" recordIdentifiers: # mods:relatedItem may also have source="dnb-ppn" recordIdentifiers:
d = dict_fromstring(""" d = dict_fromstring(
"""
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3"> <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:relatedItem type="original"> <mods:relatedItem type="original">
<mods:recordInfo> <mods:recordInfo>
@@ -152,12 +184,16 @@ def test_relatedItem():
</mods:recordInfo> </mods:recordInfo>
</mods:relatedItem> </mods:relatedItem>
</mods:mods> </mods:mods>
""") """
)
assert d["relatedItem-original_recordInfo_recordIdentifier-dnb-ppn"] == "1236513355"
assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355'
def test_dtypes(tmp_path): def test_dtypes(tmp_path):
mets_files = [p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")] mets_files = [
p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")
]
mods_info_df_parquet = (tmp_path / "test_dtypes_mods_info.parquet").as_posix() mods_info_df_parquet = (tmp_path / "test_dtypes_mods_info.parquet").as_posix()
page_info_df_parquet = (tmp_path / "test_dtypes_page_info.parquet").as_posix() page_info_df_parquet = (tmp_path / "test_dtypes_page_info.parquet").as_posix()
process(mets_files, mods_info_df_parquet, page_info_df_parquet) process(mets_files, mods_info_df_parquet, page_info_df_parquet)
@@ -166,7 +202,6 @@ def test_dtypes(tmp_path):
EXPECTED_TYPES = { EXPECTED_TYPES = {
# mods_info # mods_info
r"mets_file": ("object", ["str"]), r"mets_file": ("object", ["str"]),
r"titleInfo_title": ("object", ["str"]), r"titleInfo_title": ("object", ["str"]),
r"titleInfo_subTitle": ("object", ["str", "NoneType"]), r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
@@ -179,19 +214,16 @@ def test_dtypes(tmp_path):
r"typeOfResource": ("object", ["str", "NoneType"]), r"typeOfResource": ("object", ["str", "NoneType"]),
r"accessCondition-.*": ("object", ["str", "NoneType"]), r"accessCondition-.*": ("object", ["str", "NoneType"]),
r"originInfo-.*": ("object", ["str", "NoneType"]), r"originInfo-.*": ("object", ["str", "NoneType"]),
r".*-count": ("Int64", None), r".*-count": ("Int64", None),
r"genre-.*": ("object", ["ndarray", "NoneType"]), r"genre-.*": ("object", ["ndarray", "NoneType"]),
r"subject-.*": ("object", ["ndarray", "NoneType"]), r"subject-.*": ("object", ["ndarray", "NoneType"]),
r"language_.*Term": ("object", ["ndarray", "NoneType"]), r"language_.*Term": ("object", ["ndarray", "NoneType"]),
r"classification-.*": ("object", ["ndarray", "NoneType"]), r"classification-.*": ("object", ["ndarray", "NoneType"]),
# page_info # page_info
r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]), r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]),
r"structMap-LOGICAL_TYPE_.*": ("boolean", None), r"structMap-LOGICAL_TYPE_.*": ("boolean", None),
} }
def expected_types(c): def expected_types(c):
"""Return the expected types for column c.""" """Return the expected types for column c."""
for r, types in EXPECTED_TYPES.items(): for r, types in EXPECTED_TYPES.items():
@@ -215,8 +247,9 @@ def test_dtypes(tmp_path):
if edt == "object": if edt == "object":
inner_types = set(type(v).__name__ for v in df[c]) inner_types = set(type(v).__name__ for v in df[c])
assert all(it in einner_types for it in inner_types), \ assert all(
f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})" it in einner_types for it in inner_types
), f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
check_types(mods_info_df) check_types(mods_info_df)
check_types(page_info_df) check_types(page_info_df)
@@ -10,8 +10,8 @@ TESTS_DATA_DIR = Path(__file__).parent / "data"
def removeprefix(s, prefix): def removeprefix(s, prefix):
if sys.version_info < (3,9): if sys.version_info < (3, 9):
return s[len(prefix):] if s.startswith(prefix) else s return s[len(prefix) :] if s.startswith(prefix) else s
else: else:
return s.removeprefix(prefix) return s.removeprefix(prefix)
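A quick usage note for this shim (it mirrors str.removeprefix, to which it delegates on Python >= 3.9):

# Example, matching its use in test_page_info below:
assert removeprefix("structMap-LOGICAL_TYPE_title_page", "structMap-LOGICAL_TYPE_") == "title_page"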
@@ -26,20 +26,32 @@ def test_page_info():
assert all(p["ppn"] == "PPN821507109" for p in page_info) assert all(p["ppn"] == "PPN821507109" for p in page_info)
# Look closer at an interesting page # Look closer at an interesting page
from pprint import pprint; pprint(page_info[0]) from pprint import pprint
pprint(page_info[0])
page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005") page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")
assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif" assert (
page_info_page["fileGrp_PRESENTATION_file_FLocat_href"]
== "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"
)
# This is a title page with an illustration; check that we correctly got this info from the # This is a title page with an illustration; check that we correctly got this info from the
# structMap. # structMap.
struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) struct_types = sorted(
removeprefix(k, "structMap-LOGICAL_TYPE_")
for k, v in page_info_page.items()
if k.startswith("structMap-LOGICAL_TYPE_") and v == 1
)
assert struct_types == ["illustration", "monograph", "title_page"] assert struct_types == ["illustration", "monograph", "title_page"]
def test_page_info_multivolume_work(): def test_page_info_multivolume_work():
"""Test creation of page_info for multivolume_work""" """Test creation of page_info for multivolume_work"""
mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml") mets = ET.parse(
TESTS_DATA_DIR
/ "mets-mods"
/ "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml"
)
page_info = pages_to_dict(mets) page_info = pages_to_dict(mets)
assert page_info == [] assert page_info == []