1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-08-14 03:59:53 +02:00

Be more flexible about recordIdentifiers

This commit is contained in:
Gerber, Mike 2025-08-08 12:06:35 +02:00
parent 0855ccb66b
commit 2af30598bd

View file

@ -169,21 +169,25 @@ def mods_to_dict(mods, raise_errors=True):
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs), # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
# however, in mods:relatedItems, there may be source="dnb-ppn", # however, in mods:relatedItems, there may be source="dnb-ppn",
# which we need to distinguish by using a separate field name. # which we need to distinguish by using a separate field name.
try:
value["recordIdentifier"] = ( for field_name, source in \
TagGroup(tag, group) ("recordIdentifier", "gbv-ppn"), \
.filter(no_uuid) ("recordIdentifier-dnb-ppn", "dnb-ppn"), \
.is_singleton() ("recordIdentifier-zdb", "zdb"):
.has_attributes({"source": "gbv-ppn"}) try:
.text() value[field_name] = (
) TagGroup(tag, group)
except ValueError: .filter(no_uuid)
value["recordIdentifier-dnb-ppn"] = ( .fix_recordIdentifier_source_zdb()
TagGroup(tag, group) .is_singleton()
.is_singleton() .has_attributes({"source": source})
.has_attributes({"source": "dnb-ppn"}) .text()
.text() )
) break
except ValueError as e:
pass
if field_name not in value:
raise ValueError("Unknown recordIdentifier found")
elif tag == "{http://www.loc.gov/mods/v3}identifier": elif tag == "{http://www.loc.gov/mods/v3}identifier":
for e in group: for e in group:
if len(e.attrib) != 1: if len(e.attrib) != 1:
@ -634,11 +638,18 @@ def process(mets_files: list[str], output_file: str, output_page_info: str, mets
logger.exception("Exception in {}".format(mets_file)) logger.exception("Exception in {}".format(mets_file))
logger.info("Writing DataFrame to {}".format(output_file)) logger.info("Writing DataFrame to {}".format(output_file))
try: considered_indexes = ("recordInfo_recordIdentifier", "recordIdentifier-zdb")
convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file) success = False
except: for considered_index in considered_indexes:
# FIXME: Fix missing mods:recordInfo instead, https://github.com/qurator-spk/mods4pandas/issues/60 try:
convert_db_to_parquet(con, "mods_info", "recordIdentifier", output_file) convert_db_to_parquet(con, "mods_info", considered_index, output_file)
success = True
break
except:
pass
if not success:
raise ValueError(f"None of {considered_indexes} found")
if output_page_info: if output_page_info:
logger.info("Writing DataFrame to {}".format(output_page_info)) logger.info("Writing DataFrame to {}".format(output_page_info))
convert_db_to_parquet( convert_db_to_parquet(