✨ Be more flexible about recordIdentifiers

2026-02-11 20:11:56 +01:00 · 2025-08-08 12:06:35 +02:00 · 2025-08-08 12:06:35 +02:00 · 2af30598bd
commit 2af30598bd
parent 0855ccb66b
1 changed file with 31 additions and 20 deletions
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -169,21 +169,25 @@ def mods_to_dict(mods, raise_errors=True):
            # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
            # however, in mods:relatedItems, there may be source="dnb-ppns",
            # which we need to distinguish by using a separate field name.
-            try:
+
-                value["recordIdentifier"] = (
+            for field_name, source in \
-                    TagGroup(tag, group)
+                ("recordIdentifier",         "gbv-ppn"), \
-                    .filter(no_uuid)
+                ("recordIdentifier-dnb-ppn", "dnb-ppn"), \
-                    .is_singleton()
+                ("recordIdentifier-zdb",     "zdb"):
-                    .has_attributes({"source": "gbv-ppn"})
+                try:
-                    .text()
+                    value[field_name] = (
-                )
+                        TagGroup(tag, group)
-            except ValueError:
+                        .filter(no_uuid)
-                value["recordIdentifier-dnb-ppn"] = (
+                        .fix_recordIdentifier_source_zdb()
-                    TagGroup(tag, group)
+                        .is_singleton()
-                    .is_singleton()
+                        .has_attributes({"source": source})
-                    .has_attributes({"source": "dnb-ppn"})
+                        .text()
-                    .text()
+                    )
-                )
+                    break
+                except ValueError as e:
+                    pass
+            if field_name not in value:
+                raise ValueError("Unknown recordIdentifier found")
        elif tag == "{http://www.loc.gov/mods/v3}identifier":
            for e in group:
                if len(e.attrib) != 1:
@@ -634,11 +638,18 @@ def process(mets_files: list[str], output_file: str, output_page_info: str, mets
                logger.exception("Exception in {}".format(mets_file))
    logger.info("Writing DataFrame to {}".format(output_file))
-    try:
+    considered_indexes = ("recordInfo_recordIdentifier", "recordIdentifier-zdb")
-        convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
+    success = False
-    except:
+    for considered_index in considered_indexes:
-        # FIXME: Fix missing mods:recordInfo instead, https://github.com/qurator-spk/mods4pandas/issues/60
+        try:
-        convert_db_to_parquet(con, "mods_info", "recordIdentifier", output_file)
+            convert_db_to_parquet(con, "mods_info", considered_index, output_file)
+            success = True
+            break
+        except:
+            pass
+    if not success:
+        raise ValueError(f"None of {considered_indexes} found")
    if output_page_info:
        logger.info("Writing DataFrame to {}".format(output_page_info))
        convert_db_to_parquet(