mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-08-14 03:59:53 +02:00
✨ Be more flexible about recordIdentifiers
This commit is contained in:
parent
0855ccb66b
commit
2af30598bd
1 changed file with 31 additions and 20 deletions
|
@ -169,21 +169,25 @@ def mods_to_dict(mods, raise_errors=True):
|
||||||
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
|
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
|
||||||
# however, in mods:relatedItems, there may be source="dnb-ppn",
|
# however, in mods:relatedItems, there may be source="dnb-ppn",
|
||||||
# which we need to distinguish by using a separate field name.
|
# which we need to distinguish by using a separate field name.
|
||||||
try:
|
|
||||||
value["recordIdentifier"] = (
|
for field_name, source in \
|
||||||
TagGroup(tag, group)
|
("recordIdentifier", "gbv-ppn"), \
|
||||||
.filter(no_uuid)
|
("recordIdentifier-dnb-ppn", "dnb-ppn"), \
|
||||||
.is_singleton()
|
("recordIdentifier-zdb", "zdb"):
|
||||||
.has_attributes({"source": "gbv-ppn"})
|
try:
|
||||||
.text()
|
value[field_name] = (
|
||||||
)
|
TagGroup(tag, group)
|
||||||
except ValueError:
|
.filter(no_uuid)
|
||||||
value["recordIdentifier-dnb-ppn"] = (
|
.fix_recordIdentifier_source_zdb()
|
||||||
TagGroup(tag, group)
|
.is_singleton()
|
||||||
.is_singleton()
|
.has_attributes({"source": source})
|
||||||
.has_attributes({"source": "dnb-ppn"})
|
.text()
|
||||||
.text()
|
)
|
||||||
)
|
break
|
||||||
|
except ValueError as e:
|
||||||
|
pass
|
||||||
|
if field_name not in value:
|
||||||
|
raise ValueError("Unknown recordIdentifier found")
|
||||||
elif tag == "{http://www.loc.gov/mods/v3}identifier":
|
elif tag == "{http://www.loc.gov/mods/v3}identifier":
|
||||||
for e in group:
|
for e in group:
|
||||||
if len(e.attrib) != 1:
|
if len(e.attrib) != 1:
|
||||||
|
@ -634,11 +638,18 @@ def process(mets_files: list[str], output_file: str, output_page_info: str, mets
|
||||||
logger.exception("Exception in {}".format(mets_file))
|
logger.exception("Exception in {}".format(mets_file))
|
||||||
|
|
||||||
logger.info("Writing DataFrame to {}".format(output_file))
|
logger.info("Writing DataFrame to {}".format(output_file))
|
||||||
try:
|
considered_indexes = ("recordInfo_recordIdentifier", "recordIdentifier-zdb")
|
||||||
convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
|
success = False
|
||||||
except:
|
for considered_index in considered_indexes:
|
||||||
# FIXME: Fix missing mods:recordInfo instead, https://github.com/qurator-spk/mods4pandas/issues/60
|
try:
|
||||||
convert_db_to_parquet(con, "mods_info", "recordIdentifier", output_file)
|
convert_db_to_parquet(con, "mods_info", considered_index, output_file)
|
||||||
|
success = True
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
if not success:
|
||||||
|
raise ValueError(f"None of {considered_indexes} found")
|
||||||
|
|
||||||
if output_page_info:
|
if output_page_info:
|
||||||
logger.info("Writing DataFrame to {}".format(output_page_info))
|
logger.info("Writing DataFrame to {}".format(output_page_info))
|
||||||
convert_db_to_parquet(
|
convert_db_to_parquet(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue