Make get_mets_file a lot faster by using find() instead of xpath()

master
Gerber, Mike 1 year ago
parent 16a3a3bcc8
commit 90c60ebb80

@ -273,6 +273,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
div_physSequence = structMap_PHYSICAL[0] div_physSequence = structMap_PHYSICAL[0]
assert div_physSequence.attrib.get("TYPE") == "physSequence" assert div_physSequence.attrib.get("TYPE") == "physSequence"
def get_mets_file(*, ID):
if ID:
file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]')
return file_
for page in div_physSequence: for page in div_physSequence:
# TODO sort by ORDER? # TODO sort by ORDER?
@ -285,12 +291,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
file_id = fptr.attrib.get("FILEID") file_id = fptr.attrib.get("FILEID")
assert file_id assert file_id
def get_mets_file(*, ID):
if ID:
file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0]
return file_
file_ = get_mets_file(ID=file_id) file_ = get_mets_file(ID=file_id)
assert file_ is not None
fileGrp_USE = file_.getparent().attrib.get("USE") fileGrp_USE = file_.getparent().attrib.get("USE")
file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href

Loading…
Cancel
Save