From 90c60ebb80f2a72e5ea1ea9edfc83def9172622e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 10:24:38 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fmets=5Ffile=20a=20lot?= =?UTF-8?q?=20faster=20by=20using=20find()=20instead=20of=20xpath()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 9993fa9..16934e7 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -273,6 +273,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + def get_mets_file(*, ID): + if ID: + file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') + return file_ + + for page in div_physSequence: # TODO sort by ORDER? @@ -285,12 +291,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_id = fptr.attrib.get("FILEID") assert file_id - def get_mets_file(*, ID): - if ID: - file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0] - return file_ - file_ = get_mets_file(ID=file_id) + assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href