From 912e5d2b4a8e7c90b2c73389eb0f2d449975c402 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 11:40:45 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fstruct=5Flog=20faster?= =?UTF-8?q?=20by=20using=20precise=20predicates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 09dd9c4..75cf03c 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -268,17 +268,20 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # Getting per-page/structure information is a bit different structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) + fileSec = mets.find('./mets:fileSec', ns) if structMap_PHYSICAL is None: raise ValueError("No structMap[@TYPE='PHYSICAL'] found") if structMap_LOGICAL is None: raise ValueError("No structMap[@TYPE='LOGICAL'] found") + if fileSec is None: + raise ValueError("No fileSec found") div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" def get_mets_file(*, ID): if ID: - file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') + file_ = fileSec.find(f'./mets:fileGrp/mets:file[@ID="{ID}"]', ns) return file_ def get_mets_div(*, ID):