From 448639b05bb3142c3fa86773399e1065f41fdff1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 11:35:24 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Make=20gettstruct=5Flog=20faster=20?= =?UTF-8?q?by=20using=20precise=20predicates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 32728a3..09dd9c4 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -266,9 +266,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: ppn = get_mets_recordIdentifier() # Getting per-page/structure information is a bit different - structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0] - if not structMap_PHYSICAL: + structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) + structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) + if structMap_PHYSICAL is None: raise ValueError("No structMap[@TYPE='PHYSICAL'] found") + if structMap_LOGICAL is None: + raise ValueError("No structMap[@TYPE='LOGICAL'] found") div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" @@ -278,6 +281,9 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') return file_ + def get_mets_div(*, ID): + if ID: + return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns) for page in div_physSequence: @@ -315,7 +321,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: targets = [] for sm_link in sm_links: xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") - targets.extend(mets.findall(f'//mets:div[@ID="{xlink_from}"]', ns)) + targets.extend(get_mets_div(ID=xlink_from)) return targets struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))