From 8fc4eeeb3bf6178f766f2993fcd12716bff8ab04 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 12:05:20 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fmets=5Ffile=20faster?= =?UTF-8?q?=20by=20using=20a=20lookup=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 75cf03c..4b45148 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -279,10 +279,18 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + + # Build a look-up table to get mets:file by @ID + # This cuts retrieving the mets:file down to half the time. + mets_file_by_ID = {} + def _init_mets_file_by_ID(): + for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns): + mets_file_by_ID[f.attrib.get("ID")] = f + _init_mets_file_by_ID() + def get_mets_file(*, ID): if ID: - file_ = fileSec.find(f'./mets:fileGrp/mets:file[@ID="{ID}"]', ns) - return file_ + return mets_file_by_ID[ID] def get_mets_div(*, ID): if ID: