diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 9993fa9..16934e7 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -273,6 +273,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + def get_mets_file(*, ID): + if ID: + file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') + return file_ + + for page in div_physSequence: # TODO sort by ORDER? @@ -285,12 +291,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_id = fptr.attrib.get("FILEID") assert file_id - def get_mets_file(*, ID): - if ID: - file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0] - return file_ - file_ = get_mets_file(ID=file_id) + assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href