From 8c269b35a488244635eff26cc6728d13c45a1c66 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 8 Dec 2023 15:58:59 +0100 Subject: [PATCH 01/12] =?UTF-8?q?=E2=9C=94=20Test=20creation=20of=20page?= =?UTF-8?q?=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...507109.xml => PPN821507109-1361-pages.xml} | 0 qurator/mods4pandas/tests/test_page_info.py | 29 +++++++++++++++++++ 2 files changed, 29 insertions(+) rename qurator/mods4pandas/tests/data/mets-mods/{sbb-mets-PPN821507109.xml => PPN821507109-1361-pages.xml} (100%) mode change 100755 => 100644 create mode 100644 qurator/mods4pandas/tests/test_page_info.py diff --git a/qurator/mods4pandas/tests/data/mets-mods/sbb-mets-PPN821507109.xml b/qurator/mods4pandas/tests/data/mets-mods/PPN821507109-1361-pages.xml old mode 100755 new mode 100644 similarity index 100% rename from qurator/mods4pandas/tests/data/mets-mods/sbb-mets-PPN821507109.xml rename to qurator/mods4pandas/tests/data/mets-mods/PPN821507109-1361-pages.xml diff --git a/qurator/mods4pandas/tests/test_page_info.py b/qurator/mods4pandas/tests/test_page_info.py new file mode 100644 index 0000000..441230b --- /dev/null +++ b/qurator/mods4pandas/tests/test_page_info.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from lxml import etree as ET + +from qurator.mods4pandas.mods4pandas import pages_to_dict + + +TESTS_DATA_DIR = Path(__file__).parent / "data" + + +def test_page_info(): + """Test creation of page_info""" + mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml") + page_info = pages_to_dict(mets) + + # We have 1361 pages for this one work. 
+ assert len(page_info) == 1361 + assert all(p["ppn"] == "PPN821507109" for p in page_info) + + # Look closer at an interesting page + from pprint import pprint; pprint(page_info[0]) + page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005") + + assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif" + + # This is a title page with an illustration, check that we correctly got this info from the + # structMap. + struct_types = sorted(k.removeprefix("structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) + assert struct_types == ["illustration", "monograph", "title_page"] From 8d0dc72ca222b984cf0e8547cd5ea0f0549b0488 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 8 Dec 2023 16:28:45 +0100 Subject: [PATCH 02/12] =?UTF-8?q?=E2=9C=94=20Enable/document=20profiling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-DEV.md | 16 ++++++++++++++++ requirements-test.txt | 1 + 2 files changed, 17 insertions(+) create mode 100644 README-DEV.md diff --git a/README-DEV.md b/README-DEV.md new file mode 100644 index 0000000..6d1f17d --- /dev/null +++ b/README-DEV.md @@ -0,0 +1,16 @@ +``` +pip install -r requirements-test.txt +``` + +To run tests: +``` +pytest +``` + +To run a test with profiling: + +1. Make sure graphviz is installed +2. 
Run pytest with profiling enabled: + ``` + pytest --profile-svg -k test_page_info + ``` diff --git a/requirements-test.txt b/requirements-test.txt index e079f8a..6f0f369 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1 +1,2 @@ pytest +pytest-profiling From 16a3a3bcc8645627ef691e61e15a08d22854e22e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 09:21:10 +0100 Subject: [PATCH 03/12] =?UTF-8?q?=E2=9C=94=20Fix=20tests=20on=20Python=203?= =?UTF-8?q?.8=20by=20backporting=20removeprefix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/tests/test_page_info.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/qurator/mods4pandas/tests/test_page_info.py b/qurator/mods4pandas/tests/test_page_info.py index 441230b..a740ffd 100644 --- a/qurator/mods4pandas/tests/test_page_info.py +++ b/qurator/mods4pandas/tests/test_page_info.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path from lxml import etree as ET @@ -8,6 +9,13 @@ from qurator.mods4pandas.mods4pandas import pages_to_dict TESTS_DATA_DIR = Path(__file__).parent / "data" +def removeprefix(s, prefix): + if sys.version_info < (3,9): + return s[len(prefix):] if s.startswith(prefix) else s + else: + return s.removeprefix(prefix) + + def test_page_info(): """Test creation of page_info""" mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml") @@ -25,5 +33,5 @@ def test_page_info(): # This is a title page with an illustration, check that we correctly got this info from the # structMap. 
- struct_types = sorted(k.removeprefix("structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) + struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) assert struct_types == ["illustration", "monograph", "title_page"] From 90c60ebb80f2a72e5ea1ea9edfc83def9172622e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 10:24:38 +0100 Subject: [PATCH 04/12] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fmets=5Ffile=20aa?= =?UTF-8?q?=20lot=20faster=20by=20using=20find()=20instead=20of=20xpath()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 9993fa9..16934e7 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -273,6 +273,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + def get_mets_file(*, ID): + if ID: + file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') + return file_ + + for page in div_physSequence: # TODO sort by ORDER? 
@@ -285,12 +291,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_id = fptr.attrib.get("FILEID") assert file_id - def get_mets_file(*, ID): - if ID: - file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0] - return file_ - file_ = get_mets_file(ID=file_id) + assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href From 1dac77a2f5c8054140dca4be81d32d1ad9735426 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 10:36:55 +0100 Subject: [PATCH 05/12] =?UTF-8?q?=E2=9A=A1=20Make=20gett=5Fstruct=5Flog=20?= =?UTF-8?q?faster=20by=20using=20find(all)=20instead=20of=20xpath()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 16934e7..32728a3 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -308,12 +308,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # This is all XLink, there might be a more generic way to traverse the links. However, currently, # it suffices to do this the old-fashioned way. 
- sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns) + sm_links = mets.findall( + f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns + ) targets = [] for sm_link in sm_links: xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") - targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns)) + targets.extend(mets.findall(f'//mets:div[@ID="{xlink_from}"]', ns)) return targets struct_divs = set(get_struct_log(to_phys=page_dict["ID"])) From 448639b05bb3142c3fa86773399e1065f41fdff1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 11:35:24 +0100 Subject: [PATCH 06/12] =?UTF-8?q?=E2=9A=A1=20Make=20gettstruct=5Flog=20fas?= =?UTF-8?q?ter=20by=20using=20precise=20predicates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 32728a3..09dd9c4 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -266,9 +266,12 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: ppn = get_mets_recordIdentifier() # Getting per-page/structure information is a bit different - structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0] - if not structMap_PHYSICAL: + structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) + structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) + if structMap_PHYSICAL is None: raise ValueError("No structMap[@TYPE='PHYSICAL'] found") + if structMap_LOGICAL is None: + raise ValueError("No structMap[@TYPE='LOGICAL'] found") div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" @@ -278,6 +281,9 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_ = 
mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') return file_ + def get_mets_div(*, ID): + if ID: + return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns) for page in div_physSequence: @@ -315,7 +321,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: targets = [] for sm_link in sm_links: xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") - targets.extend(mets.findall(f'//mets:div[@ID="{xlink_from}"]', ns)) + targets.extend(get_mets_div(ID=xlink_from)) return targets struct_divs = set(get_struct_log(to_phys=page_dict["ID"])) From 912e5d2b4a8e7c90b2c73389eb0f2d449975c402 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 11:40:45 +0100 Subject: [PATCH 07/12] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fstruct=5Flog=20f?= =?UTF-8?q?aster=20by=20using=20precise=20predicates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 09dd9c4..75cf03c 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -268,17 +268,20 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # Getting per-page/structure information is a bit different structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) + fileSec = mets.find('./mets:fileSec', ns) if structMap_PHYSICAL is None: raise ValueError("No structMap[@TYPE='PHYSICAL'] found") if structMap_LOGICAL is None: raise ValueError("No structMap[@TYPE='LOGICAL'] found") + if fileSec is None: + raise ValueError("No fileSec found") div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" def get_mets_file(*, ID): if ID: - file_ = mets.find(f'.//{{{ns["mets"]}}}file[@ID="{ID}"]') + file_ = 
fileSec.find(f'./mets:fileGrp/mets:file[@ID="{ID}"]', ns) return file_ def get_mets_div(*, ID): From 8fc4eeeb3bf6178f766f2993fcd12716bff8ab04 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 12:05:20 +0100 Subject: [PATCH 08/12] =?UTF-8?q?=E2=9A=A1=20Make=20get=5Fsets=5Ffile=20fa?= =?UTF-8?q?ster=20by=20using=20a=20lookup=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 75cf03c..4b45148 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -279,10 +279,18 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + + # Build a look-up table to get mets:file by @ID + # This cuts retrieving the mets:file down to half the time. 
+ mets_file_by_ID = {} + def _init_mets_file_by_ID(): + for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns): + mets_file_by_ID[f.attrib.get("ID")] = f + _init_mets_file_by_ID() + def get_mets_file(*, ID): if ID: - file_ = fileSec.find(f'./mets:fileGrp/mets:file[@ID="{ID}"]', ns) - return file_ + return mets_file_by_ID[ID] def get_mets_div(*, ID): if ID: From 0acaa831631937263231c051ca7cd689ddad3053 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 9 Dec 2023 12:48:07 +0100 Subject: [PATCH 09/12] =?UTF-8?q?=E2=9A=A1=20MUse=20relative=20predicate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 4b45148..7d187e1 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -326,7 +326,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # it suffices to do this the old-fashioned way. 
sm_links = mets.findall( - f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns + f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns ) targets = [] From 6226618f400973818a8d6042d8a220604fbec2af Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 12 Dec 2023 12:34:24 +0100 Subject: [PATCH 10/12] =?UTF-8?q?=F0=9F=90=9B=20mods4pandas:=20Handle=20mu?= =?UTF-8?q?ltivolume=5Fwork=20without=20structMap=20TYPE=3D'PHYSICAL'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 6 +- ...multivolume_work-no-structMap-PHYSICAL.xml | 114 ++++++++++++++++++ qurator/mods4pandas/tests/test_page_info.py | 8 ++ 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 7d187e1..028b9ac 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -270,7 +270,11 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) fileSec = mets.find('./mets:fileSec', ns) if structMap_PHYSICAL is None: - raise ValueError("No structMap[@TYPE='PHYSICAL'] found") + # This is expected in a multivolume work! 
+ if structMap_LOGICAL.find('./mets:div[@TYPE="multivolume_work"]', ns) is not None: + return [] + else: + raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)") if structMap_LOGICAL is None: raise ValueError("No structMap[@TYPE='LOGICAL'] found") if fileSec is None: diff --git a/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml b/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml new file mode 100644 index 0000000..3ed745c --- /dev/null +++ b/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml @@ -0,0 +1,114 @@ + + + + + Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015 + Goobi + + + + + + + + + Herborn + + Buchhandlung des Nassauischen Colportagevereins + 1916 + + + + Berlin + + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany + [Electronic ed.] + + Krieg 1914-1918 + Historische Drucke + + PPN717884805 + + http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000 + + + PPN242046452 + + + + Die Predigt des Evangeliums in der Zeitenwende + Erläuterungen und Dispositionen zu den altkirchlichen und den Eisenacher Perikopen und zu freien Texten unter besonderer Berücksichtigung der Kriegszeit + + P_Drucke_Europeana1914-1918 + + book + + Weltkr. 
625 + + ger + + + + Europeana Collections 1914-1918 + + + + + aut + + Dunkmann + Karl + Dunkmann, Karl + + + reformatted digital + + + 217 + + + sh2010119545 + sh2008113843 + + UNKNOWN + text + + + + + + + + + + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz + http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000 + http://www.staatsbibliothek-berlin.de + mailto:info@sbb.spk-berlin.de + + + + + + + + + http://www.stabikat.de/DB=1/PPN?PPN=717884805 + http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN717884805 + https://content.staatsbibliothek-berlin.de/dc/PPN717884805/manifest + + + + + + + + + + + + + + + + diff --git a/qurator/mods4pandas/tests/test_page_info.py b/qurator/mods4pandas/tests/test_page_info.py index a740ffd..87eeac7 100644 --- a/qurator/mods4pandas/tests/test_page_info.py +++ b/qurator/mods4pandas/tests/test_page_info.py @@ -35,3 +35,11 @@ def test_page_info(): # structMap. struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) assert struct_types == ["illustration", "monograph", "title_page"] + + +def test_page_info_multivolume_work(): + """Test creation of page_info for multivolume_work""" + mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml") + page_info = pages_to_dict(mets) + assert page_info == [] + From b8a287258240fb149b9976667ce2b0170bbb2519 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 12 Dec 2023 13:13:23 +0100 Subject: [PATCH 11/12] =?UTF-8?q?=F0=9F=90=9B=20mods4pandas:=20Handle=20pe?= =?UTF-8?q?riodical=20without=20structMap=20TYPE=3D'PHYSICAL'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 028b9ac..75f2caf 100755 
--- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -270,8 +270,11 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) fileSec = mets.find('./mets:fileSec', ns) if structMap_PHYSICAL is None: - # This is expected in a multivolume work! - if structMap_LOGICAL.find('./mets:div[@TYPE="multivolume_work"]', ns) is not None: + # This is expected in a multivolume work or periodical! + if any( + structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None + for t in ["multivolume_work", "periodical"] + ): return [] else: raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)") From e9fca0f563868e0573b279f01ab3aea7866739b3 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 12 Dec 2023 13:33:05 +0100 Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=90=9B=20mods4pandas:=20Handle=20Mu?= =?UTF-8?q?ltiVolumeWork=20(differently=20spelled=20type=20cp.=20to=20befo?= =?UTF-8?q?re)=20without=20structMap=20TYPE=3D'PHYSICAL'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/mods4pandas/mods4pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 75f2caf..6a8c1c6 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -273,7 +273,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # This is expected in a multivolume work or periodical! if any( structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None - for t in ["multivolume_work", "periodical"] + for t in ["multivolume_work", "MultivolumeWork", "periodical"] ): return [] else: