diff --git a/README-DEV.md b/README-DEV.md new file mode 100644 index 0000000..6d1f17d --- /dev/null +++ b/README-DEV.md @@ -0,0 +1,16 @@ +``` +pip install -r requirements-test.txt +``` + +To run tests: +``` +pytest +``` + +To run a test with profiling: + +1. Make sure graphviz is installed +2. Run pytest with profiling enabled: + ``` + pytest --profile-svg -k test_page_info + ``` diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py index 9993fa9..6a8c1c6 100755 --- a/qurator/mods4pandas/mods4pandas.py +++ b/qurator/mods4pandas/mods4pandas.py @@ -266,13 +266,43 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: ppn = get_mets_recordIdentifier() # Getting per-page/structure information is a bit different - structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0] - if not structMap_PHYSICAL: - raise ValueError("No structMap[@TYPE='PHYSICAL'] found") + structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) + structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) + fileSec = mets.find('./mets:fileSec', ns) + if structMap_PHYSICAL is None: + # This is expected in a multivolume work or periodical! + if any( + structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None + for t in ["multivolume_work", "MultivolumeWork", "periodical"] + ): + return [] + else: + raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)") + if structMap_LOGICAL is None: + raise ValueError("No structMap[@TYPE='LOGICAL'] found") + if fileSec is None: + raise ValueError("No fileSec found") div_physSequence = structMap_PHYSICAL[0] assert div_physSequence.attrib.get("TYPE") == "physSequence" + + # Build a look-up table to get mets:file by @ID + # This cuts retrieving the mets:file down to half the time. 
+ mets_file_by_ID = {} + def _init_mets_file_by_ID(): + for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns): + mets_file_by_ID[f.attrib.get("ID")] = f + _init_mets_file_by_ID() + + def get_mets_file(*, ID): + if ID: + return mets_file_by_ID[ID] + + def get_mets_div(*, ID): + if ID: + return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns) + for page in div_physSequence: # TODO sort by ORDER? @@ -285,12 +315,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: file_id = fptr.attrib.get("FILEID") assert file_id - def get_mets_file(*, ID): - if ID: - file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0] - return file_ - file_ = get_mets_file(ID=file_id) + assert file_ is not None fileGrp_USE = file_.getparent().attrib.get("USE") file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href @@ -306,12 +332,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: # This is all XLink, there might be a more generic way to traverse the links. However, currently, # it suffices to do this the old-fashioned way. 
- sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns) + sm_links = mets.findall( + f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns + ) targets = [] for sm_link in sm_links: xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") - targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns)) + targets.extend(get_mets_div(ID=xlink_from)) return targets struct_divs = set(get_struct_log(to_phys=page_dict["ID"])) diff --git a/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml b/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml new file mode 100644 index 0000000..3ed745c --- /dev/null +++ b/qurator/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml @@ -0,0 +1,114 @@ + + + + + Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015 + Goobi + + + + + + + + + Herborn + + Buchhandlung des Nassauischen Colportagevereins + 1916 + + + + Berlin + + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany + [Electronic ed.] + + Krieg 1914-1918 + Historische Drucke + + PPN717884805 + + http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000 + + + PPN242046452 + + + + Die Predigt des Evangeliums in der Zeitenwende + Erläuterungen und Dispositionen zu den altkirchlichen und den Eisenacher Perikopen und zu freien Texten unter besonderer Berücksichtigung der Kriegszeit + + P_Drucke_Europeana1914-1918 + + book + + Weltkr. 
625 + + ger + + + + Europeana Collections 1914-1918 + + + + + aut + + Dunkmann + Karl + Dunkmann, Karl + + + reformatted digital + + + 217 + + + sh2010119545 + sh2008113843 + + UNKNOWN + text + + + + + + + + + + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz + http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000 + http://www.staatsbibliothek-berlin.de + mailto:info@sbb.spk-berlin.de + + + + + + + + + http://www.stabikat.de/DB=1/PPN?PPN=717884805 + http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN717884805 + https://content.staatsbibliothek-berlin.de/dc/PPN717884805/manifest + + + + + + + + + + + + + + + + diff --git a/qurator/mods4pandas/tests/data/mets-mods/sbb-mets-PPN821507109.xml b/qurator/mods4pandas/tests/data/mets-mods/PPN821507109-1361-pages.xml old mode 100755 new mode 100644 similarity index 100% rename from qurator/mods4pandas/tests/data/mets-mods/sbb-mets-PPN821507109.xml rename to qurator/mods4pandas/tests/data/mets-mods/PPN821507109-1361-pages.xml diff --git a/qurator/mods4pandas/tests/test_page_info.py b/qurator/mods4pandas/tests/test_page_info.py new file mode 100644 index 0000000..87eeac7 --- /dev/null +++ b/qurator/mods4pandas/tests/test_page_info.py @@ -0,0 +1,45 @@ +import sys +from pathlib import Path + +from lxml import etree as ET + +from qurator.mods4pandas.mods4pandas import pages_to_dict + + +TESTS_DATA_DIR = Path(__file__).parent / "data" + + +def removeprefix(s, prefix): + if sys.version_info < (3,9): + return s[len(prefix):] if s.startswith(prefix) else s + else: + return s.removeprefix(prefix) + + +def test_page_info(): + """Test creation of page_info""" + mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml") + page_info = pages_to_dict(mets) + + # We have 1361 pages for this one work. 
+ assert len(page_info) == 1361 + assert all(p["ppn"] == "PPN821507109" for p in page_info) + + # Look closer at an interesting page + from pprint import pprint; pprint(page_info[0]) + page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005") + + assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif" + + # This is a title page with an illustration, check that we correctly got this info from the + # structMap. + struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) + assert struct_types == ["illustration", "monograph", "title_page"] + + +def test_page_info_multivolume_work(): + """Test creation of page_info for multivolume_work""" + mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml") + page_info = pages_to_dict(mets) + assert page_info == [] + diff --git a/requirements-test.txt b/requirements-test.txt index e079f8a..6f0f369 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1 +1,2 @@ pytest +pytest-profiling