1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-26 20:19:56 +02:00
modstool/src/mods4pandas/tests/test_page_info.py

58 lines
1.6 KiB
Python
Raw Normal View History

import sys
2023-12-08 15:58:59 +01:00
from pathlib import Path
from lxml import etree as ET
2024-07-25 13:20:18 +02:00
from mods4pandas.mods4pandas import pages_to_dict
2023-12-08 15:58:59 +01:00
TESTS_DATA_DIR = Path(__file__).parent / "data"
def removeprefix(s, prefix):
    """Return ``s`` without a leading ``prefix``; return ``s`` unchanged otherwise.

    Compatibility shim: interpreters from Python 3.9 on delegate to the
    built-in ``removeprefix`` method, older ones fall back to an equivalent
    ``startswith`` check plus slice.
    """
    if sys.version_info >= (3, 9):
        return s.removeprefix(prefix)
    if s.startswith(prefix):
        return s[len(prefix) :]
    return s
2023-12-08 15:58:59 +01:00
def test_page_info():
    """Test creation of page_info for a single work with 1361 pages."""
    mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml")
    page_info = pages_to_dict(mets)

    # We have 1361 pages for this one work.
    assert len(page_info) == 1361
    assert all(p["ppn"] == "PPN821507109" for p in page_info)

    # Look closer at an interesting page.
    page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")
    assert (
        page_info_page["fileGrp_PRESENTATION_file_FLocat_href"]
        == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"
    )

    # This is a title page with an illustration, check that we correctly got this info from the
    # structMap.
    # The prefix is named once so the key filter and the stripping below always agree.
    type_prefix = "structMap-LOGICAL_TYPE_"
    struct_types = sorted(
        removeprefix(k, type_prefix)
        for k, v in page_info_page.items()
        if k.startswith(type_prefix) and v == 1
    )
    assert struct_types == ["illustration", "monograph", "title_page"]
def test_page_info_multivolume_work():
    """Test that pages_to_dict yields no page entries for the multivolume_work fixture."""
    # Per its file name, this fixture is a multivolume work without a PHYSICAL structMap.
    fixture_path = (
        TESTS_DATA_DIR
        / "mets-mods"
        / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml"
    )
    tree = ET.parse(fixture_path)

    assert pages_to_dict(tree) == []