Merge branch 'feat/page_info' of https://github.com/qurator-spk/mods4pandas into feat/page_info

master
Mike Gerber 5 months ago
commit acd9c5cd4b

@ -0,0 +1,16 @@
```
pip install -r requirements-test.txt
```
To run tests:
```
pytest
```
To run a test with profiling:
1. Make sure graphviz is installed
2. Run pytest with profiling enabled:
```
pytest --profile-svg -k test_page_info
```

@ -266,13 +266,43 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
ppn = get_mets_recordIdentifier() ppn = get_mets_recordIdentifier()
# Getting per-page/structure information is a bit different # Getting per-page/structure information is a bit different
structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0] structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
if not structMap_PHYSICAL: structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
raise ValueError("No structMap[@TYPE='PHYSICAL'] found") fileSec = mets.find('./mets:fileSec', ns)
if structMap_PHYSICAL is None:
# This is expected in a multivolume work or periodical!
if any(
structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
for t in ["multivolume_work", "MultivolumeWork", "periodical"]
):
return []
else:
raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
if structMap_LOGICAL is None:
raise ValueError("No structMap[@TYPE='LOGICAL'] found")
if fileSec is None:
raise ValueError("No fileSec found")
div_physSequence = structMap_PHYSICAL[0] div_physSequence = structMap_PHYSICAL[0]
assert div_physSequence.attrib.get("TYPE") == "physSequence" assert div_physSequence.attrib.get("TYPE") == "physSequence"
# Build a look-up table to get mets:file by @ID
# This cuts retrieving the mets:file down to half the time.
mets_file_by_ID = {}
def _init_mets_file_by_ID():
    """Fill the mets_file_by_ID look-up table from the fileSec.

    Each mets:file found under a mets:fileGrp is stored under its @ID
    attribute value; a later file with the same @ID replaces an earlier one.
    """
    mets_file_by_ID.update(
        (file_el.attrib.get("ID"), file_el)
        for file_el in fileSec.iterfind('./mets:fileGrp/mets:file', ns)
    )
_init_mets_file_by_ID()
def get_mets_file(*, ID):
    """Return the mets:file element registered under the given @ID.

    Returns None when ID is empty or None. Raises KeyError when a non-empty
    ID is not present in the mets_file_by_ID look-up table.
    """
    if not ID:
        return None
    return mets_file_by_ID[ID]
def get_mets_div(*, ID):
    """Return the mets:div elements of the LOGICAL structMap with the given @ID.

    The result is always a list so that callers can feed it straight into
    ``list.extend``. An empty or missing ID yields an empty list instead of
    None (which would make ``extend`` raise TypeError).
    """
    if not ID:
        return []
    return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)
for page in div_physSequence: for page in div_physSequence:
# TODO sort by ORDER? # TODO sort by ORDER?
@ -285,12 +315,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
file_id = fptr.attrib.get("FILEID") file_id = fptr.attrib.get("FILEID")
assert file_id assert file_id
def get_mets_file(*, ID):
if ID:
file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0]
return file_
file_ = get_mets_file(ID=file_id) file_ = get_mets_file(ID=file_id)
assert file_ is not None
fileGrp_USE = file_.getparent().attrib.get("USE") fileGrp_USE = file_.getparent().attrib.get("USE")
file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
@ -306,12 +332,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
# This is all XLink, there might be a more generic way to traverse the links. However, currently, # This is all XLink, there might be a more generic way to traverse the links. However, currently,
# it suffices to do this the old-fashioned way. # it suffices to do this the old-fashioned way.
sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns) sm_links = mets.findall(
f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
)
targets = [] targets = []
for sm_link in sm_links: for sm_link in sm_links:
xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns)) targets.extend(get_mets_div(ID=xlink_from))
return targets return targets
struct_divs = set(get_struct_log(to_phys=page_dict["ID"])) struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))

@ -0,0 +1,114 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2019-02-01T13:50:33">
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
<mets:name>Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16November2015</mets:name>
<mets:note>Goobi</mets:note>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMDLOG_0000">
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:originInfo eventType="publication">
<mods:place>
<mods:placeTerm type="text">Herborn</mods:placeTerm>
</mods:place>
<mods:publisher>Buchhandlung des Nassauischen Colportagevereins</mods:publisher>
<mods:dateIssued encoding="iso8601" keyDate="yes" point="start">1916</mods:dateIssued>
</mods:originInfo>
<mods:originInfo eventType="digitization">
<mods:place>
<mods:placeTerm type="text">Berlin</mods:placeTerm>
</mods:place>
<mods:publisher>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany</mods:publisher>
<mods:edition>[Electronic ed.]</mods:edition>
</mods:originInfo>
<mods:classification authority="ZVDD">Krieg 1914-1918</mods:classification>
<mods:classification authority="ZVDD">Historische Drucke</mods:classification>
<mods:recordInfo>
<mods:recordIdentifier source="gbv-ppn">PPN717884805</mods:recordIdentifier>
</mods:recordInfo>
<mods:identifier type="purl">http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000</mods:identifier>
<mods:relatedItem type="original">
<mods:recordInfo>
<mods:recordIdentifier source="gbv-ppn">PPN242046452</mods:recordIdentifier>
</mods:recordInfo>
</mods:relatedItem>
<mods:titleInfo>
<mods:title>Die Predigt des Evangeliums in der Zeitenwende</mods:title>
<mods:subTitle>Erläuterungen und Dispositionen zu den altkirchlichen und den Eisenacher Perikopen und zu freien Texten unter besonderer Berücksichtigung der Kriegszeit</mods:subTitle>
</mods:titleInfo>
<mods:note type="source characteristics">P_Drucke_Europeana1914-1918</mods:note>
<mods:subject authority="EC1418">
<mods:genre>book</mods:genre>
</mods:subject>
<mods:classification authority="sbb">Weltkr. 625</mods:classification>
<mods:language>
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
</mods:language>
<mods:relatedItem type="series">
<mods:titleInfo>
<mods:title>Europeana Collections 1914-1918</mods:title>
</mods:titleInfo>
</mods:relatedItem>
<mods:name type="personal">
<mods:role>
<mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
</mods:role>
<mods:namePart type="family">Dunkmann</mods:namePart>
<mods:namePart type="given">Karl</mods:namePart>
<mods:displayForm>Dunkmann, Karl</mods:displayForm>
</mods:name>
<mods:physicalDescription>
<mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
</mods:physicalDescription>
<mods:language>
<mods:scriptTerm authority="iso15924" type="code">217</mods:scriptTerm>
</mods:language>
<mods:subject authority="lcsh">
<mods:topic>sh2010119545</mods:topic>
<mods:topic>sh2008113843</mods:topic>
</mods:subject>
<mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition>
<mods:typeOfResource>text</mods:typeOfResource>
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:amdSec ID="AMD">
<mets:rightsMD ID="RIGHTS">
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVRIGHTS">
<mets:xmlData>
<dv:rights xmlns:dv="http://dfg-viewer.de/">
<dv:owner>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz</dv:owner>
<dv:ownerLogo>http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000</dv:ownerLogo>
<dv:ownerSiteURL>http://www.staatsbibliothek-berlin.de</dv:ownerSiteURL>
<dv:ownerContact>mailto:info@sbb.spk-berlin.de</dv:ownerContact>
</dv:rights>
</mets:xmlData>
</mets:mdWrap>
</mets:rightsMD>
<mets:digiprovMD ID="DIGIPROV">
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVLINKS">
<mets:xmlData>
<dv:links xmlns:dv="http://dfg-viewer.de/">
<dv:reference>http://www.stabikat.de/DB=1/PPN?PPN=717884805 </dv:reference>
<dv:presentation>http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN717884805</dv:presentation>
<dv:iiif>https://content.staatsbibliothek-berlin.de/dc/PPN717884805/manifest</dv:iiif>
</dv:links>
</mets:xmlData>
</mets:mdWrap>
</mets:digiprovMD>
</mets:amdSec>
<mets:structMap TYPE="LOGICAL">
<mets:div ADMID="AMD" CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000" DMDID="DMDLOG_0000" ID="LOG_0000" LABEL="Die Predigt des Evangeliums in der Zeitenwende" ORDERLABEL="Predigt des Evangeliums in der Zeitenwende" TYPE="multivolume_work">
<mets:div ID="LOG_0001" LABEL="Altkirchliche Perikopen" ORDERLABEL="Altkirchliche Perikopen" TYPE="volume">
<mets:mptr xmlns:xlink="http://www.w3.org/1999/xlink" LOCTYPE="URL" xlink:href="http://digital.staatsbibliothek-berlin.de/dms/metsresolver/?PPN=PPN717885003"/>
</mets:div>
<mets:div ID="LOG_0002" TYPE="Volume" LABEL="Eisenacher Perikopen Bd. 2" ORDER="20">
<mets:mptr xmlns:xlink="http://www.w3.org/1999/xlink" LOCTYPE="URL" xlink:href="http://digital.staatsbibliothek-berlin.de/dms/metsresolver/?PPN=PPN717885429"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>

@ -0,0 +1,45 @@
import sys
from pathlib import Path
from lxml import etree as ET
from qurator.mods4pandas.mods4pandas import pages_to_dict
TESTS_DATA_DIR = Path(__file__).parent / "data"
def removeprefix(s, prefix):
    """Return s without a leading prefix, or s unchanged if it does not start with it.

    Uses str.removeprefix where available (Python 3.9+) and falls back to
    slicing on older interpreters.
    """
    if sys.version_info >= (3, 9):
        return s.removeprefix(prefix)
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
def test_page_info():
    """Test creation of page_info"""
    mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml")
    page_info = pages_to_dict(mets)

    # We have 1361 pages for this one work.
    assert len(page_info) == 1361
    assert all(p["ppn"] == "PPN821507109" for p in page_info)

    # Look closer at an interesting page
    page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")
    assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"

    # This is a title page with an illustration, check that we correctly got this info from the
    # structMap.
    prefix = "structMap-LOGICAL_TYPE_"
    struct_types = sorted(
        removeprefix(k, prefix)
        for k, v in page_info_page.items()
        if k.startswith(prefix) and v == 1
    )
    assert struct_types == ["illustration", "monograph", "title_page"]
def test_page_info_multivolume_work():
    """Test creation of page_info for multivolume_work"""
    # This fixture has no structMap[@TYPE="PHYSICAL"]; the expected result is
    # an empty page list.
    fixture = TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml"
    mets = ET.parse(fixture)
    assert pages_to_dict(mets) == []

@ -1 +1,2 @@
pytest pytest
pytest-profiling

Loading…
Cancel
Save