mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-09 19:59:57 +02:00
Merge branch 'feat/page_info' of https://github.com/qurator-spk/mods4pandas into feat/page_info
This commit is contained in:
commit
acd9c5cd4b
6 changed files with 214 additions and 10 deletions
16
README-DEV.md
Normal file
16
README-DEV.md
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
```
|
||||||
|
pip install -r requirements-test.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
To run tests:
|
||||||
|
```
|
||||||
|
pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
To run a test with profiling:
|
||||||
|
|
||||||
|
1. Make sure graphviz is installed
|
||||||
|
2. Run pytest with profiling enabled:
|
||||||
|
```
|
||||||
|
pytest --profile-svg -k test_page_info
|
||||||
|
```
|
|
@ -266,13 +266,43 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
ppn = get_mets_recordIdentifier()
|
ppn = get_mets_recordIdentifier()
|
||||||
|
|
||||||
# Getting per-page/structure information is a bit different
|
# Getting per-page/structure information is a bit different
|
||||||
structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0]
|
structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
|
||||||
if not structMap_PHYSICAL:
|
structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
|
||||||
raise ValueError("No structMap[@TYPE='PHYSICAL'] found")
|
fileSec = mets.find('./mets:fileSec', ns)
|
||||||
|
if structMap_PHYSICAL is None:
|
||||||
|
# This is expected in a multivolume work or periodical!
|
||||||
|
if any(
|
||||||
|
structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
|
||||||
|
for t in ["multivolume_work", "MultivolumeWork", "periodical"]
|
||||||
|
):
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
|
||||||
|
if structMap_LOGICAL is None:
|
||||||
|
raise ValueError("No structMap[@TYPE='LOGICAL'] found")
|
||||||
|
if fileSec is None:
|
||||||
|
raise ValueError("No fileSec found")
|
||||||
|
|
||||||
div_physSequence = structMap_PHYSICAL[0]
|
div_physSequence = structMap_PHYSICAL[0]
|
||||||
assert div_physSequence.attrib.get("TYPE") == "physSequence"
|
assert div_physSequence.attrib.get("TYPE") == "physSequence"
|
||||||
|
|
||||||
|
|
||||||
|
# Build a look-up table to get mets:file by @ID
|
||||||
|
# This cuts retrieving the mets:file down to half the time.
|
||||||
|
mets_file_by_ID = {}
|
||||||
|
def _init_mets_file_by_ID():
|
||||||
|
for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns):
|
||||||
|
mets_file_by_ID[f.attrib.get("ID")] = f
|
||||||
|
_init_mets_file_by_ID()
|
||||||
|
|
||||||
|
def get_mets_file(*, ID):
|
||||||
|
if ID:
|
||||||
|
return mets_file_by_ID[ID]
|
||||||
|
|
||||||
|
def get_mets_div(*, ID):
|
||||||
|
if ID:
|
||||||
|
return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)
|
||||||
|
|
||||||
for page in div_physSequence:
|
for page in div_physSequence:
|
||||||
|
|
||||||
# TODO sort by ORDER?
|
# TODO sort by ORDER?
|
||||||
|
@ -285,12 +315,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
file_id = fptr.attrib.get("FILEID")
|
file_id = fptr.attrib.get("FILEID")
|
||||||
assert file_id
|
assert file_id
|
||||||
|
|
||||||
def get_mets_file(*, ID):
|
|
||||||
if ID:
|
|
||||||
file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0]
|
|
||||||
return file_
|
|
||||||
|
|
||||||
file_ = get_mets_file(ID=file_id)
|
file_ = get_mets_file(ID=file_id)
|
||||||
|
assert file_ is not None
|
||||||
fileGrp_USE = file_.getparent().attrib.get("USE")
|
fileGrp_USE = file_.getparent().attrib.get("USE")
|
||||||
file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
|
file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
|
||||||
page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
|
page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
|
||||||
|
@ -306,12 +332,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
# This is all XLink, there might be a more generic way to traverse the links. However, currently,
|
# This is all XLink, there might be a more generic way to traverse the links. However, currently,
|
||||||
# it suffices to do this the old-fashioned way.
|
# it suffices to do this the old-fashioned way.
|
||||||
|
|
||||||
sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns)
|
sm_links = mets.findall(
|
||||||
|
f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
|
||||||
|
)
|
||||||
|
|
||||||
targets = []
|
targets = []
|
||||||
for sm_link in sm_links:
|
for sm_link in sm_links:
|
||||||
xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
|
xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
|
||||||
targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns))
|
targets.extend(get_mets_div(ID=xlink_from))
|
||||||
return targets
|
return targets
|
||||||
|
|
||||||
struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))
|
struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
|
||||||
|
<mets:metsHdr CREATEDATE="2019-02-01T13:50:33">
|
||||||
|
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
|
||||||
|
<mets:name>Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015</mets:name>
|
||||||
|
<mets:note>Goobi</mets:note>
|
||||||
|
</mets:agent>
|
||||||
|
</mets:metsHdr>
|
||||||
|
<mets:dmdSec ID="DMDLOG_0000">
|
||||||
|
<mets:mdWrap MDTYPE="MODS">
|
||||||
|
<mets:xmlData>
|
||||||
|
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||||
|
<mods:originInfo eventType="publication">
|
||||||
|
<mods:place>
|
||||||
|
<mods:placeTerm type="text">Herborn</mods:placeTerm>
|
||||||
|
</mods:place>
|
||||||
|
<mods:publisher>Buchhandlung des Nassauischen Colportagevereins</mods:publisher>
|
||||||
|
<mods:dateIssued encoding="iso8601" keyDate="yes" point="start">1916</mods:dateIssued>
|
||||||
|
</mods:originInfo>
|
||||||
|
<mods:originInfo eventType="digitization">
|
||||||
|
<mods:place>
|
||||||
|
<mods:placeTerm type="text">Berlin</mods:placeTerm>
|
||||||
|
</mods:place>
|
||||||
|
<mods:publisher>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany</mods:publisher>
|
||||||
|
<mods:edition>[Electronic ed.]</mods:edition>
|
||||||
|
</mods:originInfo>
|
||||||
|
<mods:classification authority="ZVDD">Krieg 1914-1918</mods:classification>
|
||||||
|
<mods:classification authority="ZVDD">Historische Drucke</mods:classification>
|
||||||
|
<mods:recordInfo>
|
||||||
|
<mods:recordIdentifier source="gbv-ppn">PPN717884805</mods:recordIdentifier>
|
||||||
|
</mods:recordInfo>
|
||||||
|
<mods:identifier type="purl">http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000</mods:identifier>
|
||||||
|
<mods:relatedItem type="original">
|
||||||
|
<mods:recordInfo>
|
||||||
|
<mods:recordIdentifier source="gbv-ppn">PPN242046452</mods:recordIdentifier>
|
||||||
|
</mods:recordInfo>
|
||||||
|
</mods:relatedItem>
|
||||||
|
<mods:titleInfo>
|
||||||
|
<mods:title>Die Predigt des Evangeliums in der Zeitenwende</mods:title>
|
||||||
|
<mods:subTitle>Erläuterungen und Dispositionen zu den altkirchlichen und den Eisenacher Perikopen und zu freien Texten unter besonderer Berücksichtigung der Kriegszeit</mods:subTitle>
|
||||||
|
</mods:titleInfo>
|
||||||
|
<mods:note type="source characteristics">P_Drucke_Europeana1914-1918</mods:note>
|
||||||
|
<mods:subject authority="EC1418">
|
||||||
|
<mods:genre>book</mods:genre>
|
||||||
|
</mods:subject>
|
||||||
|
<mods:classification authority="sbb">Weltkr. 625</mods:classification>
|
||||||
|
<mods:language>
|
||||||
|
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
|
||||||
|
</mods:language>
|
||||||
|
<mods:relatedItem type="series">
|
||||||
|
<mods:titleInfo>
|
||||||
|
<mods:title>Europeana Collections 1914-1918</mods:title>
|
||||||
|
</mods:titleInfo>
|
||||||
|
</mods:relatedItem>
|
||||||
|
<mods:name type="personal">
|
||||||
|
<mods:role>
|
||||||
|
<mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
|
||||||
|
</mods:role>
|
||||||
|
<mods:namePart type="family">Dunkmann</mods:namePart>
|
||||||
|
<mods:namePart type="given">Karl</mods:namePart>
|
||||||
|
<mods:displayForm>Dunkmann, Karl</mods:displayForm>
|
||||||
|
</mods:name>
|
||||||
|
<mods:physicalDescription>
|
||||||
|
<mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
|
||||||
|
</mods:physicalDescription>
|
||||||
|
<mods:language>
|
||||||
|
<mods:scriptTerm authority="iso15924" type="code">217</mods:scriptTerm>
|
||||||
|
</mods:language>
|
||||||
|
<mods:subject authority="lcsh">
|
||||||
|
<mods:topic>sh2010119545</mods:topic>
|
||||||
|
<mods:topic>sh2008113843</mods:topic>
|
||||||
|
</mods:subject>
|
||||||
|
<mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition>
|
||||||
|
<mods:typeOfResource>text</mods:typeOfResource>
|
||||||
|
</mods:mods>
|
||||||
|
</mets:xmlData>
|
||||||
|
</mets:mdWrap>
|
||||||
|
</mets:dmdSec>
|
||||||
|
<mets:amdSec ID="AMD">
|
||||||
|
<mets:rightsMD ID="RIGHTS">
|
||||||
|
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVRIGHTS">
|
||||||
|
<mets:xmlData>
|
||||||
|
<dv:rights xmlns:dv="http://dfg-viewer.de/">
|
||||||
|
<dv:owner>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz</dv:owner>
|
||||||
|
<dv:ownerLogo>http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000</dv:ownerLogo>
|
||||||
|
<dv:ownerSiteURL>http://www.staatsbibliothek-berlin.de</dv:ownerSiteURL>
|
||||||
|
<dv:ownerContact>mailto:info@sbb.spk-berlin.de</dv:ownerContact>
|
||||||
|
</dv:rights>
|
||||||
|
</mets:xmlData>
|
||||||
|
</mets:mdWrap>
|
||||||
|
</mets:rightsMD>
|
||||||
|
<mets:digiprovMD ID="DIGIPROV">
|
||||||
|
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVLINKS">
|
||||||
|
<mets:xmlData>
|
||||||
|
<dv:links xmlns:dv="http://dfg-viewer.de/">
|
||||||
|
<dv:reference>http://www.stabikat.de/DB=1/PPN?PPN=717884805 </dv:reference>
|
||||||
|
<dv:presentation>http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN717884805</dv:presentation>
|
||||||
|
<dv:iiif>https://content.staatsbibliothek-berlin.de/dc/PPN717884805/manifest</dv:iiif>
|
||||||
|
</dv:links>
|
||||||
|
</mets:xmlData>
|
||||||
|
</mets:mdWrap>
|
||||||
|
</mets:digiprovMD>
|
||||||
|
</mets:amdSec>
|
||||||
|
<mets:structMap TYPE="LOGICAL">
|
||||||
|
<mets:div ADMID="AMD" CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008D1E00000000" DMDID="DMDLOG_0000" ID="LOG_0000" LABEL="Die Predigt des Evangeliums in der Zeitenwende" ORDERLABEL="Predigt des Evangeliums in der Zeitenwende" TYPE="multivolume_work">
|
||||||
|
<mets:div ID="LOG_0001" LABEL="Altkirchliche Perikopen" ORDERLABEL="Altkirchliche Perikopen" TYPE="volume">
|
||||||
|
<mets:mptr xmlns:xlink="http://www.w3.org/1999/xlink" LOCTYPE="URL" xlink:href="http://digital.staatsbibliothek-berlin.de/dms/metsresolver/?PPN=PPN717885003"/>
|
||||||
|
</mets:div>
|
||||||
|
<mets:div ID="LOG_0002" TYPE="Volume" LABEL="Eisenacher Perikopen Bd. 2" ORDER="20">
|
||||||
|
<mets:mptr xmlns:xlink="http://www.w3.org/1999/xlink" LOCTYPE="URL" xlink:href="http://digital.staatsbibliothek-berlin.de/dms/metsresolver/?PPN=PPN717885429"/>
|
||||||
|
</mets:div>
|
||||||
|
</mets:div>
|
||||||
|
</mets:structMap>
|
||||||
|
</mets:mets>
|
45
qurator/mods4pandas/tests/test_page_info.py
Normal file
45
qurator/mods4pandas/tests/test_page_info.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from qurator.mods4pandas.mods4pandas import pages_to_dict
|
||||||
|
|
||||||
|
|
||||||
|
TESTS_DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
|
||||||
|
|
||||||
|
def removeprefix(s, prefix):
    """Return *s* with a leading *prefix* stripped, or *s* unchanged if absent.

    Compatibility shim: uses str.removeprefix() on Python 3.9 and later, and
    an equivalent slice on older interpreters.
    """
    if sys.version_info >= (3, 9):
        return s.removeprefix(prefix)
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_info():
    """Test creation of page_info"""
    mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml")
    page_info = pages_to_dict(mets)

    # We have 1361 pages for this one work.
    assert len(page_info) == 1361
    assert all(p["ppn"] == "PPN821507109" for p in page_info)

    # Look closer at an interesting page
    page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")

    assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"

    # This is a title page with an illustration, check that we correctly got this info from the
    # structMap. Keys flagged with 1 are the logical struct types the page belongs to.
    type_prefix = "structMap-LOGICAL_TYPE_"
    struct_types = sorted(
        removeprefix(k, type_prefix)
        for k, v in page_info_page.items()
        if k.startswith(type_prefix) and v == 1
    )
    assert struct_types == ["illustration", "monograph", "title_page"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_info_multivolume_work():
    """Check that pages_to_dict returns an empty list for the multivolume_work fixture.

    The fixture is the METS file named as having no PHYSICAL structMap.
    """
    fixture = TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml"
    assert pages_to_dict(ET.parse(fixture)) == []
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
pytest
|
pytest
|
||||||
|
pytest-profiling
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue