mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-08 19:29:57 +02:00
✨ page_info: Retrieve filenames + structMap types
This commit is contained in:
parent
dd3943eaf6
commit
889d36f0d4
2 changed files with 94 additions and 3 deletions
|
@ -13,7 +13,8 @@ __all__ = ["ns"]
|
||||||
# XML namespace prefixes used for all METS/MODS/ALTO XPath queries below.
ns = {
    'mets': 'http://www.loc.gov/METS/',
    'mods': 'http://www.loc.gov/mods/v3',
    'alto': 'http://www.loc.gov/standards/alto/ns-v2',
    'xlink': 'http://www.w3.org/1999/xlink',
}
||||||
|
@ -25,9 +26,12 @@ class TagGroup:
|
||||||
self.tag = tag
|
self.tag = tag
|
||||||
self.group = group
|
self.group = group
|
||||||
|
|
||||||
def __str__(self):
|
def to_xml(self):
    """Serialize every element of this group as XML, one fragment per line."""
    fragments = (str(ET.tostring(element), 'utf-8').strip() for element in self.group)
    return '\n'.join(fragments)
|
def __str__(self):
    """Readable representation: label plus the group's serialized XML."""
    xml_text = self.to_xml()
    return f"TagGroup with content:\n{xml_text}"
def is_singleton(self):
    """Ensure the group holds exactly one element; raise ValueError otherwise."""
    count = len(self.group)
    if count != 1:
        raise ValueError('More than one instance: {}'.format(self))
|
@ -252,9 +252,90 @@ def mets_to_dict(mets, raise_errors=True):
|
||||||
raise ValueError('Unknown tag "{}"'.format(tag))
|
raise ValueError('Unknown tag "{}"'.format(tag))
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
def pages_to_dict(mets, raise_errors=True) -> list[dict]:
    """
    Extract per-page information from a METS document.

    For every page div in structMap[@TYPE="PHYSICAL"], build a dict with:
    - "ppn": the document's recordIdentifier (source "gbv-ppn")
    - "ID": the page div's ID attribute
    - "fileGrp_{USE}_file_FLocat_href": the FLocat href of the file referenced
      by each mets:fptr on the page, keyed by the owning fileGrp's USE
    - "structmap_LOGICAL_TYPE_{TYPE}": 1 for every logical structMap div that
      links to the page (directly, or as an ancestor of a linking div)

    Keyword arguments:
    mets -- lxml element tree of the METS document (must support .xpath())
    raise_errors -- accepted for interface symmetry with mets_to_dict();
                    structural problems always raise ValueError here

    Raises ValueError if the document lacks the expected structure.
    """
    result = []

    # PPN
    def get_mets_recordIdentifier(*, source="gbv-ppn"):
        return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
                           namespaces=ns) or [None])[0].text
    ppn = get_mets_recordIdentifier()

    # Getting per-page/structure information is a bit different
    structMap_PHYSICAL = (mets.xpath('//mets:structMap[@TYPE="PHYSICAL"]', namespaces=ns) or [None])[0]
    if not structMap_PHYSICAL:
        raise ValueError("No structMap[@TYPE='PHYSICAL'] found")

    div_physSequence = structMap_PHYSICAL[0]
    # Validation raises ValueError (not assert, which is stripped under -O).
    if div_physSequence.attrib.get("TYPE") != "physSequence":
        raise ValueError('Expected first div of structMap[@TYPE="PHYSICAL"] to have TYPE="physSequence"')

    # TODO sort by ORDER?
    for page in div_physSequence:
        if page.attrib.get("TYPE") != "page":
            raise ValueError('Expected div TYPE="page" in physSequence')
        page_dict = {}
        page_dict["ppn"] = ppn
        page_dict["ID"] = page.attrib.get("ID")
        for fptr in page:
            if fptr.tag != "{http://www.loc.gov/METS/}fptr":
                raise ValueError('Expected mets:fptr inside page div')
            file_id = fptr.attrib.get("FILEID")
            if not file_id:
                raise ValueError('mets:fptr without FILEID')

            def get_mets_file(*, ID):
                # Returns None if no mets:file with that ID exists.
                if ID:
                    file_ = (mets.xpath(f'//mets:file[@ID="{ID}"]', namespaces=ns) or [None])[0]
                    return file_

            file_ = get_mets_file(ID=file_id)
            fileGrp_USE = file_.getparent().attrib.get("USE")
            file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
            page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href

        def get_struct_log(*, to_phys):
            """
            Get the logical structMap elements that link to the given physical page.

            Keyword arguments:
            to_phys -- ID of the page, as per structMap[@TYPE="PHYSICAL"]
            """
            # This is all XLink, there might be a more generic way to traverse the links. However, currently,
            # it suffices to do this the old-fashioned way.
            sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns)

            targets = []
            for sm_link in sm_links:
                xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
                targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns))
            return targets

        struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))

        # In our documents, there are already links to parent elements, but we want to make
        # sure and add them.
        def get_struct_log_parents(div):
            # Yields every ancestor mets:div of `div`, nearest first.
            cursor = div
            while (cursor := cursor.getparent()).tag == f"{{{ns['mets']}}}div":
                yield cursor

        # Iterate over a snapshot: mutating a set while iterating it raises
        # RuntimeError. The snapshot suffices because get_struct_log_parents()
        # already yields the full ancestor chain of each original div.
        for struct_div in list(struct_divs):
            struct_divs.update(get_struct_log_parents(struct_div))

        for struct_div in struct_divs:
            type_ = struct_div.attrib.get("TYPE")
            if not type_:
                raise ValueError('Logical structMap div without TYPE')
            page_dict[f"structmap_LOGICAL_TYPE_{type_}"] = 1

        result.append(page_dict)

    return result
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
||||||
|
@ -286,6 +367,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
||||||
csvwriter = csv.writer(csvfile)
|
csvwriter = csv.writer(csvfile)
|
||||||
mods_info = []
|
mods_info = []
|
||||||
|
page_info = []
|
||||||
logger.info('Processing METS files')
|
logger.info('Processing METS files')
|
||||||
for mets_file in tqdm(mets_files_real, leave=False):
|
for mets_file in tqdm(mets_files_real, leave=False):
|
||||||
try:
|
try:
|
||||||
|
@ -298,6 +380,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
|
|
||||||
# MODS
|
# MODS
|
||||||
d = flatten(mods_to_dict(mods, raise_errors=True))
|
d = flatten(mods_to_dict(mods, raise_errors=True))
|
||||||
|
|
||||||
# METS
|
# METS
|
||||||
d_mets = flatten(mets_to_dict(mets, raise_errors=True))
|
d_mets = flatten(mets_to_dict(mets, raise_errors=True))
|
||||||
for k, v in d_mets.items():
|
for k, v in d_mets.items():
|
||||||
|
@ -305,7 +388,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
# "meta"
|
# "meta"
|
||||||
d['mets_file'] = mets_file
|
d['mets_file'] = mets_file
|
||||||
|
|
||||||
|
# METS - per-page
|
||||||
|
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
||||||
|
|
||||||
mods_info.append(d)
|
mods_info.append(d)
|
||||||
|
page_info.extend(page_info_doc)
|
||||||
|
|
||||||
if caught_warnings:
|
if caught_warnings:
|
||||||
# PyCharm thinks caught_warnings is not Iterable:
|
# PyCharm thinks caught_warnings is not Iterable:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue