mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-01 01:19:52 +02:00
Some checks failed
continuous-integration/drone/push Build is failing
We were using the file group name BEST for what Kitodo seems to call MAX by convention, so we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want, and we replace it with our own full-size IIIF URLs. Fixes GH-43.
171 lines
4.8 KiB
Python
Executable file
171 lines
4.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Get OCR results as an OCR-D workspace for a given PPN"""
|
|
import os
|
|
import requests
|
|
import sys
|
|
import lxml.etree as ET
|
|
import re
|
|
import subprocess
|
|
import click
|
|
from copy import deepcopy
|
|
|
|
|
|
# Namespace prefix -> namespace URI, used for all queries on the METS document
XMLNS = dict(
    mets='http://www.loc.gov/METS/',
    xlink='http://www.w3.org/1999/xlink',
)

# OAI-PMH endpoint and the template that turns a PPN into an OAI identifier
API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'


# Register the prefixes so they are used when the METS is serialized
for ns_prefix, ns_uri in XMLNS.items():
    ET.register_namespace(ns_prefix, ns_uri)
|
|
|
|
|
|
def oai_mets(ppn):
    """Retrieve METS metadata for a given PPN.

    Sends a ``GetRecord`` request with ``metadataPrefix=mets`` to ``API_URL``
    and extracts the ``mets:mets`` element from the response.

    :param ppn: PPN of the document; it is inserted into ``IDENTIFIER_TEMPLATE``.
    :return: an ElementTree whose root is the ``mets:mets`` element.
    :raises requests.HTTPError: if the server responds with an HTTP error status.
    :raises ValueError: if the response contains no ``mets:mets`` element.
    """
    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn
    }

    # The session is only needed for this single request, so close it again
    with requests.Session() as s:
        r = s.get(API_URL, params=params)
        # Fail here instead of trying to parse an HTTP error page as XML
        r.raise_for_status()
        mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")

    # A well-formed response may still carry no METS (e.g. an error record
    # for an unknown identifier); report that instead of returning an
    # ElementTree without a root element.
    if mets is None:
        raise ValueError(f"No METS record found for {ppn}")

    return ET.ElementTree(mets)
|
|
|
|
|
|
def iiif_url_for_dms_url(dms_url, ppn, size, format):
    """
    Build the IIIF image URL that corresponds to a dms image URL.

    Rewriting the URL is a hack to get at the IIIF images; it is kept in this
    one function so nothing else needs to know about it. A ValueError is raised
    if the dms URL does not contain the PPN or does not end in
    "/<page number>.jpg" below "/dms/".
    """
    # Only URLs that mention the PPN are even considered for matching
    page_match = None
    if ppn in dms_url:
        page_match = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url)
    if page_match is None:
        raise ValueError(f"Unexpected URL {dms_url}")

    iiif_identifier = '{}-{}'.format(ppn, page_match.group(1))
    iiif_quality = 'default'

    return f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'
|
|
|
|
|
|
def remove_file_grp(mets, use):
    """Remove all file groups with the given USE, and the fptr elements referencing their files."""
    def _detach(element):
        element.getparent().remove(element)

    file_grp_xpath = f'//mets:fileGrp[@USE="{use}"]'

    # First remove the fptr references to every file of the file group(s) ...
    file_ids = mets.xpath(f'{file_grp_xpath}/mets:file/@ID', namespaces=XMLNS)
    for file_id in file_ids:
        fptrs = mets.xpath(f'//mets:fptr[@FILEID="{file_id}"]', namespaces=XMLNS)
        for fptr in fptrs:
            _detach(fptr)

    # ... then the file group(s) themselves
    for file_grp in mets.xpath(file_grp_xpath, namespaces=XMLNS):
        _detach(file_grp)
|
|
|
|
|
|
def mime_type_for_format(format_):
    """Return the MIME type for an image format given as a file extension.

    :param format_: ``'tif'`` or ``'jpg'``.
    :return: the MIME type string for that format.
    :raises ValueError: if the format is not one of the supported ones.
    """
    # The registered media type for JPEG is image/jpeg; "image/jpg" does not
    # exist and is not recognised by tools that compare MIME types strictly.
    mime_types = {
        'tif': 'image/tiff',
        'jpg': 'image/jpeg',
    }
    try:
        return mime_types[format_]
    except (KeyError, TypeError):
        # TypeError: unhashable argument - still just an unsupported format
        raise ValueError(f"Unsupported image format {format_!r}") from None
|
|
|
|
|
|
def make_workspace(ppn, workspace):
    """Create a workspace directory with a mets.xml for the given PPN.

    Creates the directory ``workspace``, changes the current working
    directory into it, retrieves the METS for ``ppn``, replaces the file
    groups PRESENTATION, LOCAL and MAX by a new file group MAX (a copy of
    DEFAULT whose files point to full-size IIIF URLs) and writes the result
    to ``mets.xml``.

    :param ppn: PPN of the document.
    :param workspace: path of the directory to create; must not exist yet
        (``os.mkdir`` is used).
    :raises ValueError: if the METS has no DEFAULT file group or no fileSec,
        if a file has no FLocat, or if a file URL cannot be rewritten to IIIF.
    """
    # Make workspace directory
    os.mkdir(workspace)
    os.chdir(workspace)

    mets = oai_mets(ppn)

    # XXX
    # Delete PRESENTATION + LOCAL file groups
    # (local file:/// or file:/ links, not handled well by "ocrd workspace")
    remove_file_grp(mets, 'PRESENTATION')
    remove_file_grp(mets, 'LOCAL')

    # Delete MAX file group - we assume that, if it exists, it is not as
    # we expect it, e.g. IIIF full URLs
    remove_file_grp(mets, 'MAX')

    # Duplicate DEFAULT file group into a new file group MAX
    #
    # Note on the paths below: find()/findall() on an ElementTree with an
    # absolute path ("//...") are evaluated relative to the root element
    # anyway (current lxml versions emit a FutureWarning for this), so the
    # explicit ".//" form is used.
    format_ = 'tif'
    mime_type = mime_type_for_format(format_)
    href_attr = f"{{{XMLNS['xlink']}}}href"

    file_grp_default = mets.find('.//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
    if file_grp_default is None:
        raise ValueError(f"METS for {ppn} has no DEFAULT file group")
    file_grp_max = deepcopy(file_grp_default)

    file_grp_max.attrib['USE'] = 'MAX'
    for f in file_grp_max.findall('./mets:file', namespaces=XMLNS):
        old_id = f.attrib['ID']
        new_id = old_id.replace('DEFAULT', 'MAX')
        f.attrib['ID'] = new_id
        f.attrib['MIMETYPE'] = mime_type

        # Every fptr that references the DEFAULT file gets a sibling that
        # references the new MAX file (appended to the same parent)
        for fptr in mets.findall(f'.//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
            new_fptr = deepcopy(fptr)
            new_fptr.attrib['FILEID'] = new_id
            fptr.getparent().append(new_fptr)

        # XXX Need to fumble around with the URL for now
        flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
        if flocat is None:
            raise ValueError(f"File {old_id} has no FLocat")
        old_url = flocat.attrib[href_attr]
        url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_)
        flocat.attrib[href_attr] = url_iiif_full

    file_sec = mets.find('.//mets:fileSec', namespaces=XMLNS)
    if file_sec is None:
        raise ValueError(f"METS for {ppn} has no fileSec")
    file_sec.append(file_grp_max)

    # Write mets.xml
    mets.write('mets.xml', pretty_print=True)

    # TODO
    # Validate workspace
    #ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"

    # XXX
    # Fix 'file:/' URLs to 'file:///'
    #sed -i 's#file:/\([^/]\)#file:///\1#' mets.xml

    # Patch mets.xml to use our NFS mount
    #sed -i 's#file:///goobi/tiff001/sbb/#file:///srv/digisam_images/sbb/#g' mets.xml
|
|
|
|
|
|
def validate_ppn(ctx, param, value):
    """click callback for the PPN argument: pass the value through if it starts with "PPN", else reject it"""
    if value.startswith('PPN'):
        return value
    raise click.BadParameter('PPN must be in format PPNxxxxxxxx')
|
|
|
|
|
|
@click.command()
@click.argument('ppn', callback=validate_ppn)
def ppn2ocr(ppn):
    """
    Get METS with best images for a document PPN

    For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:

    \b
    ppn2ocr PPN699887615
    ls PPN699887615
    """
    # Directory of this script; only the currently disabled run-docker-hub
    # call below makes use of it.
    script_dir = os.path.dirname(sys.argv[0])
    self_dir = os.path.realpath(script_dir)

    # The workspace directory is named after the PPN
    make_workspace(ppn, workspace=ppn)

    # XXX
    # subprocess.run([
    #     os.path.join(self_dir, 'run-docker-hub'),
    #     '-I', 'MAX',
    #     '--skip-validation'
    # ])
|
|
|
|
|
|
# Run the ppn2ocr click command when this file is executed as a script
if __name__ == '__main__':
    ppn2ocr()
|