ocrd-galley/ppn2ocr
Gerber, Mike 691be243f6
Some checks failed
continuous-integration/drone/push Build is failing
Use MAX file group name instead of BEST
We were using the file group name BEST for what Kitodo seems to call
MAX by convention. So we use MAX now.

Currently, we work under the assumption that, if MAX exists in the METS
retrieved by OAI-PMH, it's not what we want and we replace it with our
own full-size IIIF URLs.

Fixes GH-43.
2021-02-18 16:34:25 +01:00

171 lines
4.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""Get OCR results as a OCR-D workspace for a given PPN"""
import os
import requests
import sys
import lxml.etree as ET
import re
import subprocess
import click
from copy import deepcopy
# XML namespaces used in the METS documents, keyed by their conventional prefix
XMLNS = {
    'mets': 'http://www.loc.gov/METS/',
    'xlink': 'http://www.w3.org/1999/xlink'
}
# OAI endpoint that is queried for the METS records
API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
# OAI identifier of a record; %s is filled with the PPN
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'
# Register the prefixes with lxml (presumably so that serialized XML uses
# mets:/xlink: instead of auto-generated prefixes -- TODO confirm)
for prefix, uri in XMLNS.items():
    ET.register_namespace(prefix, uri)
def oai_mets(ppn):
    """Retrieve METS metadata for a given PPN.

    Sends a ``GetRecord`` request with metadata prefix ``mets`` to ``API_URL``
    and extracts the first ``mets:mets`` element from the response.

    Args:
        ppn: Document identifier, inserted into ``IDENTIFIER_TEMPLATE``.

    Returns:
        An lxml ``ElementTree`` rooted at the ``mets:mets`` element.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the response contains no ``mets:mets`` element.
    """
    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn,
    }
    # The session is only needed for this single request, so release its
    # connections right away. The body is already fully read at this point.
    with requests.Session() as s:
        r = s.get(API_URL, params=params)
    r.raise_for_status()

    mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
    if mets is None:
        # Without this check we would return a tree without a root element
        # and fail much later with a confusing error.
        raise ValueError(f"No METS record found for {ppn}")
    return ET.ElementTree(mets)
def iiif_url_for_dms_url(dms_url, ppn, size, format):
    """
    Build the IIIF image URL that corresponds to a dms image URL.

    The page number is taken from the trailing ``<digits>.jpg`` path component
    of the dms URL and combined with the PPN into the IIIF identifier. The
    function exists to keep this URL rewriting hack in one place.

    Raises ValueError if the URL does not contain the PPN or does not end in
    a numeric ``.jpg`` file name below ``/dms/``.
    """
    # Only look for the page number if the URL belongs to this PPN at all
    page_match = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url) if ppn in dms_url else None
    if page_match is None:
        raise ValueError(f"Unexpected URL {dms_url}")

    identifier = f'{ppn}-{page_match.group(1)}'
    quality = 'default'
    return (
        'https://content.staatsbibliothek-berlin.de/dc/'
        f'{identifier}/full/{size}/0/{quality}.{format}'
    )
def remove_file_grp(mets, use):
    """Drop every file group with the given USE from the METS tree, in place.

    All mets:fptr elements that point at a file of such a group are removed
    first, then the mets:fileGrp elements themselves.
    """
    file_ids = mets.xpath(f'//mets:fileGrp[@USE="{use}"]/mets:file/@ID', namespaces=XMLNS)
    for file_id in file_ids:
        dangling_fptrs = mets.xpath(f'//mets:fptr[@FILEID="{file_id}"]', namespaces=XMLNS)
        for fptr in dangling_fptrs:
            fptr.getparent().remove(fptr)

    file_grps = mets.xpath(f'//mets:fileGrp[@USE="{use}"]', namespaces=XMLNS)
    for file_grp in file_grps:
        file_grp.getparent().remove(file_grp)
def mime_type_for_format(format_):
    """Return the MIME type for an image file format.

    Args:
        format_: File extension without the dot, either ``'tif'`` or ``'jpg'``.

    Returns:
        The media type string, e.g. ``'image/tiff'``.

    Raises:
        ValueError: If the format is not one of the supported ones.
    """
    # 'image/jpeg' is the registered media type for JPEG; 'image/jpg' is not
    mime_types = {
        'tif': 'image/tiff',
        'jpg': 'image/jpeg',
    }
    try:
        return mime_types[format_]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are just as unsupported
        raise ValueError(f"Unsupported image format {format_!r}") from None
def make_workspace(ppn, workspace):
    """Create a workspace directory containing a mets.xml for a PPN.

    Creates the directory ``workspace``, changes the current working directory
    into it, retrieves the METS for ``ppn`` and writes it as ``mets.xml``.
    Before writing, the PRESENTATION, LOCAL and MAX file groups are removed
    and a new MAX file group is added: a copy of the DEFAULT file group whose
    files point to full-size IIIF TIFF URLs.

    Args:
        ppn: Document identifier, e.g. ``PPN699887615``.
        workspace: Path of the directory to create; must not exist yet.

    Raises:
        FileExistsError: If ``workspace`` already exists.
        ValueError: If the METS has no DEFAULT file group, or if one of its
            file URLs cannot be rewritten to an IIIF URL.
    """
    # Make workspace directory
    os.mkdir(workspace)
    os.chdir(workspace)

    mets = oai_mets(ppn)

    # XXX
    # Delete PRESENTATION + LOCAL file groups
    # (local file:/// or file:/ links, not handled well by "ocrd workspace")
    remove_file_grp(mets, 'PRESENTATION')
    remove_file_grp(mets, 'LOCAL')

    # Delete MAX file group - we assume that, if it exists, it is not as
    # we expect it, e.g. IIIF full URLs
    remove_file_grp(mets, 'MAX')

    # Duplicate DEFAULT file group into a new file group MAX
    format_ = 'tif'
    mime_type = mime_type_for_format(format_)
    href = f"{{{XMLNS['xlink']}}}href"

    file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
    if file_grp_default is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError(f"No DEFAULT file group in METS for {ppn}")

    file_grp_max = deepcopy(file_grp_default)
    file_grp_max.attrib['USE'] = 'MAX'

    for f in file_grp_max.findall('./mets:file', namespaces=XMLNS):
        old_id = f.attrib['ID']
        new_id = re.sub('DEFAULT', 'MAX', old_id)
        f.attrib['ID'] = new_id
        f.attrib['MIMETYPE'] = mime_type

        # Every place that references the DEFAULT file gets an additional
        # pointer to the corresponding MAX file
        for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
            new_fptr = deepcopy(fptr)
            new_fptr.attrib['FILEID'] = new_id
            fptr.getparent().append(new_fptr)

        # XXX Need to fumble around with the URL for now
        flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
        old_url = flocat.attrib[href]
        flocat.attrib[href] = iiif_url_for_dms_url(old_url, ppn, 'full', format_)

    mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_max)

    # Write mets.xml
    mets.write('mets.xml', pretty_print=True)

    # TODO
    # Validate workspace
    #ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"

    # XXX
    # Fix 'file:/' URLs to 'file:///'
    #sed -i 's#file:/\([^/]\)#file:///\1#' mets.xml
    # Patch mets.xml to use our NFS mount
    #sed -i 's#file:///goobi/tiff001/sbb/#file:///srv/digisam_images/sbb/#g' mets.xml
def validate_ppn(ctx, param, value):
    """Click callback that accepts a PPN argument only if it starts with 'PPN'.

    Returns the value unchanged; raises click.BadParameter otherwise. The
    ctx and param arguments are not used.
    """
    if value.startswith('PPN'):
        return value
    raise click.BadParameter('PPN must be in format PPNxxxxxxxx')
@click.command()
@click.argument('ppn', callback=validate_ppn)
def ppn2ocr(ppn):
    """
    Get METS with best images for a document PPN

    For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:

    \b
    ppn2ocr PPN699887615
    ls PPN699887615
    """
    # NOTE: the docstring above is the command's help text; the \b line keeps
    # click from re-wrapping the example that follows it.

    # Directory this script lives in; only needed by the disabled call below
    script_path = sys.argv[0]
    self_dir = os.path.realpath(os.path.dirname(script_path))

    # The workspace directory is named after the PPN
    make_workspace(ppn, ppn)

    # XXX
    # subprocess.run([
    #     os.path.join(self_dir, 'run-docker-hub'),
    #     '-I', 'MAX',
    #     '--skip-validation'
    # ])
# Run the click command only when executed as a script, not on import
if __name__ == '__main__':
    ppn2ocr()