mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-01 01:19:52 +02:00
ppn2ocr expects the PPN to be in the PPNxxxxxxx format, i.e. including the leading 'PPN' string. Validate the argument accordingly.
166 lines
4.6 KiB
Python
Executable file
166 lines
4.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Get OCR results as a OCR-D workspace for a given PPN"""
|
|
import os
|
|
import requests
|
|
import sys
|
|
import lxml.etree as ET
|
|
import re
|
|
import subprocess
|
|
import click
|
|
from copy import deepcopy
|
|
|
|
|
|
# XML namespaces of the METS documents handled here. The keys are the
# prefixes used in the XPath/ElementPath queries below (passed as
# `namespaces=XMLNS`).
XMLNS = {
    'mets': 'http://www.loc.gov/METS/',
    'xlink': 'http://www.w3.org/1999/xlink'
}
# OAI endpoint that oai_mets() sends its GetRecord request to
API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
# Template for the OAI record identifier; %s is replaced by the PPN
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'

# Register the prefixes with lxml (module-level side effect at import time);
# presumably so that serialized XML uses mets:/xlink: instead of generated
# prefixes.
for prefix, uri in XMLNS.items():
    ET.register_namespace(prefix, uri)
|
|
|
|
|
|
def oai_mets(ppn):
    """Retrieve METS metadata for a given PPN.

    Sends a ``GetRecord`` request with ``metadataPrefix=mets`` to API_URL and
    extracts the first ``mets:mets`` element from the response.

    Args:
        ppn: Identifier inserted into IDENTIFIER_TEMPLATE, e.g. "PPN699887615".

    Returns:
        An lxml ElementTree whose root is the ``mets:mets`` element.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the response does not contain a METS record.
    """
    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn
    }

    # The response body is read completely by get(), so the session (and its
    # pooled connection) can be closed right away.
    with requests.Session() as s:
        r = s.get(API_URL, params=params)
    # Fail here instead of trying to parse an HTTP error page as XML
    r.raise_for_status()

    mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
    if mets is None:
        # Without this check ET.ElementTree(None) would yield a tree without
        # root and the failure would only show up later in the caller.
        raise ValueError(f"No METS record found for {ppn}")

    return ET.ElementTree(mets)
|
|
|
|
|
|
def iiif_url_for_dms_url(dms_url, ppn, size, format):
    """
    Construct an IIIF URL from a dms URL.

    This function exists to encapsulate the hack of rewriting the URL to get IIIF.

    The dms URL must contain the PPN and end in "/<digits>.jpg" somewhere
    below "/dms/"; the digits are taken over as the page number. Any other
    URL raises a ValueError.
    """
    # Only look for the page number if the URL belongs to this PPN at all
    page_match = None
    if ppn in dms_url:
        page_match = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url)
    if page_match is None:
        raise ValueError(f"Unexpected URL {dms_url}")

    page_num = page_match.group(1)
    iiif_identifier = f'{ppn}-{page_num}'
    iiif_quality = 'default'
    return f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'
|
|
|
|
|
|
def remove_file_grp(mets, use):
    """Remove all mets:fileGrp elements with the given USE from the METS.

    The mets:fptr elements that reference a file of such a group (via
    FILEID) are removed as well. The document is modified in place.
    """
    group_query = f'//mets:fileGrp[@USE="{use}"]'

    # Drop the references first, while the file IDs can still be looked up
    file_ids = mets.xpath(f'{group_query}/mets:file/@ID', namespaces=XMLNS)
    for file_id in file_ids:
        fptrs = mets.xpath(f'//mets:fptr[@FILEID="{file_id}"]', namespaces=XMLNS)
        for fptr in fptrs:
            fptr.getparent().remove(fptr)

    # Then drop the file groups themselves
    for file_grp in mets.xpath(group_query, namespaces=XMLNS):
        file_grp.getparent().remove(file_grp)
|
|
|
|
|
|
def mime_type_for_format(format_):
    """Return the MIME type for an image format given as file extension.

    Args:
        format_: Either 'tif' or 'jpg'.

    Returns:
        The media type string, e.g. 'image/tiff'.

    Raises:
        ValueError: If the format is not supported.
    """
    mime_types = {
        'tif': 'image/tiff',
        # The registered media type for JPEG is image/jpeg, not image/jpg
        'jpg': 'image/jpeg',
    }
    try:
        return mime_types[format_]
    except (KeyError, TypeError):
        # TypeError: unhashable argument; report it like any unknown format
        raise ValueError(f"Unsupported image format {format_!r}") from None
|
|
|
|
|
|
def make_workspace(ppn, workspace):
    """Create a workspace directory with a mets.xml for the given PPN.

    The METS is fetched with oai_mets() and modified before it is written:
    the PRESENTATION and LOCAL file groups are removed and the DEFAULT file
    group is duplicated into a new file group BEST whose files point to the
    full-size IIIF TIFF images.

    Side effects: creates the directory `workspace`, changes the current
    working directory of the process to it and writes `mets.xml` there.

    Args:
        ppn: The PPN of the document, e.g. "PPN699887615".
        workspace: Path of the directory to create. It must not exist yet.

    Raises:
        FileExistsError: If `workspace` already exists (from os.mkdir).
        ValueError: If the METS has no DEFAULT file group, a file has no
            FLocat, or a file URL is not of the expected form.
    """
    # Fetch and rewrite the METS completely in memory before touching the
    # file system. Otherwise a failed download or an unexpected METS would
    # leave an empty directory behind and the retry would die in os.mkdir().
    mets = oai_mets(ppn)

    # XXX
    # Delete PRESENTATION + LOCAL file groups
    # (local file:/// or file:/ links, not handled well by "ocrd workspace")
    remove_file_grp(mets, 'PRESENTATION')
    remove_file_grp(mets, 'LOCAL')

    # Duplicate DEFAULT file group into a new file group BEST
    format_ = 'tif'
    mime_type = mime_type_for_format(format_)
    href = f"{{{XMLNS['xlink']}}}href"

    # Note: the searches use './/' instead of '//'. On an ElementTree lxml
    # rewrites a leading '/' to './' anyway (with a FutureWarning), so the
    # result is the same.
    file_grp_default = mets.find('.//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
    if file_grp_default is None:
        raise ValueError(f"No DEFAULT file group in METS for {ppn}")
    file_grp_best = deepcopy(file_grp_default)

    file_grp_best.attrib['USE'] = 'BEST'
    for f in file_grp_best.findall('./mets:file', namespaces=XMLNS):
        old_id = f.attrib['ID']
        new_id = old_id.replace('DEFAULT', 'BEST')
        f.attrib['ID'] = new_id
        f.attrib['MIMETYPE'] = mime_type

        # Every place that references the DEFAULT file gets a sibling
        # reference to the new BEST file. findall() returns a list, so
        # appending while looping is safe.
        for fptr in mets.findall(f'.//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
            new_fptr = deepcopy(fptr)
            new_fptr.attrib['FILEID'] = new_id
            fptr.getparent().append(new_fptr)

        # XXX Need to fumble around with the URL for now
        flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
        if flocat is None:
            raise ValueError(f"File {old_id} has no FLocat")
        flocat.attrib[href] = iiif_url_for_dms_url(flocat.attrib[href], ppn, 'full', format_)

    mets.find('.//mets:fileSec', namespaces=XMLNS).append(file_grp_best)

    # Make workspace directory
    os.mkdir(workspace)
    os.chdir(workspace)

    # Write mets.xml
    mets.write('mets.xml', pretty_print=True)

    # TODO
    # Validate workspace
    #ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"

    # XXX
    # Fix 'file:/' URLs to 'file:///'
    #sed -i 's#file:/\([^/]\)#file:///\1#' mets.xml

    # Patch mets.xml to use our NFS mount
    #sed -i 's#file:///goobi/tiff001/sbb/#file:///srv/digisam_images/sbb/#g' mets.xml
|
|
|
|
|
|
def validate_ppn(ctx, param, value):
    """Click callback: accept only values that start with 'PPN'.

    Returns the value unchanged if it is acceptable, otherwise raises
    click.BadParameter. `ctx` and `param` are part of the callback
    signature and are not used.
    """
    if value.startswith('PPN'):
        return value
    raise click.BadParameter('PPN must be in format PPNxxxxxxxx')
|
|
|
|
# Command line entry point: takes one PPN argument, which validate_ppn()
# checks before this function runs.
@click.command()
@click.argument('ppn', callback=validate_ppn)
def ppn2ocr(ppn):
    """
    Get METS with best images for a document PPN

    For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:

    \b
    ppn2ocr PPN699887615
    ls PPN699887615
    """
    # NOTE(review): the docstring above is presumably what click prints as
    # --help text, so it is left as is.

    # Directory this script lives in; only referenced by the disabled
    # run-docker-hub call below.
    self_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
    # The workspace directory is named after the PPN
    make_workspace(ppn, ppn)

    # XXX
    # subprocess.run([
    #     os.path.join(self_dir, 'run-docker-hub'),
    #     '-I', 'BEST',
    #     '--skip-validation'
    # ])
|
|
|
|
|
|
# Run the click command only when executed as a script, not on import
if __name__ == '__main__':
    ppn2ocr()
|