| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
										 |  |  | #!/usr/bin/env python3 | 
					
						
"""Get OCR results as an OCR-D workspace for a given PPN"""
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import requests | 
					
						
							|  |  |  | import sys | 
					
						
							|  |  |  | import lxml.etree as ET | 
					
						
							|  |  |  | import re | 
					
						
							| 
									
										
										
										
											2020-06-02 19:25:31 +02:00
										 |  |  | import subprocess | 
					
						
							| 
									
										
										
										
											2020-06-03 15:53:45 +02:00
										 |  |  | import click | 
					
						
							| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
										 |  |  | from copy import deepcopy | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# XML namespaces of the METS documents handled here. The mapping is also
# passed as the `namespaces` argument of the XPath/find queries in this file.
XMLNS = {
    'mets': 'http://www.loc.gov/METS/',
    'xlink': 'http://www.w3.org/1999/xlink'
}
# Endpoint that oai_mets() sends its GetRecord request to.
API_URL = 'https://oai.sbb.berlin'
# Record identifier template; %s is replaced by the PPN.
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'


# Register the prefixes above for their namespace URIs.
for prefix, uri in XMLNS.items():
    ET.register_namespace(prefix, uri)
					
						
							| 
									
										
										
										
											2020-03-09 18:27:29 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
										 |  |  | def oai_mets(ppn): | 
					
						
							|  |  |  |     """Retrieve METS metadata for a given PPN.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     params = { | 
					
						
							|  |  |  |         'verb': 'GetRecord', | 
					
						
							|  |  |  |         'metadataPrefix': 'mets', | 
					
						
							|  |  |  |         'identifier': IDENTIFIER_TEMPLATE % ppn | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     s = requests.Session() | 
					
						
							| 
									
										
										
										
											2020-06-23 15:15:21 +02:00
										 |  |  |     r = s.get(API_URL, params=params) | 
					
						
							| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
										 |  |  |     mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets") | 
					
						
							|  |  |  |     mets = ET.ElementTree(mets) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return mets | 
					
						
							| 
									
										
										
										
											2020-03-09 18:27:29 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-07 18:12:49 +02:00
										 |  |  | def iiif_url_for_sbb_url(sbb_url, ppn, size, format): | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     Construct an IIIF URL from a dms or an IIIF URL. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     This function exists as long as dms URL exist (or as long as we may need to | 
					
						
							|  |  |  |     rewrite IIIF URLs for a different size) | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     if "/dms/" in sbb_url: | 
					
						
							|  |  |  |         return iiif_url_for_dms_url(sbb_url, ppn, size, format) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return iiif_url_for_iiif_url(sbb_url, ppn, size, format) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-23 19:03:58 +02:00
										 |  |  | def iiif_url_for_dms_url(dms_url, ppn, size, format): | 
					
						
							| 
									
										
										
										
											2020-06-02 19:18:06 +02:00
										 |  |  |     """ | 
					
						
							|  |  |  |     Construct an IIIF URL from a dms URL. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-03 10:11:23 +02:00
										 |  |  |     This function exists to encapsulate the hack of rewriting the URL to get IIIF. | 
					
						
							| 
									
										
										
										
											2020-06-02 19:18:06 +02:00
										 |  |  |     """ | 
					
						
							|  |  |  |     if ppn not in dms_url: | 
					
						
							|  |  |  |         raise ValueError(f"Unexpected URL {dms_url}") | 
					
						
							|  |  |  |     m = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url) | 
					
						
							|  |  |  |     if m: | 
					
						
							|  |  |  |         page_num = m.group(1) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         raise ValueError(f"Unexpected URL {dms_url}") | 
					
						
							|  |  |  |     iiif_identifier = f'{ppn}-{page_num}' | 
					
						
							| 
									
										
										
										
											2020-06-23 19:03:58 +02:00
										 |  |  |     iiif_quality = 'default' | 
					
						
							|  |  |  |     iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}' | 
					
						
							| 
									
										
										
										
											2020-06-02 19:18:06 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return iiif_url | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-07 18:12:49 +02:00
										 |  |  | def iiif_url_for_iiif_url(iiif_url, ppn, size, format): | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     Construct an IIIF URL from an already existing IIIF URL. | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     if ppn not in iiif_url: | 
					
						
							|  |  |  |         raise ValueError(f"Unexpected URL {iiif_url}") | 
					
						
							|  |  |  |     m = re.search(rf'/dc/{ppn}-([0-9]+)/', iiif_url) | 
					
						
							|  |  |  |     if m: | 
					
						
							|  |  |  |         page_num = m.group(1) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         raise ValueError(f"Unexpected URL {iiif_url}") | 
					
						
							|  |  |  |     iiif_identifier = f'{ppn}-{page_num}' | 
					
						
							|  |  |  |     iiif_quality = 'default' | 
					
						
							|  |  |  |     iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return iiif_url | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-03 10:10:54 +02:00
										 |  |  | def remove_file_grp(mets, use): | 
					
						
							|  |  |  |     for bad_fileid in mets.xpath(f'//mets:fileGrp[@USE="{use}"]/mets:file/@ID', namespaces=XMLNS): | 
					
						
							|  |  |  |         for bad in mets.xpath(f'//mets:fptr[@FILEID="{bad_fileid}"]', namespaces=XMLNS): | 
					
						
							|  |  |  |             bad.getparent().remove(bad) | 
					
						
							|  |  |  |     for bad in mets.xpath(f'//mets:fileGrp[@USE="{use}"]', namespaces=XMLNS): | 
					
						
							|  |  |  |         bad.getparent().remove(bad) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-23 19:03:58 +02:00
										 |  |  | def mime_type_for_format(format_): | 
					
						
							|  |  |  |     if format_ == 'tif': | 
					
						
							|  |  |  |         mime_type = 'image/tiff' | 
					
						
							|  |  |  |     elif format_ == 'jpg': | 
					
						
							|  |  |  |         mime_type = 'image/jpg' | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         raise ValueError() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return mime_type | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-15 17:12:11 +02:00
										 |  |  | def prune_file_grps(mets): | 
					
						
							| 
									
										
										
										
											2021-09-15 17:26:14 +02:00
										 |  |  |     """ | 
					
						
							|  |  |  |     Prune unwanted file groups | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     We only want to keep the MAX file group (we created it ourselves) and | 
					
						
							|  |  |  |     possibly ABBYY full texts in FULLTEXT. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     For the PRESENTATION + LOCAL file groups we definitely want to delete | 
					
						
							|  |  |  |     because they contain local file:/// or file:/ links, which are not handled | 
					
						
							|  |  |  |     well by "ocrd workspace". They are not explicitly mentioned, as we | 
					
						
							|  |  |  |     only keep a whitelist. | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     wanted_file_grps = ["MAX", "FULLTEXT"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS): | 
					
						
							| 
									
										
										
										
											2021-09-15 17:37:31 +02:00
										 |  |  |         if u not in wanted_file_grps: | 
					
						
							|  |  |  |             remove_file_grp(mets, u) | 
					
						
							| 
									
										
										
										
											2021-09-15 17:12:11 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-02 19:06:31 +02:00
def make_workspace(ppn, workspace):
    """
    Create a workspace directory containing a mets.xml for the given PPN.

    The METS is fetched via oai_mets(). Its DEFAULT file group is copied into
    a new file group MAX whose files point to full-size IIIF URLs, all file
    groups that prune_file_grps() does not keep are removed, and the result is
    written to mets.xml.

    Side effects: creates the directory `workspace` and changes the current
    working directory of the process into it; mets.xml is written there.

    Raises ValueError if the METS has no DEFAULT file group.
    """
    # Make workspace directory
    os.mkdir(workspace)
    os.chdir(workspace)

    mets = oai_mets(ppn)

    # Delete MAX file group - we assume that, if it exists, it is not as
    # we expect it, e.g. IIIF full URLs
    remove_file_grp(mets, 'MAX')

    # Duplicate DEFAULT file group into a new file group MAX
    format_ = 'tif'
    file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)

    if file_grp_default is None:
        raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")

    # Work on a copy so that the DEFAULT group itself stays untouched here.
    file_grp_best = deepcopy(file_grp_default)

    file_grp_best.attrib['USE'] = 'MAX'
    for f in file_grp_best.findall('./mets:file', namespaces=XMLNS):
        # Derive the new file ID from the old one by replacing DEFAULT with MAX.
        old_id = f.attrib['ID']
        new_id = re.sub('DEFAULT', 'MAX', old_id)
        f.attrib['ID'] = new_id
        f.attrib['MIMETYPE'] = mime_type_for_format(format_)

        # Every pointer to the old file gets a sibling pointing to the new file.
        for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
            new_fptr = deepcopy(fptr)
            new_fptr.attrib['FILEID'] = new_id
            fptr.getparent().append(new_fptr)

        # XXX Need to fumble around with the URL for now
        # Replace the file location by the full-size IIIF URL in the wanted format.
        flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
        old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"]
        url_iiif_full = iiif_url_for_sbb_url(old_url, ppn, 'full', format_)
        flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full

    # Add the new MAX file group to the file section.
    mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)

    # Remove the file groups we do not want; this includes DEFAULT.
    prune_file_grps(mets)

    # Write mets.xml (into the workspace directory, which is the cwd by now)
    mets.write('mets.xml', pretty_print=True)

    # TODO
    # Validate workspace
    #ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 16:53:20 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-03 16:59:50 +02:00
										 |  |  | def validate_ppn(ctx, param, value): | 
					
						
							| 
									
										
										
										
											2020-09-03 17:18:42 +02:00
										 |  |  |     """Validate a PPN argument""" | 
					
						
							| 
									
										
										
										
											2020-09-03 16:59:50 +02:00
										 |  |  |     if not value.startswith('PPN'): | 
					
						
							|  |  |  |         raise click.BadParameter('PPN must be in format PPNxxxxxxxx') | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return value | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-03 17:18:42 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-03 15:53:45 +02:00
										 |  |  | @click.command() | 
					
						
							| 
									
										
										
										
											2020-09-03 16:59:50 +02:00
										 |  |  | @click.argument('ppn', callback=validate_ppn) | 
					
						
							| 
									
										
										
										
											2020-06-03 15:53:45 +02:00
										 |  |  | def ppn2ocr(ppn): | 
					
						
							| 
									
										
										
										
											2020-09-03 17:18:42 +02:00
										 |  |  |     """ | 
					
						
							|  |  |  |     Get METS with best images for a document PPN | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     \b | 
					
						
							|  |  |  |     ppn2ocr PPN699887615 | 
					
						
							|  |  |  |     ls PPN699887615 | 
					
						
							|  |  |  |     """ | 
					
						
							|  |  |  |     self_dir = os.path.realpath(os.path.dirname(sys.argv[0])) | 
					
						
							|  |  |  |     make_workspace(ppn, ppn) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # XXX | 
					
						
							|  |  |  |     #  subprocess.run([ | 
					
						
							|  |  |  |     #     os.path.join(self_dir, 'run-docker-hub'), | 
					
						
							| 
									
										
										
										
											2021-02-18 16:34:25 +01:00
										 |  |  |     #     '-I', 'MAX', | 
					
						
							| 
									
										
										
										
											2020-09-03 17:18:42 +02:00
										 |  |  |     #     '--skip-validation' | 
					
						
							|  |  |  |     #  ]) | 
					
						
							| 
									
										
										
										
											2020-06-03 15:53:45 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the click command only when executed as a script, not on import.
if __name__ == '__main__':
    ppn2ocr()