ocrd-galley/extra/ppn2ocr

#!/usr/bin/env python3
"""Get OCR results as a OCR-D workspace for a given PPN"""
import os
import requests
import sys
import lxml.etree as ET
import re
import subprocess
import click
from copy import deepcopy


# XML namespace URIs, keyed by the prefix used in the XPath expressions below.
XMLNS = dict(
    mets='http://www.loc.gov/METS/',
    xlink='http://www.w3.org/1999/xlink',
)
# Base URL the METS record is requested from.
API_URL = 'https://oai.sbb.berlin'
# %-style template turning a PPN into the record identifier sent to API_URL.
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'


# Register every prefix/URI pair from XMLNS with lxml's namespace registry.
for prefix in XMLNS:
    uri = XMLNS[prefix]
    ET.register_namespace(prefix, uri)


def oai_mets(ppn, timeout=60):
    """
    Retrieve METS metadata for a given PPN.

    Sends a GetRecord request with metadataPrefix "mets" to API_URL and
    extracts the first mets:mets element found in the response.

    :param ppn: PPN of the document; interpolated into IDENTIFIER_TEMPLATE.
    :param timeout: timeout in seconds handed to requests, so that an
        unresponsive server cannot block the script indefinitely.
    :return: an lxml ElementTree whose root is the mets:mets element.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the response contains no mets:mets element.
    """
    params = {
        'verb': 'GetRecord',
        'metadataPrefix': 'mets',
        'identifier': IDENTIFIER_TEMPLATE % ppn
    }

    # Use the session as a context manager so its connections are released.
    with requests.Session() as s:
        r = s.get(API_URL, params=params, timeout=timeout)
        # Fail early on HTTP errors instead of trying to parse an error page.
        r.raise_for_status()

    mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
    if mets is None:
        # Without this check an ElementTree without a root element would be
        # returned and only fail later, far away from the actual cause.
        raise ValueError(f"No METS record found in the response for {ppn}")

    return ET.ElementTree(mets)


def iiif_url_for_sbb_url(sbb_url, ppn, size, format):
    """
    Build an IIIF URL from either a dms URL or an existing IIIF URL.

    URLs containing "/dms/" are handed to iiif_url_for_dms_url(), all other
    URLs to iiif_url_for_iiif_url(). The remaining arguments are passed
    through unchanged.
    """
    rewrite = iiif_url_for_dms_url if "/dms/" in sbb_url else iiif_url_for_iiif_url
    return rewrite(sbb_url, ppn, size, format)


def iiif_url_for_dms_url(dms_url, ppn, size, format):
    """
    Rewrite a dms image URL into the corresponding IIIF URL.

    The page number is taken from the trailing "<digits>.jpg" path component
    of the dms URL. A ValueError is raised if the URL does not contain the
    PPN or does not end in such a component.
    """
    # Only look for the page number if the URL belongs to this PPN at all.
    page_match = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url) if ppn in dms_url else None
    if page_match is None:
        raise ValueError(f"Unexpected URL {dms_url}")

    identifier = f'{ppn}-{page_match.group(1)}'
    quality = 'default'
    return f'https://content.staatsbibliothek-berlin.de/dc/{identifier}/full/{size}/0/{quality}.{format}'


def iiif_url_for_iiif_url(iiif_url, ppn, size, format):
    """
    Construct an IIIF URL from an already existing IIIF URL.

    The page number is read from the "/dc/<ppn>-<digits>/" part of the given
    URL; size and format of the result are taken from the arguments.

    :param iiif_url: existing IIIF URL of a page image.
    :param ppn: PPN of the document; must be part of the URL.
    :param size: IIIF size parameter of the resulting URL, e.g. 'full'.
    :param format: file extension of the resulting URL, e.g. 'tif'.
    :return: the rewritten IIIF URL.
    :raises ValueError: if the URL does not contain "/dc/<ppn>-<digits>/".
    """
    if ppn not in iiif_url:
        raise ValueError(f"Unexpected URL {iiif_url}")
    # Escape the PPN: it is user input and must be matched literally, not
    # interpreted as part of the regular expression.
    m = re.search(rf'/dc/{re.escape(ppn)}-([0-9]+)/', iiif_url)
    if m is None:
        raise ValueError(f"Unexpected URL {iiif_url}")
    page_num = m.group(1)

    iiif_identifier = f'{ppn}-{page_num}'
    iiif_quality = 'default'
    return f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'


def remove_file_grp(mets, use):
    """
    Remove all file groups with the given USE and the fptrs pointing to them.

    Every mets:fptr whose FILEID refers to a file of a matching mets:fileGrp
    is removed, then the matching mets:fileGrp elements themselves.

    :param mets: the METS document (lxml tree) to modify in place.
    :param use: value of the USE attribute of the file groups to remove.
    """
    # The USE value is passed as an XPath variable instead of being spliced
    # into the expression, so values containing quotes cannot break it.
    file_grps = mets.xpath('//mets:fileGrp[@USE=$use]', namespaces=XMLNS, use=use)
    bad_fileids = set(
        mets.xpath('//mets:fileGrp[@USE=$use]/mets:file/@ID', namespaces=XMLNS, use=use)
    )

    # Walk over the fptrs once rather than running one whole-document query
    # per file ID.
    if bad_fileids:
        for fptr in mets.xpath('//mets:fptr[@FILEID]', namespaces=XMLNS):
            if fptr.get('FILEID') in bad_fileids:
                fptr.getparent().remove(fptr)

    for file_grp in file_grps:
        file_grp.getparent().remove(file_grp)


def mime_type_for_format(format_):
    """
    Return the MIME type for an image file extension.

    :param format_: file extension without the dot, either 'tif' or 'jpg'.
    :return: the corresponding MIME type string.
    :raises ValueError: for any other format.
    """
    mime_types = {
        'tif': 'image/tiff',
        # The registered media type for JPEG is image/jpeg, not image/jpg.
        'jpg': 'image/jpeg',
    }
    try:
        return mime_types[format_]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are unsupported too.
        raise ValueError(f"Unsupported image format {format_!r}") from None


def prune_file_grps(mets):
    """
    Drop every file group that is not on the keep-list.

    Only MAX (the group this script builds itself) and FULLTEXT (which may
    hold ABBYY full texts) are kept. All other groups are removed via
    remove_file_grp(). This in particular gets rid of PRESENTATION and LOCAL,
    whose local file:/// or file:/ links are not handled well by
    "ocrd workspace"; they need no explicit mention because anything not on
    the keep-list goes.
    """
    keep = ("MAX", "FULLTEXT")

    # Collect all USE values up front, then remove the unwanted ones.
    all_uses = mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS)
    for unwanted in (use for use in all_uses if use not in keep):
        remove_file_grp(mets, unwanted)


def make_workspace(ppn, workspace):
    """
    Create a workspace directory containing a mets.xml for the given PPN.

    The METS is fetched with oai_mets(). A new file group MAX is created as a
    copy of the DEFAULT file group, with its file URLs rewritten to full-size
    IIIF URLs, and an additional fptr is added next to every fptr of a
    DEFAULT file. After that, prune_file_grps() removes the unwanted file
    groups and the result is written to mets.xml.

    Side effect: the current working directory is changed to the workspace.

    :param ppn: PPN of the document.
    :param workspace: path of the directory to create.
    :raises FileExistsError: if the workspace directory already exists.
    :raises ValueError: if there is no DEFAULT file group, if a file has no
        FLocat, or if a file URL cannot be rewritten.
    """
    # Make workspace directory
    os.mkdir(workspace)
    os.chdir(workspace)

    mets = oai_mets(ppn)

    # Delete MAX file group - we assume that, if it exists, it is not as
    # we expect it, e.g. IIIF full URLs
    remove_file_grp(mets, 'MAX')

    # Duplicate DEFAULT file group into a new file group MAX
    #
    # The paths below are written relative ('.//') on purpose: on an
    # ElementTree, find()/findall() handle an absolute '//' path as if it
    # were relative to the root anyway and warn that this may change.
    format_ = 'tif'
    file_grp_default = mets.find('.//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)

    if file_grp_default is None:
        raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")

    file_grp_best = deepcopy(file_grp_default)
    file_grp_best.attrib['USE'] = 'MAX'

    href = f"{{{XMLNS['xlink']}}}href"
    mime_type = mime_type_for_format(format_)
    for f in file_grp_best.findall('./mets:file', namespaces=XMLNS):
        old_id = f.attrib['ID']
        new_id = old_id.replace('DEFAULT', 'MAX')
        f.attrib['ID'] = new_id
        f.attrib['MIMETYPE'] = mime_type

        # Every place that points to the DEFAULT file gets a sibling fptr
        # pointing to the new MAX file.
        for fptr in mets.findall(f'.//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
            new_fptr = deepcopy(fptr)
            new_fptr.attrib['FILEID'] = new_id
            fptr.getparent().append(new_fptr)

        # XXX Need to fumble around with the URL for now
        flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
        if flocat is None:
            raise ValueError(f"File {old_id} has no FLocat")
        flocat.attrib[href] = iiif_url_for_sbb_url(flocat.attrib[href], ppn, 'full', format_)

    mets.find('.//mets:fileSec', namespaces=XMLNS).append(file_grp_best)

    prune_file_grps(mets)

    # Write mets.xml
    mets.write('mets.xml', pretty_print=True)

    # TODO
    # Validate workspace
    #ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"


def validate_ppn(ctx, param, value):
    """
    Click callback checking the PPN argument.

    Returns the value unchanged if it starts with 'PPN', otherwise raises
    click.BadParameter.
    """
    if value.startswith('PPN'):
        return value
    raise click.BadParameter('PPN must be in format PPNxxxxxxxx')


# Command line entry point. The single positional argument is checked by
# validate_ppn before the function body runs. The docstring below doubles as
# the help text shown by click, so keep it user-facing.
@click.command()
@click.argument('ppn', callback=validate_ppn)
def ppn2ocr(ppn):
    """
    Get METS with best images for a document PPN

    For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:

    \b
    ppn2ocr PPN699887615
    ls PPN699887615
    """
    # Directory containing this script; currently only referenced by the
    # commented-out workflow call below.
    self_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
    # The workspace directory is named after the PPN itself.
    make_workspace(ppn, ppn)

    # XXX
    #  subprocess.run([
    #     os.path.join(self_dir, 'run-docker-hub'),
    #     '-I', 'MAX',
    #     '--skip-validation'
    #  ])


# Run the click command only when executed as a script, not when imported.
if __name__ == '__main__':
    ppn2ocr()
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`#!/usr/bin/env python3`
			`"""Get OCR results as a OCR-D workspace for a given PPN"""`
			`import os`
			`import requests`
			`import sys`
			`import lxml.etree as ET`
			`import re`
🚧 ppn2ocr: Actually run the workflow 2020-06-02 19:25:31 +02:00			`import subprocess`
💄 ppn2ocr: Add a proper CLI interface 2020-06-03 15:53:45 +02:00			`import click`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`from copy import deepcopy`


			`XMLNS = {`
🎨 ppn2ocr: Fix bad indentation 2021-09-15 17:37:31 +02:00			`'mets': 'http://www.loc.gov/METS/',`
			`'xlink': 'http://www.w3.org/1999/xlink'`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`}`
⚙️ ppn2ocr: Use new API_URL (https://oai.sbb.berlin) 2021-09-15 17:11:12 +02:00			`API_URL = 'https://oai.sbb.berlin'`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'`
✨ ppn2ocr: Use a better example document 2020-05-22 16:45:19 +02:00
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`for prefix, uri in XMLNS.items():`
			`ET.register_namespace(prefix, uri)`
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00

🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`def oai_mets(ppn):`
			`"""Retrieve METS metadata for a given PPN."""`

			`params = {`
			`'verb': 'GetRecord',`
			`'metadataPrefix': 'mets',`
			`'identifier': IDENTIFIER_TEMPLATE % ppn`
			`}`

			`s = requests.Session()`
🐛 ppn2ocr: Verify oai.sbb.berlin's certificate again Now that oai.sbb.berlin's certificate chain is fixed, remove the workaround again. Fixes GH#15. 2020-06-23 15:15:21 +02:00			`r = s.get(API_URL, params=params)`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")`
			`mets = ET.ElementTree(mets)`

			`return mets`
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00
🐛 ppn2ocr: Don't break now that we have IIIF URLs 2022-04-07 18:12:49 +02:00			`def iiif_url_for_sbb_url(sbb_url, ppn, size, format):`
			`"""`
			`Construct an IIIF URL from a dms or an IIIF URL.`

			`This function exists as long as dms URL exist (or as long as we may need to`
			`rewrite IIIF URLs for a different size)`
			`"""`
			`if "/dms/" in sbb_url:`
			`return iiif_url_for_dms_url(sbb_url, ppn, size, format)`
			`else:`
			`return iiif_url_for_iiif_url(sbb_url, ppn, size, format)`


✨ ppn2ocr: Support TIFF in the BEST group 2020-06-23 19:03:58 +02:00			`def iiif_url_for_dms_url(dms_url, ppn, size, format):`
🚧 ppn2ocr: Extract a function to contain the IIIF hack 2020-06-02 19:18:06 +02:00			`"""`
			`Construct an IIIF URL from a dms URL.`

🚧 ppn2ocr: s/contain/encapsulate 2020-06-03 10:11:23 +02:00			`This function exists to encapsulate the hack of rewriting the URL to get IIIF.`
🚧 ppn2ocr: Extract a function to contain the IIIF hack 2020-06-02 19:18:06 +02:00			`"""`
			`if ppn not in dms_url:`
			`raise ValueError(f"Unexpected URL {dms_url}")`
			`m = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url)`
			`if m:`
			`page_num = m.group(1)`
			`else:`
			`raise ValueError(f"Unexpected URL {dms_url}")`
			`iiif_identifier = f'{ppn}-{page_num}'`
✨ ppn2ocr: Support TIFF in the BEST group 2020-06-23 19:03:58 +02:00			`iiif_quality = 'default'`
			`iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'`
🚧 ppn2ocr: Extract a function to contain the IIIF hack 2020-06-02 19:18:06 +02:00
			`return iiif_url`


🐛 ppn2ocr: Don't break now that we have IIIF URLs 2022-04-07 18:12:49 +02:00			`def iiif_url_for_iiif_url(iiif_url, ppn, size, format):`
			`"""`
			`Construct an IIIF URL from an already existing IIIF URL.`
			`"""`
			`if ppn not in iiif_url:`
			`raise ValueError(f"Unexpected URL {iiif_url}")`
			`m = re.search(rf'/dc/{ppn}-([0-9]+)/', iiif_url)`
			`if m:`
			`page_num = m.group(1)`
			`else:`
			`raise ValueError(f"Unexpected URL {iiif_url}")`
			`iiif_identifier = f'{ppn}-{page_num}'`
			`iiif_quality = 'default'`
			`iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'`

			`return iiif_url`


🚧 ppn2ocr: Properly remove the PRESENTATION file group 2020-06-03 10:10:54 +02:00			`def remove_file_grp(mets, use):`
			`for bad_fileid in mets.xpath(f'//mets:fileGrp[@USE="{use}"]/mets:file/@ID', namespaces=XMLNS):`
			`for bad in mets.xpath(f'//mets:fptr[@FILEID="{bad_fileid}"]', namespaces=XMLNS):`
			`bad.getparent().remove(bad)`
			`for bad in mets.xpath(f'//mets:fileGrp[@USE="{use}"]', namespaces=XMLNS):`
			`bad.getparent().remove(bad)`


✨ ppn2ocr: Support TIFF in the BEST group 2020-06-23 19:03:58 +02:00			`def mime_type_for_format(format_):`
			`if format_ == 'tif':`
			`mime_type = 'image/tiff'`
			`elif format_ == 'jpg':`
			`mime_type = 'image/jpg'`
			`else:`
			`raise ValueError()`

			`return mime_type`


⚙️ ppn2ocr: Move pruning file groups into a function 2021-09-15 17:12:11 +02:00			`def prune_file_grps(mets):`
✨ ppn2ocr: Keep only wanted file groups 2021-09-15 17:26:14 +02:00			`"""`
			`Prune unwanted file groups`

			`We only want to keep the MAX file group (we created it ourselves) and`
			`possibly ABBYY full texts in FULLTEXT.`

			`For the PRESENTATION + LOCAL file groups we definitely want to delete`
			`because they contain local file:/// or file:/ links, which are not handled`
			`well by "ocrd workspace". They are not explicitly mentioned, as we`
			`only keep a whitelist.`
			`"""`
			`wanted_file_grps = ["MAX", "FULLTEXT"]`

			`for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS):`
🎨 ppn2ocr: Fix bad indentation 2021-09-15 17:37:31 +02:00			`if u not in wanted_file_grps:`
			`remove_file_grp(mets, u)`
⚙️ ppn2ocr: Move pruning file groups into a function 2021-09-15 17:12:11 +02:00

🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`def make_workspace(ppn, workspace):`
🎨 ppn2ocr: Extract a function to make a workspace 2020-05-22 16:53:20 +02:00			`# Make workspace directory`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`os.mkdir(workspace)`
			`os.chdir(workspace)`
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`mets = oai_mets(ppn)`
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00

✨ Use MAX file group name instead of BEST We were using the file group name BEST for what Kitodo seems to call MAX by convention. So we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want and we replace it with our own IIIF URLS with full size. Fixes GH-43. 2021-02-18 16:34:25 +01:00
			`# Delete MAX file group - we assume that, if it exists, it is not as`
			`# we expect it, e.g. IIIF full URLs`
			`remove_file_grp(mets, 'MAX')`

			`# Duplicate DEFAULT file group into a new file group MAX`
✨ ppn2ocr: Support TIFF in the BEST group 2020-06-23 19:03:58 +02:00			`format_ = 'tif'`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)`
🐛 ppn2ocr: Gracefully handle documents without DEFAULT, e.g. multi-volume works 2021-03-03 16:17:14 +01:00
			`if file_grp_default is None:`
🎨 ppn2ocr: Fix bad indentation 2021-09-15 17:37:31 +02:00			`raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")`
🐛 ppn2ocr: Gracefully handle documents without DEFAULT, e.g. multi-volume works 2021-03-03 16:17:14 +01:00
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`file_grp_best = deepcopy(file_grp_default)`

✨ Use MAX file group name instead of BEST We were using the file group name BEST for what Kitodo seems to call MAX by convention. So we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want and we replace it with our own IIIF URLS with full size. Fixes GH-43. 2021-02-18 16:34:25 +01:00			`file_grp_best.attrib['USE'] = 'MAX'`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`for f in file_grp_best.findall('./mets:file', namespaces=XMLNS):`
			`old_id = f.attrib['ID']`
✨ Use MAX file group name instead of BEST We were using the file group name BEST for what Kitodo seems to call MAX by convention. So we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want and we replace it with our own IIIF URLS with full size. Fixes GH-43. 2021-02-18 16:34:25 +01:00			`new_id = re.sub('DEFAULT', 'MAX', old_id)`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`f.attrib['ID'] = new_id`
✨ ppn2ocr: Support TIFF in the BEST group 2020-06-23 19:03:58 +02:00			`f.attrib['MIMETYPE'] = mime_type_for_format(format_)`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00
			`for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):`
			`new_fptr = deepcopy(fptr)`
			`new_fptr.attrib['FILEID'] = new_id`
			`fptr.getparent().append(new_fptr)`
🚧 WIP: Add script ppn2ocr to run a document by giving PPN 2020-03-09 18:27:29 +01:00
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`# XXX Need to fumble around with the URL for now`
			`flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")`
			`old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"]`
🐛 ppn2ocr: Don't break now that we have IIIF URLs 2022-04-07 18:12:49 +02:00			`url_iiif_full = iiif_url_for_sbb_url(old_url, ppn, 'full', format_)`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full`

			`mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)`

⚙️ ppn2ocr: Move pruning file groups into a function 2021-09-15 17:12:11 +02:00
			`prune_file_grps(mets)`


🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`# Write mets.xml`
			`mets.write('mets.xml', pretty_print=True)`

			`# TODO`
🎨 ppn2ocr: Extract a function to make a workspace 2020-05-22 16:53:20 +02:00			`# Validate workspace`
🚧 ppn2ocr: Convert to Python + fumble in IIIF URLs 2020-06-02 19:06:31 +02:00			`#ocrd workspace validate mets.xml \| grep -v "<notice>Won't download remote image"`

🎨 ppn2ocr: Extract a function to make a workspace 2020-05-22 16:53:20 +02:00
✨ Validate PPN argument ppn2ocr expects the PPN to be in the PPNxxxxxxx format, i.e. including the leading 'PPN' string. Validate the argument accordingly. 2020-09-03 16:59:50 +02:00			`def validate_ppn(ctx, param, value):`
🎨 ppn2ocr: Fix some whitespace code style issues 2020-09-03 17:18:42 +02:00			`"""Validate a PPN argument"""`
✨ Validate PPN argument ppn2ocr expects the PPN to be in the PPNxxxxxxx format, i.e. including the leading 'PPN' string. Validate the argument accordingly. 2020-09-03 16:59:50 +02:00			`if not value.startswith('PPN'):`
			`raise click.BadParameter('PPN must be in format PPNxxxxxxxx')`
			`else:`
			`return value`

🎨 ppn2ocr: Fix some whitespace code style issues 2020-09-03 17:18:42 +02:00
💄 ppn2ocr: Add a proper CLI interface 2020-06-03 15:53:45 +02:00			`@click.command()`
✨ Validate PPN argument ppn2ocr expects the PPN to be in the PPNxxxxxxx format, i.e. including the leading 'PPN' string. Validate the argument accordingly. 2020-09-03 16:59:50 +02:00			`@click.argument('ppn', callback=validate_ppn)`
💄 ppn2ocr: Add a proper CLI interface 2020-06-03 15:53:45 +02:00			`def ppn2ocr(ppn):`
🎨 ppn2ocr: Fix some whitespace code style issues 2020-09-03 17:18:42 +02:00			`"""`
			`Get METS with best images for a document PPN`

			`For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:`

			`\b`
			`ppn2ocr PPN699887615`
			`ls PPN699887615`
			`"""`
			`self_dir = os.path.realpath(os.path.dirname(sys.argv[0]))`
			`make_workspace(ppn, ppn)`

			`# XXX`
			`# subprocess.run([`
			`# os.path.join(self_dir, 'run-docker-hub'),`
✨ Use MAX file group name instead of BEST We were using the file group name BEST for what Kitodo seems to call MAX by convention. So we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want and we replace it with our own IIIF URLS with full size. Fixes GH-43. 2021-02-18 16:34:25 +01:00			`# '-I', 'MAX',`
🎨 ppn2ocr: Fix some whitespace code style issues 2020-09-03 17:18:42 +02:00			`# '--skip-validation'`
			`# ])`
💄 ppn2ocr: Add a proper CLI interface 2020-06-03 15:53:45 +02:00

			`if __name__ == '__main__':`
			`ppn2ocr()`