mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-09 22:59:53 +02:00
Merge branch 'master' of github.com:qurator-spk/ocrd-galley
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
commit
caa10531e0
2 changed files with 28 additions and 17 deletions
|
@ -2,7 +2,7 @@ ARG DRONE_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG EYNOLLAH_VERSION="0.0.6"
|
ARG EYNOLLAH_VERSION="0.0.8"
|
||||||
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
|
|
43
ppn2ocr
43
ppn2ocr
|
@ -11,10 +11,10 @@ from copy import deepcopy
|
||||||
|
|
||||||
|
|
||||||
XMLNS = {
|
XMLNS = {
|
||||||
'mets': 'http://www.loc.gov/METS/',
|
'mets': 'http://www.loc.gov/METS/',
|
||||||
'xlink': 'http://www.w3.org/1999/xlink'
|
'xlink': 'http://www.w3.org/1999/xlink'
|
||||||
}
|
}
|
||||||
API_URL = 'https://digital.staatsbibliothek-berlin.de/oai'
|
API_URL = 'https://oai.sbb.berlin'
|
||||||
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'
|
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'
|
||||||
|
|
||||||
|
|
||||||
|
@ -78,6 +78,25 @@ def mime_type_for_format(format_):
|
||||||
return mime_type
|
return mime_type
|
||||||
|
|
||||||
|
|
||||||
|
def prune_file_grps(mets):
|
||||||
|
"""
|
||||||
|
Prune unwanted file groups
|
||||||
|
|
||||||
|
We only want to keep the MAX file group (we created it ourselves) and
|
||||||
|
possibly ABBYY full texts in FULLTEXT.
|
||||||
|
|
||||||
|
We definitely want to delete the PRESENTATION + LOCAL file groups
|
||||||
|
because they contain local file:/// or file:/ links, which are not handled
|
||||||
|
well by "ocrd workspace". They are not listed by name here, as we
|
||||||
|
only keep a whitelist.
|
||||||
|
"""
|
||||||
|
wanted_file_grps = ["MAX", "FULLTEXT"]
|
||||||
|
|
||||||
|
for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS):
|
||||||
|
if u not in wanted_file_grps:
|
||||||
|
remove_file_grp(mets, u)
|
||||||
|
|
||||||
|
|
||||||
def make_workspace(ppn, workspace):
|
def make_workspace(ppn, workspace):
|
||||||
# Make workspace directory
|
# Make workspace directory
|
||||||
os.mkdir(workspace)
|
os.mkdir(workspace)
|
||||||
|
@ -85,11 +104,6 @@ def make_workspace(ppn, workspace):
|
||||||
|
|
||||||
mets = oai_mets(ppn)
|
mets = oai_mets(ppn)
|
||||||
|
|
||||||
# XXX
|
|
||||||
# Delete PRESENTATION + LOCAL file groups
|
|
||||||
# (local file:/// or file:/ links, not handled well by "ocrd workspace")
|
|
||||||
remove_file_grp(mets, 'PRESENTATION')
|
|
||||||
remove_file_grp(mets, 'LOCAL')
|
|
||||||
|
|
||||||
|
|
||||||
# Delete MAX file group - we assume that, if it exists, it is not as
|
# Delete MAX file group - we assume that, if it exists, it is not as
|
||||||
|
@ -101,7 +115,7 @@ def make_workspace(ppn, workspace):
|
||||||
file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
|
file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
|
||||||
|
|
||||||
if file_grp_default is None:
|
if file_grp_default is None:
|
||||||
raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")
|
raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")
|
||||||
|
|
||||||
file_grp_best = deepcopy(file_grp_default)
|
file_grp_best = deepcopy(file_grp_default)
|
||||||
|
|
||||||
|
@ -125,6 +139,10 @@ def make_workspace(ppn, workspace):
|
||||||
|
|
||||||
mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)
|
mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)
|
||||||
|
|
||||||
|
|
||||||
|
prune_file_grps(mets)
|
||||||
|
|
||||||
|
|
||||||
# Write mets.xml
|
# Write mets.xml
|
||||||
mets.write('mets.xml', pretty_print=True)
|
mets.write('mets.xml', pretty_print=True)
|
||||||
|
|
||||||
|
@ -132,13 +150,6 @@ def make_workspace(ppn, workspace):
|
||||||
# Validate workspace
|
# Validate workspace
|
||||||
#ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"
|
#ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"
|
||||||
|
|
||||||
# XXX
|
|
||||||
# Fix 'file:/' URLs to 'file:///'
|
|
||||||
#sed -i 's#file:/\([^/]\)#file:///\1#' mets.xml
|
|
||||||
|
|
||||||
# Patch mets.xml to use our NFS mount
|
|
||||||
#sed -i 's#file:///goobi/tiff001/sbb/#file:///srv/digisam_images/sbb/#g' mets.xml
|
|
||||||
|
|
||||||
|
|
||||||
def validate_ppn(ctx, param, value):
|
def validate_ppn(ctx, param, value):
|
||||||
"""Validate a PPN argument"""
|
"""Validate a PPN argument"""
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue