mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-01 01:19:52 +02:00
Merge branch 'master' of https://github.com/mikegerber/my_ocrd_workflow
This commit is contained in:
commit
032f58e4b8
3 changed files with 21 additions and 8 deletions
|
@ -2,6 +2,7 @@ FROM ubuntu:18.04
|
|||
|
||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||
|
||||
ENV OCRD_OLENA_VERSION 1.1.10
|
||||
ENV TESSDATA_BEST_VERSION 4.0.0
|
||||
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
||||
|
||||
|
@ -47,7 +48,7 @@ RUN curl -sSL -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena
|
|||
apt-get -f install -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN pip3 install --no-cache-dir --upgrade pip && \
|
||||
curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v1.1.4.tar.gz && \
|
||||
curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
|
||||
mkdir ocrd_olena && \
|
||||
tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
|
||||
cd ocrd_olena && \
|
||||
|
|
24
ppn2ocr
24
ppn2ocr
|
@ -32,16 +32,14 @@ def oai_mets(ppn):
|
|||
}
|
||||
|
||||
s = requests.Session()
|
||||
# FIXME oai.sbb.berlin fails certificate check
|
||||
#r = s.get(API_URL, params=params)
|
||||
r = s.get(API_URL, params=params, verify=False)
|
||||
r = s.get(API_URL, params=params)
|
||||
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
|
||||
mets = ET.ElementTree(mets)
|
||||
|
||||
return mets
|
||||
|
||||
|
||||
def iiif_url_for_dms_url(dms_url, ppn, size):
|
||||
def iiif_url_for_dms_url(dms_url, ppn, size, format):
|
||||
"""
|
||||
Construct an IIIF URL from a dms URL.
|
||||
|
||||
|
@ -55,7 +53,8 @@ def iiif_url_for_dms_url(dms_url, ppn, size):
|
|||
else:
|
||||
raise ValueError(f"Unexpected URL {dms_url}")
|
||||
iiif_identifier = f'{ppn}-{page_num}'
|
||||
iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/default.jpg'
|
||||
iiif_quality = 'default'
|
||||
iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'
|
||||
|
||||
return iiif_url
|
||||
|
||||
|
@ -68,6 +67,17 @@ def remove_file_grp(mets, use):
|
|||
bad.getparent().remove(bad)
|
||||
|
||||
|
||||
def mime_type_for_format(format_):
    """
    Return the MIME type for an image file format.

    `format_` is the short format name/file extension, either 'tif' or 'jpg'.

    Raises ValueError (naming the offending value) for any other format.
    """
    # 'image/jpeg' is the registered media type for JPEG images; 'image/jpg'
    # is not a registered type and is rejected by strict consumers.
    mime_types = {
        'tif': 'image/tiff',
        'jpg': 'image/jpeg',
    }
    try:
        return mime_types[format_]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, so every unsupported value
        # still surfaces as ValueError, as before.
        raise ValueError(f"Unsupported image format {format_!r}") from None
|
||||
|
||||
|
||||
def make_workspace(ppn, workspace):
|
||||
# Make workspace directory
|
||||
os.mkdir(workspace)
|
||||
|
@ -83,6 +93,7 @@ def make_workspace(ppn, workspace):
|
|||
|
||||
|
||||
# Duplicate DEFAULT file group into a new file group BEST
|
||||
format_ = 'tif'
|
||||
file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
|
||||
file_grp_best = deepcopy(file_grp_default)
|
||||
|
||||
|
@ -91,6 +102,7 @@ def make_workspace(ppn, workspace):
|
|||
old_id = f.attrib['ID']
|
||||
new_id = re.sub('DEFAULT', 'BEST', old_id)
|
||||
f.attrib['ID'] = new_id
|
||||
f.attrib['MIMETYPE'] = mime_type_for_format(format_)
|
||||
|
||||
for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
|
||||
new_fptr = deepcopy(fptr)
|
||||
|
@ -100,7 +112,7 @@ def make_workspace(ppn, workspace):
|
|||
# XXX Need to fumble around with the URL for now
|
||||
flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
|
||||
old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"]
|
||||
url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full')
|
||||
url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_)
|
||||
flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full
|
||||
|
||||
mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)
|
||||
|
|
|
@ -6,6 +6,6 @@ ocrd_tesserocr >= 0.8.1
|
|||
|
||||
ocrd_calamari >= 0.0.6
|
||||
|
||||
https://github.com/qurator-spk/sbb_textline_detector/archive/8618be2.tar.gz
|
||||
https://github.com/qurator-spk/sbb_textline_detector/archive/4036e2a5.tar.gz
|
||||
|
||||
https://github.com/qurator-spk/dinglehopper/archive/745095e.tar.gz
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue