Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow

pull/27/head
Gerber, Mike 4 years ago
commit 4e37a52899

@ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection
# Install requirements
# Using pipdeptree here to get more info than from pip3 check
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --upgrade pip && \
pip3 install --no-cache-dir -r /tmp/requirements.txt && \
pip3 check
pip3 install --no-cache-dir pipdeptree && \
pipdeptree -w fail
COPY my_ocrd_workflow /usr/bin/
COPY xsd/* /usr/share/xml/

@ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it;
the results are viewable as explained above.
ppn2ocr requires a working Docker setup and properly set up environment
variables for the proxy configuration. At SBB, this means:
~~~
export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/
export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY
export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de
~~~
variables for the proxy configuration. At SBB, this means following
`howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md`
(in qurator's mono-repo).

@ -1,47 +0,0 @@
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
import PIL.Image
import sys
from ocrd.workspace import Workspace
from ocrd.resolver import Resolver
from lxml import etree as ET
def alto_namespace(tree):
    """
    Determine which ALTO namespace the given ElementTree uses.

    The root element is expected to have the local name "alto"; whatever
    namespace that element lives in is returned as-is. No check is made that
    it is one of the valid ALTO namespaces.

    Raises ValueError if the root element's local name is not "alto".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    # Guard clause: anything whose root is not <alto> is rejected outright.
    if qualified_root.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return qualified_root.namespace
# For every physical page of the workspace in the current directory, compare
# the WIDTH/HEIGHT declared on the ALTO <Page> element (file group FULLTEXT)
# with the pixel size of the corresponding image (file group BEST).
# Prints one "OK"/"ERR" line per page; the process exits with 1 if any page
# mismatched, otherwise with 0.
exit_code = 0
workspace = Workspace(Resolver(), '.')
for page_id in workspace.mets.physical_pages:
    # Only the first matching file per group and page is considered.
    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
    gt_file = workspace.download_file(gt_file)
    img_file = workspace.download_file(img_file)

    tree = ET.parse(gt_file.local_filename)
    nsmap = {'alto': alto_namespace(tree)}
    alto_page = tree.find('//alto:Page', namespaces=nsmap)  # one page assumed
    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])

    # Use the image as a context manager so its file handle is released
    # right away instead of staying open for every page of the document.
    with PIL.Image.open(img_file.local_filename) as img:
        img_size = img.size

    if gt_size == img_size:
        print('OK', page_id)
    else:
        print('ERR', page_id, gt_size, '!=', img_size)
        exit_code = 1

sys.exit(exit_code)

@ -134,6 +134,8 @@ page_upgrade_to_2019() {
main() {
do_validate
do_binarization
do_validate

@ -74,9 +74,10 @@ def make_workspace(ppn, workspace):
mets = oai_mets(ppn)
# XXX
# Delete PRESENTATION file group
# (local file:/// links, not handled well by "ocrd workspace")
# Delete PRESENTATION + LOCAL file groups
# (local file:/// or file:/ links, not handled well by "ocrd workspace")
remove_file_grp(mets, 'PRESENTATION')
remove_file_grp(mets, 'LOCAL')
# Duplicate DEFAULT file group into a new file group BEST

Loading…
Cancel
Save