Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow

Gerber, Mike · 5 years ago · commit 4e37a52899 · pull/27/head

@@ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection
 # Install requirements
+# Using pipdeptree here to get more info than from pip3 check
 COPY requirements.txt /tmp/
 RUN pip3 install --no-cache-dir --upgrade pip && \
     pip3 install --no-cache-dir -r /tmp/requirements.txt && \
-    pip3 check
+    pip3 install --no-cache-dir pipdeptree && \
+    pipdeptree -w fail
 COPY my_ocrd_workflow /usr/bin/
 COPY xsd/* /usr/share/xml/
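The switch from `pip3 check` to `pipdeptree -w fail` aborts the image build when the installed packages have conflicting or circular dependencies, and prints the full dependency tree for debugging instead of pip's flat error list. A minimal sketch (not part of this repo) of running the same check outside the Docker build:

~~~
# Sketch only: run the same dependency check as the Dockerfile's
# "pipdeptree -w fail" step against the current Python environment.
import subprocess
import sys

# pipdeptree prints the dependency tree; with --warn fail it exits non-zero
# when it finds conflicting or circular dependencies.
result = subprocess.run(["pipdeptree", "--warn", "fail"])
if result.returncode != 0:
    sys.exit("pipdeptree found conflicting or circular dependencies")
~~~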

@@ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it;
 the results are viewable as explained above.
 
 ppn2ocr requires a working Docker setup and properly set up environment
-variables for the proxy configuration. At SBB, this means:
-~~~
-export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/
-export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY
-export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de
-~~~
+variables for the proxy configuration. At SBB, this means following
+`howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md`
+(in qurator's mono-repo).

@@ -1,47 +0,0 @@
-"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
-import PIL.Image
-import sys
-
-from ocrd.workspace import Workspace
-from ocrd.resolver import Resolver
-from lxml import etree as ET
-
-
-def alto_namespace(tree):
-    """
-    Return the ALTO namespace used in the given ElementTree.
-
-    This relies on the assumption that, in any given ALTO file, the root
-    element has the local name "alto". We do not check if the file uses any
-    valid ALTO namespace.
-    """
-    root_name = ET.QName(tree.getroot().tag)
-    if root_name.localname == 'alto':
-        return root_name.namespace
-    else:
-        raise ValueError('Not an ALTO tree')
-
-
-exit_code = 0
-workspace = Workspace(Resolver(), '.')
-for n, page_id in enumerate(workspace.mets.physical_pages):
-    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
-    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
-    gt_file = workspace.download_file(gt_file)
-    img_file = workspace.download_file(img_file)
-
-    tree = ET.parse(gt_file.local_filename)
-    nsmap = {'alto': alto_namespace(tree)}
-    alto_page = tree.find('//alto:Page', namespaces=nsmap)  # one page assumed
-    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])
-    img_size = PIL.Image.open(img_file.local_filename).size
-
-    if gt_size == img_size:
-        print('OK', page_id)
-    else:
-        print('ERR', page_id, gt_size, '!=', img_size)
-        exit_code = 1
-
-sys.exit(exit_code)
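For context on the file deleted above: `alto_namespace()` reads the namespace off the ALTO root element, and the per-page loop compares the `WIDTH`/`HEIGHT` declared on `alto:Page` with the pixel size of the matching image from the `BEST` file group. A small, self-contained illustration (the sample document and the ALTO v3 namespace URI are assumptions, not taken from the commit):

~~~
# Illustration only: how the namespace detection and the Page size lookup
# from the deleted script behave on a minimal ALTO document.
from io import BytesIO
from lxml import etree as ET

sample = b'''<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#">
  <Layout><Page WIDTH="800" HEIGHT="1200"/></Layout>
</alto>'''

tree = ET.parse(BytesIO(sample))
ns = ET.QName(tree.getroot().tag).namespace   # what alto_namespace() returns
nsmap = {'alto': ns}

page = tree.find('//alto:Page', namespaces=nsmap)
size = int(page.attrib['WIDTH']), int(page.attrib['HEIGHT'])
print(size)   # (800, 1200), compared against PIL.Image.open(...).size
~~~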

@@ -134,6 +134,8 @@ page_upgrade_to_2019() {
 main() {
+  do_validate
   do_binarization
   do_validate

@@ -74,9 +74,10 @@ def make_workspace(ppn, workspace):
     mets = oai_mets(ppn)
 
     # XXX
-    # Delete PRESENTATION file group
-    # (local file:/// links, not handled well by "ocrd workspace")
+    # Delete PRESENTATION + LOCAL file groups
+    # (local file:/// or file:/ links, not handled well by "ocrd workspace")
     remove_file_grp(mets, 'PRESENTATION')
+    remove_file_grp(mets, 'LOCAL')
 
     # Duplicate DEFAULT file group into a new file group BEST
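`remove_file_grp()` is defined elsewhere in ppn2ocr and its body is not part of this diff. As a hypothetical sketch of what deleting a METS file group involves, assuming `mets` is an lxml tree of the OAI METS record: drop the `mets:fileGrp` with the given `USE` attribute plus any `mets:fptr` entries in the structMap that point at its files, so `ocrd workspace` never sees the problematic `file:/` links.

~~~
# Hypothetical sketch (the real remove_file_grp in ppn2ocr may differ):
# remove a mets:fileGrp by USE attribute and the structMap file pointers
# that reference its files.
from lxml import etree as ET

NS = {'mets': 'http://www.loc.gov/METS/'}

def remove_file_grp(mets, use):
    for file_grp in mets.findall(f'.//mets:fileGrp[@USE="{use}"]', namespaces=NS):
        for file_ in file_grp.findall('mets:file', namespaces=NS):
            file_id = file_.get('ID')
            # drop structMap pointers to this file
            for fptr in mets.findall(f'.//mets:fptr[@FILEID="{file_id}"]', namespaces=NS):
                fptr.getparent().remove(fptr)
        file_grp.getparent().remove(file_grp)
~~~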
