Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow

This commit is contained in:
Gerber, Mike 2020-06-23 15:17:03 +02:00
commit 4e37a52899
5 changed files with 12 additions and 56 deletions

View file

@ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection
# Install requirements
# Using pipdeptree here to get more info than from pip3 check
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --upgrade pip && \
pip3 install --no-cache-dir -r /tmp/requirements.txt && \
pip3 check
pip3 install --no-cache-dir pipdeptree && \
pipdeptree -w fail
COPY my_ocrd_workflow /usr/bin/
COPY xsd/* /usr/share/xml/

View file

@ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it;
the results are viewable as explained above.
ppn2ocr requires a working Docker setup and properly set up environment
variables for the proxy configuration. At SBB, this means:
~~~
export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/
export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY
export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de
~~~
variables for the proxy configuration. At SBB, this means following
`howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md`
(in qurator's mono-repo).

View file

@ -1,47 +0,0 @@
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
import PIL.Image
import sys
from ocrd.workspace import Workspace
from ocrd.resolver import Resolver
from lxml import etree as ET
def alto_namespace(tree):
    """
    Look up the XML namespace of the ALTO document held in an ElementTree.

    The only check made is that the root element's local name is "alto";
    the namespace URI is returned as found, without verifying that it is
    one of the valid ALTO namespaces.

    Raises ValueError if the root element is not named "alto".
    """
    root_qname = ET.QName(tree.getroot().tag)
    if root_qname.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return root_qname.namespace
# Compare, for every physical page of the workspace in the current directory,
# the WIDTH/HEIGHT of the ALTO <Page> in file group FULLTEXT with the pixel
# size of the image in file group BEST. Exit with status 1 if any page differs.
exit_code = 0
workspace = Workspace(Resolver(), '.')
for page_id in workspace.mets.physical_pages:
    # Only the first file found per group and page is checked.
    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
    gt_file = workspace.download_file(gt_file)
    img_file = workspace.download_file(img_file)

    tree = ET.parse(gt_file.local_filename)
    nsmap = {'alto': alto_namespace(tree)}
    # lxml rewrites a leading '//' to './/' on ElementTree.find() and emits a
    # FutureWarning; writing './/' directly gives the same match, no warning.
    alto_page = tree.find('.//alto:Page', namespaces=nsmap)  # one page assumed
    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])

    # Close the image as soon as its size is read so file handles do not
    # accumulate across the pages of a large workspace.
    with PIL.Image.open(img_file.local_filename) as img:
        img_size = img.size

    if gt_size == img_size:
        print('OK', page_id)
    else:
        print('ERR', page_id, gt_size, '!=', img_size)
        exit_code = 1

sys.exit(exit_code)

View file

@ -134,6 +134,8 @@ page_upgrade_to_2019() {
main() {
do_validate
do_binarization
do_validate

View file

@ -74,9 +74,10 @@ def make_workspace(ppn, workspace):
mets = oai_mets(ppn)
# XXX
# Delete PRESENTATION file group
# (local file:/// links, not handled well by "ocrd workspace")
# Delete PRESENTATION + LOCAL file groups
# (local file:/// or file:/ links, not handled well by "ocrd workspace")
remove_file_grp(mets, 'PRESENTATION')
remove_file_grp(mets, 'LOCAL')
# Duplicate DEFAULT file group into a new file group BEST