mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-13 06:59:53 +02:00
Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow
This commit is contained in:
commit
4e37a52899
5 changed files with 12 additions and 56 deletions
|
@ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection
|
||||||
|
|
||||||
|
|
||||||
# Install requirements
|
# Install requirements
|
||||||
|
# Using pipdeptree here to get more info than from pip3 check
|
||||||
COPY requirements.txt /tmp/
|
COPY requirements.txt /tmp/
|
||||||
RUN pip3 install --no-cache-dir --upgrade pip && \
|
RUN pip3 install --no-cache-dir --upgrade pip && \
|
||||||
pip3 install --no-cache-dir -r /tmp/requirements.txt && \
|
pip3 install --no-cache-dir -r /tmp/requirements.txt && \
|
||||||
pip3 check
|
pip3 install --no-cache-dir pipdeptree && \
|
||||||
|
pipdeptree -w fail
|
||||||
|
|
||||||
|
|
||||||
COPY my_ocrd_workflow /usr/bin/
|
COPY my_ocrd_workflow /usr/bin/
|
||||||
COPY xsd/* /usr/share/xml/
|
COPY xsd/* /usr/share/xml/
|
||||||
|
|
|
@ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it;
|
||||||
the results are viewable as explained above.
|
the results are viewable as explained above.
|
||||||
|
|
||||||
ppn2ocr requires a working Docker setup and properly set up environment
|
ppn2ocr requires a working Docker setup and properly set up environment
|
||||||
variables for the proxy configuration. At SBB, this means:
|
variables for the proxy configuration. At SBB, this means following
|
||||||
~~~
|
`howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md`
|
||||||
export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/
|
(in qurator's mono-repo).
|
||||||
export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY
|
|
||||||
export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de
|
|
||||||
~~~
|
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
|
|
||||||
|
|
||||||
import PIL.Image
|
|
||||||
import sys
|
|
||||||
from ocrd.workspace import Workspace
|
|
||||||
from ocrd.resolver import Resolver
|
|
||||||
from lxml import etree as ET
|
|
||||||
|
|
||||||
|
|
||||||
def alto_namespace(tree):
|
|
||||||
"""
|
|
||||||
Return the ALTO namespace used in the given ElementTree.
|
|
||||||
|
|
||||||
This relies on the assumption that, in any given ALTO file, the root
|
|
||||||
element has the local name "alto". We do not check if the file uses any
|
|
||||||
valid ALTO namespace.
|
|
||||||
"""
|
|
||||||
root_name = ET.QName(tree.getroot().tag)
|
|
||||||
if root_name.localname == 'alto':
|
|
||||||
return root_name.namespace
|
|
||||||
else:
|
|
||||||
raise ValueError('Not an ALTO tree')
|
|
||||||
|
|
||||||
|
|
||||||
exit_code = 0
|
|
||||||
workspace = Workspace(Resolver(), '.')
|
|
||||||
|
|
||||||
for n, page_id in enumerate(workspace.mets.physical_pages):
|
|
||||||
gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
|
|
||||||
img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
|
|
||||||
gt_file = workspace.download_file(gt_file)
|
|
||||||
img_file = workspace.download_file(img_file)
|
|
||||||
|
|
||||||
tree = ET.parse(gt_file.local_filename)
|
|
||||||
nsmap = {'alto': alto_namespace(tree)}
|
|
||||||
alto_page = tree.find('//alto:Page', namespaces=nsmap) # one page assumed
|
|
||||||
gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])
|
|
||||||
|
|
||||||
img_size = PIL.Image.open(img_file.local_filename).size
|
|
||||||
|
|
||||||
if gt_size == img_size:
|
|
||||||
print('OK', page_id)
|
|
||||||
else:
|
|
||||||
print('ERR', page_id, gt_size, '!=', img_size)
|
|
||||||
exit_code = 1
|
|
||||||
|
|
||||||
sys.exit(exit_code)
|
|
|
@ -134,6 +134,8 @@ page_upgrade_to_2019() {
|
||||||
|
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
do_validate
|
||||||
|
|
||||||
do_binarization
|
do_binarization
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
5
ppn2ocr
5
ppn2ocr
|
@ -74,9 +74,10 @@ def make_workspace(ppn, workspace):
|
||||||
mets = oai_mets(ppn)
|
mets = oai_mets(ppn)
|
||||||
|
|
||||||
# XXX
|
# XXX
|
||||||
# Delete PRESENTATION file group
|
# Delete PRESENTATION + LOCAL file groups
|
||||||
# (local file:/// links, not handled well by "ocrd workspace")
|
# (local file:/// or file:/ links, not handled well by "ocrd workspace")
|
||||||
remove_file_grp(mets, 'PRESENTATION')
|
remove_file_grp(mets, 'PRESENTATION')
|
||||||
|
remove_file_grp(mets, 'LOCAL')
|
||||||
|
|
||||||
|
|
||||||
# Duplicate DEFAULT file group into a new file group BEST
|
# Duplicate DEFAULT file group into a new file group BEST
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue