mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-09 14:49:53 +02:00
Merge branch 'master' of https://github.com/mikegerber/my_ocrd_workflow
This commit is contained in:
commit
af4557fb33
5 changed files with 64 additions and 15 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
47
check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
Normal file
47
check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
|
||||
|
||||
import PIL.Image
|
||||
import sys
|
||||
from ocrd.workspace import Workspace
|
||||
from ocrd.resolver import Resolver
|
||||
from lxml import etree as ET
|
||||
|
||||
|
||||
def alto_namespace(tree):
    """
    Determine which ALTO namespace the given ElementTree uses.

    The namespace is read from the root element, which is assumed to have the
    local name "alto" in any given ALTO file. Whether that namespace is a
    valid ALTO namespace is not checked.

    Raises ValueError if the root element's local name is not "alto".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    # Guard clause: anything whose root is not <alto> is rejected outright.
    if qualified_root.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return qualified_root.namespace
|
||||
|
||||
|
||||
# Overall result of the check; set to 1 as soon as any page fails.
exit_code = 0
# Workspace rooted at '.', i.e. the script is run from inside the workspace.
workspace = Workspace(Resolver(), '.')

for page_id in workspace.mets.physical_pages:
    # Use the first file registered for this page in each file group.
    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
    gt_file = workspace.download_file(gt_file)
    img_file = workspace.download_file(img_file)

    tree = ET.parse(gt_file.local_filename)
    nsmap = {'alto': alto_namespace(tree)}
    # Use an explicit relative path: lxml rewrites an absolute '//...' path on
    # an ElementTree to './/...' and emits a FutureWarning while doing so.
    alto_page = tree.find('.//alto:Page', namespaces=nsmap)  # one page assumed
    if alto_page is None:
        # Report a missing Page element as a failed check for this page
        # instead of dying with an AttributeError on None.
        print('ERR', page_id, 'no ALTO Page element found')
        exit_code = 1
        continue
    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])

    # Close the image file as soon as its size has been read.
    with PIL.Image.open(img_file.local_filename) as img:
        img_size = img.size

    if gt_size == img_size:
        print('OK', page_id)
    else:
        print('ERR', page_id, gt_size, '!=', img_size)
        exit_code = 1

# Non-zero exit status signals that at least one page did not match.
sys.exit(exit_code)
|
17
ppn2ocr
17
ppn2ocr
|
@ -22,19 +22,6 @@ for prefix, uri in XMLNS.items():
|
|||
ET.register_namespace(prefix, uri)
|
||||
|
||||
|
||||
# XXX
|
||||
# show_help() {
|
||||
# cat <<-EOH
|
||||
# Usage: $0 PPN77164308X
|
||||
#
|
||||
# Get OCR results as an OCR-D workspace for a given PPN
|
||||
#
|
||||
# Options:
|
||||
# --help Show this message and exit.
|
||||
# EOH
|
||||
# }
|
||||
|
||||
|
||||
def oai_mets(ppn):
|
||||
"""Retrieve METS metadata for a given PPN."""
|
||||
|
||||
|
@ -45,7 +32,9 @@ def oai_mets(ppn):
|
|||
}
|
||||
|
||||
s = requests.Session()
|
||||
r = s.get(API_URL, params=params)
|
||||
# FIXME oai.sbb.berlin fails certificate check
|
||||
#r = s.get(API_URL, params=params)
|
||||
r = s.get(API_URL, params=params, verify=False)
|
||||
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
|
||||
mets = ET.ElementTree(mets)
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
requests
|
||||
lxml
|
||||
click
|
||||
ocrd
|
||||
|
|
11
run
11
run
|
@ -9,13 +9,22 @@ if echo "$DOCKER_IMAGE" | grep -q "/"; then
|
|||
docker pull "$DOCKER_IMAGE"
|
||||
fi
|
||||
|
||||
|
||||
# XXX Work around podman vs docker uid behaviour
|
||||
if docker -v 2>&1 | grep -q podman; then
|
||||
user="0:0"
|
||||
else
|
||||
user="`id -u`:`id -g`"
|
||||
fi
|
||||
|
||||
|
||||
# The container currently needs to run privileged to allow it to read from e.g.
|
||||
# /home on SELinux secured systems such as Fedora. We might want to use udica
|
||||
# instead in the future.
|
||||
|
||||
docker run --privileged=true --rm -t \
|
||||
\
|
||||
--user `id -u`:`id -g` \
|
||||
--user $user \
|
||||
--mount type=bind,src="$(pwd)",target=/data \
|
||||
\
|
||||
-e LOG_LEVEL=$LOG_LEVEL \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue