Gerber, Mike 4 years ago
commit af4557fb33

3
.gitignore vendored

@ -0,0 +1,3 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

@ -0,0 +1,47 @@
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
import PIL.Image
import sys
from ocrd.workspace import Workspace
from ocrd.resolver import Resolver
from lxml import etree as ET
def alto_namespace(tree):
    """
    Look up the ALTO namespace URI used by the given ElementTree.

    The root element is expected to carry the local name "alto". Its
    namespace is returned as found; there is no check that the file uses
    one of the valid ALTO namespaces.

    Raises ValueError if the root element's local name is not "alto".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    # Guard clause: anything whose root is not <alto> is rejected outright.
    if qualified_root.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return qualified_root.namespace
# Process exit status: stays 0 unless at least one page's ALTO dimensions
# differ from the dimensions of its image.
exit_code = 0
workspace = Workspace(Resolver(), '.')

for page_id in workspace.mets.physical_pages:
    # Use the first FULLTEXT (ALTO) file and the first BEST image of the page.
    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
    gt_file = workspace.download_file(gt_file)
    img_file = workspace.download_file(img_file)

    tree = ET.parse(gt_file.local_filename)
    nsmap = {'alto': alto_namespace(tree)}
    # Explicit relative path ('.//') so the search is unambiguously
    # "any descendant of the root" (ElementPath has no absolute paths).
    alto_page = tree.find('.//alto:Page', namespaces=nsmap)  # one page assumed
    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])

    # Only the size is needed; close the image file right away instead of
    # leaking one open handle per page.
    with PIL.Image.open(img_file.local_filename) as img:
        img_size = img.size

    if gt_size == img_size:
        print('OK', page_id)
    else:
        print('ERR', page_id, gt_size, '!=', img_size)
        exit_code = 1

sys.exit(exit_code)

@ -22,19 +22,6 @@ for prefix, uri in XMLNS.items():
ET.register_namespace(prefix, uri)
# XXX
# show_help() {
# cat <<-EOH
# Usage: $0 PPN77164308X
#
# Get OCR results as a OCR-D workspace for a given PPN
#
# Options:
# --help Show this message and exit.
# EOH
# }
def oai_mets(ppn):
"""Retrieve METS metadata for a given PPN."""
@ -45,7 +32,9 @@ def oai_mets(ppn):
}
s = requests.Session()
r = s.get(API_URL, params=params)
# FIXME: oai.sbb.berlin fails the certificate check, so TLS verification is disabled below (verify=False)
#r = s.get(API_URL, params=params)
r = s.get(API_URL, params=params, verify=False)
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
mets = ET.ElementTree(mets)

@ -1,3 +1,4 @@
requests
lxml
click
ocrd

11
run

@ -9,13 +9,22 @@ if echo "$DOCKER_IMAGE" | grep -q "/"; then
docker pull "$DOCKER_IMAGE"
fi
# XXX Work around podman vs docker uid behaviour
if docker -v 2>&1 | grep -q podman; then
user="0:0"
else
user="`id -u`:`id -g`"
fi
# The container currently needs to run privileged to allow it to read from e.g.
# /home on SELinux secured systems such as Fedora. We might want to use udica
# instead in the future.
docker run --privileged=true --rm -t \
\
--user `id -u`:`id -g` \
--user $user \
--mount type=bind,src="$(pwd)",target=/data \
\
-e LOG_LEVEL=$LOG_LEVEL \

Loading…
Cancel
Save