mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-09 22:59:53 +02:00
Merge branch 'master' of https://github.com/mikegerber/my_ocrd_workflow
This commit is contained in:
commit
af4557fb33
5 changed files with 64 additions and 15 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
47
check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
Normal file
47
check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
|
||||||
|
|
||||||
|
import PIL.Image
|
||||||
|
import sys
|
||||||
|
from ocrd.workspace import Workspace
|
||||||
|
from ocrd.resolver import Resolver
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
|
||||||
|
def alto_namespace(tree):
    """
    Look up the XML namespace of an ALTO document.

    The namespace is read off the root element of the given ElementTree. The
    root element is only required to have the local name "alto"; whether its
    namespace is a valid ALTO namespace is not checked.

    Raises ValueError if the root element's local name is not "alto".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    # Guard clause: anything that is not rooted in an <alto> element is rejected.
    if qualified_root.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return qualified_root.namespace
|
||||||
|
|
||||||
|
|
||||||
|
# Process exit status: 0 while all pages match, set to 1 when any page differs.
exit_code = 0
# OCR-D workspace rooted at the current working directory.
workspace = Workspace(Resolver(), '.')

for page_id in workspace.mets.physical_pages:
    # Take the first FULLTEXT (ALTO) file and the first BEST (image) file of the page.
    gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
    img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
    # The objects returned by download_file() provide the local_filename used below.
    gt_file = workspace.download_file(gt_file)
    img_file = workspace.download_file(img_file)

    tree = ET.parse(gt_file.local_filename)
    nsmap = {'alto': alto_namespace(tree)}
    # Use a relative descendant path: lxml's ElementTree.find() emits a
    # FutureWarning for paths that start with "/" and rewrites them to "./...".
    alto_page = tree.find('.//alto:Page', namespaces=nsmap)  # one page assumed
    gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])

    # Only the dimensions are needed, so close the image file right away
    # instead of keeping one open file handle per page.
    with PIL.Image.open(img_file.local_filename) as img:
        img_size = img.size

    if gt_size == img_size:
        print('OK', page_id)
    else:
        print('ERR', page_id, gt_size, '!=', img_size)
        exit_code = 1

sys.exit(exit_code)
|
17
ppn2ocr
17
ppn2ocr
|
@ -22,19 +22,6 @@ for prefix, uri in XMLNS.items():
|
||||||
ET.register_namespace(prefix, uri)
|
ET.register_namespace(prefix, uri)
|
||||||
|
|
||||||
|
|
||||||
# XXX
|
|
||||||
# show_help() {
|
|
||||||
# cat <<-EOH
|
|
||||||
# Usage: $0 PPN77164308X
|
|
||||||
#
|
|
||||||
# Get OCR results as a OCR-D workspace for a given PPN
|
|
||||||
#
|
|
||||||
# Options:
|
|
||||||
# --help Show this message and exit.
|
|
||||||
# EOH
|
|
||||||
# }
|
|
||||||
|
|
||||||
|
|
||||||
def oai_mets(ppn):
|
def oai_mets(ppn):
|
||||||
"""Retrieve METS metadata for a given PPN."""
|
"""Retrieve METS metadata for a given PPN."""
|
||||||
|
|
||||||
|
@ -45,7 +32,9 @@ def oai_mets(ppn):
|
||||||
}
|
}
|
||||||
|
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
r = s.get(API_URL, params=params)
|
# FIXME oai.sbb.berlin fails certificate check
|
||||||
|
#r = s.get(API_URL, params=params)
|
||||||
|
r = s.get(API_URL, params=params, verify=False)
|
||||||
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
|
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
|
||||||
mets = ET.ElementTree(mets)
|
mets = ET.ElementTree(mets)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
requests
|
requests
|
||||||
lxml
|
lxml
|
||||||
click
|
click
|
||||||
|
ocrd
|
||||||
|
|
11
run
11
run
|
@ -9,13 +9,22 @@ if echo "$DOCKER_IMAGE" | grep -q "/"; then
|
||||||
docker pull "$DOCKER_IMAGE"
|
docker pull "$DOCKER_IMAGE"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# XXX Work around podman vs docker uid behaviour
|
||||||
|
if docker -v 2>&1 | grep -q podman; then
|
||||||
|
user="0:0"
|
||||||
|
else
|
||||||
|
user="`id -u`:`id -g`"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
# The container currently needs to run privileged to allow it to read from e.g.
|
# The container currently needs to run privileged to allow it to read from e.g.
|
||||||
# /home on SELinux secured systems such as Fedora. We might want to use udica
|
# /home on SELinux secured systems such as Fedora. We might want to use udica
|
||||||
# instead in the future.
|
# instead in the future.
|
||||||
|
|
||||||
docker run --privileged=true --rm -t \
|
docker run --privileged=true --rm -t \
|
||||||
\
|
\
|
||||||
--user `id -u`:`id -g` \
|
--user $user \
|
||||||
--mount type=bind,src="$(pwd)",target=/data \
|
--mount type=bind,src="$(pwd)",target=/data \
|
||||||
\
|
\
|
||||||
-e LOG_LEVEL=$LOG_LEVEL \
|
-e LOG_LEVEL=$LOG_LEVEL \
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue