NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. Whitespace in the context and removed lines of
the ppn2ocr and run hunks is a best guess; confirm against the target files
before applying. The "index" blob ids are those of the original patch and no
longer match the revised added content of the check script, ppn2ocr and run.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c678a5e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
diff --git a/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py b/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
new file mode 100644
index 0000000..d332214
--- /dev/null
+++ b/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py
@@ -0,0 +1,100 @@
+"""Check FULLTEXT ALTO page dimensions against BEST image dimensions.
+
+Run from inside an OCR-D workspace directory. Prints one OK or ERR line per
+physical page and exits with status 1 if any page failed the check.
+"""
+
+import sys
+
+import PIL.Image
+from lxml import etree as ET
+from ocrd.resolver import Resolver
+from ocrd.workspace import Workspace
+
+
+def alto_namespace(tree):
+    """
+    Return the ALTO namespace used in the given ElementTree.
+
+    This relies on the assumption that, in any given ALTO file, the root
+    element has the local name "alto". We do not check if the file uses a
+    valid ALTO namespace.
+
+    Raises ValueError if the root element is not named "alto".
+    """
+    root_name = ET.QName(tree.getroot().tag)
+    if root_name.localname != 'alto':
+        raise ValueError('Not an ALTO tree')
+    return root_name.namespace
+
+
+def parse_dimension(value):
+    """
+    Convert an ALTO WIDTH/HEIGHT attribute value to a number.
+
+    The value may be written as a float (e.g. "2480.0"), which int() would
+    reject. Integral values are returned as int so that they compare and
+    print like the pixel sizes reported by PIL; other values stay float.
+    """
+    number = float(value)
+    return int(number) if number.is_integer() else number
+
+
+def first_file(workspace, file_grp, page_id):
+    """Return the first file of file_grp for page_id, or None if missing."""
+    # next(iter(...)) works whether find_files returns a list or a generator.
+    matches = workspace.mets.find_files(fileGrp=file_grp, pageId=page_id)
+    return next(iter(matches), None)
+
+
+def alto_page_size(filename):
+    """
+    Return (WIDTH, HEIGHT) of the first Page element of an ALTO file.
+
+    Raises ValueError if the file has no Page element.
+    """
+    tree = ET.parse(filename)
+    nsmap = {'alto': alto_namespace(tree)}
+    # './/' rather than '//': lxml emits a FutureWarning when an ElementTree
+    # search path starts with '/'.
+    alto_page = tree.find('.//alto:Page', namespaces=nsmap)  # one page assumed
+    if alto_page is None:
+        raise ValueError('No ALTO Page element')
+    width = parse_dimension(alto_page.attrib['WIDTH'])
+    height = parse_dimension(alto_page.attrib['HEIGHT'])
+    return width, height
+
+
+def main():
+    """Check every physical page and return the exit code (0 or 1)."""
+    exit_code = 0
+    workspace = Workspace(Resolver(), '.')
+
+    for page_id in workspace.mets.physical_pages:
+        gt_file = first_file(workspace, 'FULLTEXT', page_id)
+        img_file = first_file(workspace, 'BEST', page_id)
+        if gt_file is None or img_file is None:
+            # Report instead of crashing so the other pages still get checked.
+            print('ERR', page_id, 'missing FULLTEXT or BEST file')
+            exit_code = 1
+            continue
+        gt_file = workspace.download_file(gt_file)
+        img_file = workspace.download_file(img_file)
+
+        gt_size = alto_page_size(gt_file.local_filename)
+
+        # The context manager closes the image file once the size is read.
+        with PIL.Image.open(img_file.local_filename) as image:
+            img_size = image.size
+
+        if gt_size == img_size:
+            print('OK', page_id)
+        else:
+            print('ERR', page_id, gt_size, '!=', img_size)
+            exit_code = 1
+
+    return exit_code
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/ppn2ocr b/ppn2ocr
index db1ee06..5f20dcb 100755
--- a/ppn2ocr
+++ b/ppn2ocr
@@ -22,19 +22,6 @@ for prefix, uri in XMLNS.items():
     ET.register_namespace(prefix, uri)
 
 
-# XXX
-# show_help() {
-# cat <<-EOH
-# Usage: $0 PPN77164308X
-#
-# Get OCR results as a OCR-D workspace for a given PPN
-#
-# Options:
-# --help Show this message and exit.
-# EOH
-# }
-
-
 def oai_mets(ppn):
     """Retrieve METS metadata for a given PPN."""
 
@@ -45,7 +32,9 @@ def oai_mets(ppn):
     }
 
     s = requests.Session()
-    r = s.get(API_URL, params=params)
+    # FIXME oai.sbb.berlin fails certificate check; verify=False disables TLS
+    # certificate verification, so drop it as soon as the check passes again.
+    r = s.get(API_URL, params=params, verify=False)
     mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
     mets = ET.ElementTree(mets)
 
diff --git a/requirements-ppn2ocr.txt b/requirements-ppn2ocr.txt
index 88ebd80..c15546a 100644
--- a/requirements-ppn2ocr.txt
+++ b/requirements-ppn2ocr.txt
@@ -1,3 +1,4 @@
 requests
 lxml
 click
+ocrd
diff --git a/run b/run
index 14277a2..7fbf16d 100755
--- a/run
+++ b/run
@@ -9,13 +9,22 @@ if echo "$DOCKER_IMAGE" | grep -q "/"; then
   docker pull "$DOCKER_IMAGE"
 fi
 
+
+# XXX Work around podman vs docker uid behaviour
+if docker -v 2>&1 | grep -q podman; then
+  user="0:0"
+else
+  user="$(id -u):$(id -g)"
+fi
+
+
 
 # The container currently needs to run privileged to allow it to read from e.g.
 # /home on SELinux secured systems such as Fedora. We might want to use udica
 # instead in the future.
 docker run --privileged=true --rm -t \
   \
-  --user `id -u`:`id -g` \
+  --user "$user" \
   --mount type=bind,src="$(pwd)",target=/data \
   \
   -e LOG_LEVEL=$LOG_LEVEL \