🧹 Move check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py code to mono-repo/experiments

pull/27/head
Gerber, Mike 5 years ago
parent af4557fb33
commit c334b1e7ac

@ -1,47 +0,0 @@
"""Check FULLTEXT ALTO page dimensions against BEST image dimensions"""
import PIL.Image
import sys
from ocrd.workspace import Workspace
from ocrd.resolver import Resolver
from lxml import etree as ET
def alto_namespace(tree):
"""
Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root
element has the local name "alto". We do not check if the files uses any
valid ALTO namespace.
"""
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == 'alto':
return root_name.namespace
else:
raise ValueError('Not an ALTO tree')
exit_code = 0
workspace = Workspace(Resolver(), '.')
for n, page_id in enumerate(workspace.mets.physical_pages):
gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0]
img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0]
gt_file = workspace.download_file(gt_file)
img_file = workspace.download_file(img_file)
tree = ET.parse(gt_file.local_filename)
nsmap = {'alto': alto_namespace(tree)}
alto_page = tree.find('//alto:Page', namespaces=nsmap) # one page assumed
gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT'])
img_size = PIL.Image.open(img_file.local_filename).size
if gt_size == img_size:
print('OK', page_id)
else:
print('ERR', page_id, gt_size, '!=', img_size)
exit_code = 1
sys.exit(exit_code)
Loading…
Cancel
Save