dinglehopper: Add OCR-D interface

pull/3/head
Gerber, Mike 5 years ago
parent e6e2db79da
commit 02a0e093bf

@ -13,4 +13,13 @@ Goals
* As a library * As a library
* Unicode support * Unicode support
Usage
-----
As an OCR-D processor:
~~~
ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
~~~
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)

@ -41,11 +41,12 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
'''.format(gtx, ocrx) '''.format(gtx, ocrx)
@click.command() def process(gt, ocr, report_prefix):
@click.argument('gt', type=click.Path(exists=True)) """Check OCR result against GT.
@click.argument('ocr', type=click.Path(exists=True))
def process(gt, ocr): The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
"""Check OCR result against GT""" Click on a wrapper.
"""
gt_text = text(gt) gt_text = text(gt)
ocr_text = text(ocr) ocr_text = text(ocr)
@ -64,8 +65,10 @@ def process(gt, ocr):
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='') word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='')
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
for out_fn in ('report.html', 'report.json'): for report_suffix in ('.html', '.json'):
template_fn = out_fn + '.j2' template_fn = 'report' + report_suffix + '.j2'
out_fn = report_prefix + report_suffix
template = env.get_template(template_fn) template = env.get_template(template_fn)
template.stream( template.stream(
gt=gt, ocr=ocr, gt=gt, ocr=ocr,
@ -75,8 +78,12 @@ def process(gt, ocr):
).dump(out_fn) ).dump(out_fn)
def main(): @click.command()
process() @click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report')
def main(gt, ocr, report_prefix):
process(gt, ocr, report_prefix)
if __name__ == '__main__': if __name__ == '__main__':

@ -0,0 +1,19 @@
{
"git_url": "https://github.com/qurator-spk/dinglehopper",
"tools": {
"ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper",
"description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"steps": [
"recognition/text-recognition"
]
}
}
}

@ -0,0 +1,67 @@
import json
import os
import click
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import concat_padded, getLogger
from pkg_resources import resource_string
from qurator.dinglehopper.cli import process as cli_process
# Module-level logger used by the processor below.
log = getLogger('processor.OcrdDinglehopperEvaluate')
# Tool description, parsed once at import time from the 'ocrd-tool.json'
# resource that is looked up relative to this module.
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
    """Command line entry point of the ``ocrd-dinglehopper`` console script.

    All positional and keyword arguments supplied by the decorators are
    forwarded unchanged to ``ocrd_cli_wrap_processor`` together with the
    ``OcrdDinglehopperEvaluate`` processor class, and its result is returned.
    """
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
class OcrdDinglehopperEvaluate(Processor):
    """OCR-D processor that checks OCR results against ground truth.

    For every physical page of the workspace, the page's file from the first
    input file group (ground truth) is compared with the page's file from the
    second input file group (OCR). The resulting HTML and JSON reports are
    written below a directory named after the output file group and are
    registered in the workspace.
    """

    # (file name suffix, MIME type) of every report written per page.
    _REPORT_TYPES = (
        ('.html', 'text/html'),
        ('.json', 'application/json'),
    )

    def __init__(self, *args, **kwargs):
        """Create the processor, injecting the ``ocrd-dinglehopper`` tool description."""
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
        super().__init__(*args, **kwargs)

    def _make_file_id(self, input_file, input_file_grp, n):
        """Derive the ID of the output file for ``input_file``.

        The name of ``input_file_grp`` inside the input file's ID is replaced
        by the output file group. If that leaves the ID unchanged (the ID does
        not contain the group name), an ID is built from the output file group
        and the page counter ``n`` via ``concat_padded`` instead.
        """
        file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        return file_id

    def _find_page_file(self, file_grp, page_id):
        """Return the first file of ``file_grp`` for ``page_id``, or ``None``."""
        # next(iter(...)) works whether find_files() returns a list or a
        # generator, and does not raise when nothing matches.
        matches = self.workspace.mets.find_files(fileGrp=file_grp, pageId=page_id)
        return next(iter(matches), None)

    def process(self):
        """Evaluate all pages and add the reports to the workspace.

        Raises:
            ValueError: if ``input_file_grp`` does not consist of exactly two
                comma-separated file groups (ground truth, then OCR).
        """
        input_file_grps = self.input_file_grp.split(',')
        if len(input_file_grps) != 2:
            raise ValueError(
                'Expected exactly two comma-separated input file groups '
                '(ground truth, OCR), got %r' % self.input_file_grp)
        gt_grp, ocr_grp = input_file_grps

        # The report directory is the same for every page: create it once.
        os.makedirs(self.output_file_grp, exist_ok=True)

        for n, page_id in enumerate(self.workspace.mets.physical_pages):
            gt_file = self._find_page_file(gt_grp, page_id)
            ocr_file = self._find_page_file(ocr_grp, page_id)
            if gt_file is None or ocr_file is None:
                # Without both files there is nothing to compare on this page.
                log.warning(
                    "Skipping page %s: no file found in file group %s",
                    page_id, gt_grp if gt_file is None else ocr_grp)
                continue

            log.info("INPUT FILES %i / %s <-> %s", n, gt_file, ocr_file)

            file_id = self._make_file_id(ocr_file, ocr_grp, n)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            # NOTE(review): this relies on local_filename being set for both
            # files - confirm the files are available locally at this point.
            cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix)

            # Add reports to the workspace
            for report_suffix, mimetype in self._REPORT_TYPES:
                self.workspace.add_file(
                    ID=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix)
# Run the OCR-D command line interface when this module is executed as a script.
if __name__ == '__main__':
    ocrd_dinglehopper()

@ -4,3 +4,4 @@ lxml
uniseg uniseg
numpy numpy
colorama colorama
ocrd >= 1.0.0b15

@ -18,6 +18,7 @@ setup(
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'dinglehopper=qurator.dinglehopper.cli:main', 'dinglehopper=qurator.dinglehopper.cli:main',
'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper',
] ]
} }
) )

Loading…
Cancel
Save