diff --git a/README.md b/README.md index 4950ca2..e8136c8 100644 --- a/README.md +++ b/README.md @@ -13,4 +13,13 @@ Goals * As a library * Unicode support +Usage +----- +As an OCR-D processor: +~~~ +ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL +~~~ +This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup. + + ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 8c3186a..3cdcf9a 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -41,11 +41,12 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none): '''.format(gtx, ocrx) -@click.command() -@click.argument('gt', type=click.Path(exists=True)) -@click.argument('ocr', type=click.Path(exists=True)) -def process(gt, ocr): - """Check OCR result against GT""" +def process(gt, ocr, report_prefix): + """Check OCR result against GT. + + The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use + Click on a wrapper. 
+ """ gt_text = text(gt) ocr_text = text(ocr) @@ -64,8 +65,10 @@ def process(gt, ocr): word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) - for out_fn in ('report.html', 'report.json'): - template_fn = out_fn + '.j2' + for report_suffix in ('.html', '.json'): + template_fn = 'report' + report_suffix + '.j2' + out_fn = report_prefix + report_suffix + template = env.get_template(template_fn) template.stream( gt=gt, ocr=ocr, @@ -75,8 +78,12 @@ def process(gt, ocr): ).dump(out_fn) -def main(): - process() +@click.command() +@click.argument('gt', type=click.Path(exists=True)) +@click.argument('ocr', type=click.Path(exists=True)) +@click.argument('report_prefix', type=click.Path(), default='report') +def main(gt, ocr, report_prefix): + process(gt, ocr, report_prefix) if __name__ == '__main__': diff --git a/qurator/dinglehopper/ocrd-tool.json b/qurator/dinglehopper/ocrd-tool.json new file mode 100644 index 0000000..ebb141a --- /dev/null +++ b/qurator/dinglehopper/ocrd-tool.json @@ -0,0 +1,19 @@ +{ + "git_url": "https://github.com/qurator-spk/dinglehopper", + "tools": { + "ocrd-dinglehopper": { + "executable": "ocrd-dinglehopper", + "description": "Evaluate OCR text against ground truth with dinglehopper", + "input_file_grp": [ + "OCR-D-GT-PAGE", + "OCR-D-OCR" + ], + "output_file_grp": [ + "OCR-D-OCR-EVAL" + ], + "steps": [ + "recognition/text-recognition" + ] + } + } +} diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py new file mode 100644 index 0000000..2a5aed5 --- /dev/null +++ b/qurator/dinglehopper/ocrd_cli.py @@ -0,0 +1,67 @@ +import json +import os + +import click +from ocrd import Processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_utils import concat_padded, getLogger +from pkg_resources import resource_string + +from 
qurator.dinglehopper.cli import process as cli_process + +log = getLogger('processor.OcrdDinglehopperEvaluate') + +OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + + +@click.command() +@ocrd_cli_options +def ocrd_dinglehopper(*args, **kwargs): + return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) + + +class OcrdDinglehopperEvaluate(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] + super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) + + def _make_file_id(self, input_file, input_file_grp, n): + file_id = input_file.ID.replace(input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + return file_id + + def process(self): + gt_grp, ocr_grp = self.input_file_grp.split(',') + for n, page_id in enumerate(self.workspace.mets.physical_pages): + gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0] + ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0] + log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) + + file_id = self._make_file_id(ocr_file, ocr_grp, n) + report_prefix = os.path.join(self.output_file_grp, file_id) + + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix) + + # Add reports to the workspace + for report_suffix, mimetype in \ + [ + ['.html', 'text/html'], + ['.json', 'application/json'] + ]: + self.workspace.add_file( + ID=file_id + report_suffix, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix) + + +if __name__ == '__main__': + ocrd_dinglehopper() diff --git a/requirements.txt b/requirements.txt index 074aede..063bac4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ lxml uniseg numpy colorama +ocrd 
>= 1.0.0b15 diff --git a/setup.py b/setup.py index b613b4f..9d158a3 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ setup( entry_points={ 'console_scripts': [ 'dinglehopper=qurator.dinglehopper.cli:main', + 'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper', ] } )