import os

import click
from jinja2 import Environment, FileSystemLoader
from markupsafe import escape

from qurator.dinglehopper import *
from qurator.dinglehopper import ExtractedText


def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    """Render an aligned GT/OCR comparison as an HTML fragment.

    Both inputs are aligned with ``seq_align``; every aligned item is emitted
    once for the GT side and once for the OCR side. Items that differ get the
    CSS classes ``<css_prefix>diff<k> diff`` (``k`` is the alignment index), so
    the two sides of the same difference share a class.

    :param gt_in: ground truth, either an ``ExtractedText`` (compared by
        grapheme cluster) or an already split sequence of items
    :param ocr_in: OCR result, of the same kind as ``gt_in``
    :param css_prefix: prefix for the per-difference CSS class
    :param joiner: string emitted before every rendered item
    :param none: markup emitted where one side has no counterpart
        (inserted verbatim, i.e. not escaped)
    :return: the HTML fragment as a ``str``
    :raises TypeError: if ``gt_in`` is an ``ExtractedText`` but ``ocr_in`` is not
    """
    # NOTE(review): the HTML markup in this function (<br>, <span>, and the
    # row/column <div> wrapper with its class names) was reconstructed; confirm
    # the class names against the CSS/JS used by the report template.

    # Evaluate once instead of on every loop iteration.
    is_extracted = isinstance(gt_in, ExtractedText)

    def format_thing(t, css_classes=None, id_=None):
        """Return the HTML for a single aligned item (or for a gap if None)."""
        if t is None:
            html_t = none
            # css_classes defaults to None, so "+=" alone would raise TypeError.
            css_classes = css_classes + ' ellipsis' if css_classes else 'ellipsis'
        elif t == '\n':
            html_t = '<br>'
        else:
            html_t = escape(t)

        html_custom_attrs = ''

        # Set Bootstrap tooltip to the segment id
        if id_:
            # The id comes from the input document: escape it so it cannot
            # break out of the attribute value.
            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(escape(id_))

        if css_classes:
            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
                css_classes=css_classes,
                html_t=html_t,
                html_custom_attrs=html_custom_attrs)
        # str.format() yields a plain str even when html_t is a Markup object,
        # so later concatenation with the joiner does not re-escape anything.
        return '{html_t}'.format(html_t=html_t)

    if is_extracted:
        if not isinstance(ocr_in, ExtractedText):
            raise TypeError('ocr_in must be an ExtractedText if gt_in is one')
        # XXX splitting should be done in ExtractedText
        gt_things = list(grapheme_clusters(gt_in.text))
        ocr_things = list(grapheme_clusters(ocr_in.text))
    else:
        gt_things = gt_in
        ocr_things = ocr_in

    # Collect the pieces and join once at the end instead of growing two
    # strings with "+=" for every aligned item.
    gt_parts = []
    ocr_parts = []

    # Running offsets into the GT/OCR text, used to look up segment ids.
    g_pos = 0
    o_pos = 0
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
        css_classes = None
        gt_id = None
        ocr_id = None

        if g != o:
            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
            if is_extracted:
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced
                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None

        # Every item, including the first, is preceded by the joiner.
        gt_parts.append(joiner + format_thing(g, css_classes, gt_id))
        ocr_parts.append(joiner + format_thing(o, css_classes, ocr_id))

        if g is not None:
            g_pos += len(g)
        if o is not None:
            o_pos += len(o)

    return '''
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        '''.format(''.join(gt_parts), ''.join(ocr_parts))


def _json_float(value):
    """Convert a float value to a JSON float.

    This is here so that float('inf') yields "Infinity", not "inf".
    NaN is likewise rendered as "NaN" instead of "nan".
    """
    if value == float('inf'):
        return 'Infinity'
    if value == float('-inf'):
        return '-Infinity'
    if value != value:  # only NaN compares unequal to itself
        return 'NaN'
    return str(value)


def process(gt, ocr, report_prefix, *, metrics=True):
    """Check OCR result against GT.

    The @click decorators change the signature of the decorated functions, so
    we keep this undecorated version and use Click on a wrapper.

    Writes two reports, ``<report_prefix>.html`` and ``<report_prefix>.json``,
    rendered from the ``report.html.j2`` / ``report.json.j2`` templates in the
    ``templates`` directory next to this file.

    :param gt: ground truth document, passed to ``extract``
    :param ocr: OCR result document, passed to ``extract``
    :param report_prefix: output file name without the suffix
    :param metrics: passed through to the templates as ``metrics``
    """
    gt_text = extract(gt)
    ocr_text = extract(ocr)

    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
    wer, n_words = word_error_rate_n(gt_text, ocr_text)

    char_diff_report = gen_diff_report(
        gt_text, ocr_text, css_prefix='c', joiner='', none='·')

    gt_words = words_normalized(gt_text)
    ocr_words = words_normalized(ocr_text)
    word_diff_report = gen_diff_report(
        gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')

    # realpath() so the templates are found even if this file is reached
    # through a symlink.
    template_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'templates')
    env = Environment(loader=FileSystemLoader(template_dir))
    env.filters['json_float'] = _json_float

    for report_suffix in ('.html', '.json'):
        template_fn = 'report' + report_suffix + '.j2'
        out_fn = report_prefix + report_suffix

        template = env.get_template(template_fn)
        template.stream(
            gt=gt, ocr=ocr,
            cer=cer, n_characters=n_characters,
            wer=wer, n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
        ).dump(out_fn)


@click.command()
@click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report')
@click.option('--metrics/--no-metrics', default=True,
              help='Enable/disable metrics and green/red')
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

    The files GT and OCR are usually a ground truth document and the result of
    an OCR software, but you may use dinglehopper to compare two OCR results.
    In that case, use --no-metrics to disable the then meaningless metrics and
    also change the color scheme from green/red to blue.
    """
    process(gt, ocr, report_prefix, metrics=metrics)


if __name__ == '__main__':
    main()