|
|
|
@ -8,11 +8,11 @@ from markupsafe import escape
|
|
|
|
|
from qurator.dinglehopper import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
|
|
|
|
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|
|
|
|
gtx = ''
|
|
|
|
|
ocrx = ''
|
|
|
|
|
|
|
|
|
|
def format_thing(t, css_classes=None, id_=None):
    """Render one aligned item as an HTML fragment.

    Args:
        t: The item to render (a grapheme cluster or a word), or None when
            this side of the alignment has no counterpart (insertion or
            deletion on the other side).
        css_classes: Space-separated CSS class names for the wrapping
            ``<span>``. When falsy and ``t`` is not None, the escaped text
            is returned without any wrapper.
        id_: Optional segment id emitted as a ``data-segment-id`` attribute
            on the ``<span>``. It is only emitted when a ``<span>`` is
            produced at all, i.e. when ``css_classes`` is non-empty.

    Returns:
        The HTML for this item as a string.

    Note:
        ``none`` is not a local name; it is the placeholder string passed to
        the enclosing ``gen_diff_report`` and is read from that scope.
    """
    if t is None:
        # The placeholder is inserted as given (not escaped), as before.
        html_t = none
        # Tolerate css_classes=None instead of raising TypeError on `+=`;
        # for string inputs the result is unchanged.
        css_classes = (css_classes or '') + ' ellipsis'
    else:
        html_t = escape(t)

    html_custom_attrs = ""
    if id_:
        # The id originates from the input XML, so it is escaped before
        # being placed inside a double-quoted HTML attribute. markupsafe's
        # escape() also encodes quote characters.
        html_custom_attrs = 'data-segment-id="{}"'.format(escape(id_))

    if css_classes:
        return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
    else:
        return '{html_t}'.format(html_t=html_t)
|
|
|
|
|
|
|
|
|
|
for k, (g, o) in enumerate(align(gt_things, ocr_things)):
|
|
|
|
|
if g == o:
|
|
|
|
|
css_classes = None
|
|
|
|
|
else:
|
|
|
|
|
if isinstance(gt_in, ExtractedText):
|
|
|
|
|
print(gt_in.text)
|
|
|
|
|
if not isinstance(ocr_in, ExtractedText):
|
|
|
|
|
raise TypeError()
|
|
|
|
|
# XXX splitting should be done in ExtractedText
|
|
|
|
|
gt_things = list(grapheme_clusters(gt_in.text))
|
|
|
|
|
ocr_things = list(grapheme_clusters(ocr_in.text))
|
|
|
|
|
else:
|
|
|
|
|
gt_things = gt_in
|
|
|
|
|
ocr_things = ocr_in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
g_pos = 0
|
|
|
|
|
o_pos = 0
|
|
|
|
|
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
|
|
|
|
|
css_classes = None
|
|
|
|
|
gt_id = None
|
|
|
|
|
ocr_id = None
|
|
|
|
|
if g != o:
|
|
|
|
|
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
|
|
|
|
|
if isinstance(gt_in, ExtractedText):
|
|
|
|
|
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
|
|
|
|
|
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
|
|
|
|
|
# XXX note that deletions and inserts only produce one id + None, UI must
|
|
|
|
|
# support this, i.e. display for the one id produced
|
|
|
|
|
# XXX otherwise, it should always display for BOTH ids
|
|
|
|
|
|
|
|
|
|
gtx += joiner + format_thing(g, css_classes, gt_id)
|
|
|
|
|
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
|
|
|
|
|
|
|
|
|
if g is not None:
|
|
|
|
|
g_pos += len(g)
|
|
|
|
|
if o is not None:
|
|
|
|
|
o_pos += len(o)
|
|
|
|
|
|
|
|
|
|
gtx += joiner + format_thing(g, css_classes)
|
|
|
|
|
ocrx += joiner + format_thing(o, css_classes)
|
|
|
|
|
|
|
|
|
|
return \
|
|
|
|
|
'''
|
|
|
|
@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True):
|
|
|
|
|
Click on a wrapper.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
gt_text = text(gt)
|
|
|
|
|
ocr_text = text(ocr)
|
|
|
|
|
gt_text = extract(gt)
|
|
|
|
|
ocr_text = extract(ocr)
|
|
|
|
|
|
|
|
|
|
gt_text = substitute_equivalences(gt_text)
|
|
|
|
|
ocr_text = substitute_equivalences(ocr_text)
|
|
|
|
|
# FIXME
|
|
|
|
|
#gt_text = substitute_equivalences(gt_text)
|
|
|
|
|
#ocr_text = substitute_equivalences(ocr_text)
|
|
|
|
|
|
|
|
|
|
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
|
|
|
|
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
|
|
|
|
|
|
|
|
|
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
|
|
|
|
|
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
|
|
|
|
|
|
|
|
|
|
gt_words = words_normalized(gt_text)
|
|
|
|
|
ocr_words = words_normalized(ocr_text)
|
|
|
|
|
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
|
|
|
|
|
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
|
|
|
|
|
|
|
|
|
|
def json_float(value):
|
|
|
|
|
"""Convert a float value to an JSON float.
|
|
|
|
|