diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 05cc931..e99f391 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
     d = distance(reference, compared)
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
 
     if d == 0:
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 63bfd92..8e18b26 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,11 +8,11 @@ from markupsafe import escape
 from qurator.dinglehopper import *
 
 
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     gtx = ''
     ocrx = ''
 
-    def format_thing(t, css_classes=None):
+    def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
             css_classes += ' ellipsis'
@@ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
         else:
             html_t = escape(t)
 
+        html_custom_attrs = ""
+        # XXX must sanitize id_ or do we trust the XML?
+        if id_:
+            html_custom_attrs = 'data-segment-id="{}"'.format(id_)
+
         if css_classes:
-            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
         else:
             return '{html_t}'.format(html_t=html_t)
 
-    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
-        if g == o:
-            css_classes = None
-        else:
+    if isinstance(gt_in, ExtractedText):
+        print(gt_in.text)
+        if not isinstance(ocr_in, ExtractedText):
+            raise TypeError()
+        # XXX splitting should be done in ExtractedText
+        gt_things = list(grapheme_clusters(gt_in.text))
+        ocr_things = list(grapheme_clusters(ocr_in.text))
+    else:
+        gt_things = gt_in
+        ocr_things = ocr_in
+
+
+
+    g_pos = 0
+    o_pos = 0
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+        css_classes = None
+        gt_id = None
+        ocr_id = None
+        if g != o:
             css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            if isinstance(gt_in, ExtractedText):
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+                # XXX note that deletions and inserts only produce one id + None, UI must
+                # support this, i.e. display for the one id produced
+                # XXX otherwise, it should always display for BOTH ids
+
+        gtx += joiner + format_thing(g, css_classes, gt_id)
+        ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+        if g is not None:
+            g_pos += len(g)
+        if o is not None:
+            o_pos += len(o)
 
-        gtx += joiner + format_thing(g, css_classes)
-        ocrx += joiner + format_thing(o, css_classes)
 
     return \
         '''
@@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True):
     Click on a wrapper.
""" - gt_text = text(gt) - ocr_text = text(ocr) + gt_text = extract(gt) + ocr_text = extract(ocr) - gt_text = substitute_equivalences(gt_text) - ocr_text = substitute_equivalences(ocr_text) + # FIXME + #gt_text = substitute_equivalences(gt_text) + #ocr_text = substitute_equivalences(ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) + char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) + word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') def json_float(value): """Convert a float value to an JSON float. diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 8ca24d3..dc1cb24 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,6 +8,7 @@ import numpy as np from uniseg.graphemecluster import grapheme_clusters + def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired @@ -75,6 +76,12 @@ def distance(s1, s2): Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme clusters. This should be the correct way to compare two Unicode strings. """ + # XXX + from .cli import ExtractedText + if isinstance(s1, ExtractedText): + s1 = s1.text + if isinstance(s2, ExtractedText): + s2 = s2.text s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) return levenshtein(s1, s2) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index fd89b03..17868a7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -138,7 +138,7 @@ def page_extract(tree): # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts - regions = (r for r in regions if r.text is not None) + regions = [r for r in regions if r.text is not None] return ExtractedText(regions, '\n') # FIXME needs to handle normalization diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index 0c2f464..f7b2efb 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -26,12 +26,23 @@ border: 2px solid; border-radius: 5px; } + #status-box { + position: fixed; + background: grey; + color: white; + width: 100%; + height: 2em; + } +
 </style>
 </head>
 <body>
+
+<div id="status-box">foo</div>
+
+
 
 {{ gt }}
diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js
index ac43676..01f5323 100644
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@@ -4,11 +4,16 @@ function find_diff_class(classes) {
 
 $(document).ready(function() {
     $('.diff').mouseover(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).addClass('diff-highlight')
+        let c = find_diff_class($(this).attr('class'));
+        $('.' + c).addClass('diff-highlight');
+
+        segment_id = $(this).attr('data-segment-id');
+        $('#status-box').text(segment_id);
     });
     $('.diff').mouseout(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).removeClass('diff-highlight')
+        let c = find_diff_class($(this).attr('class'));
+        $('.' + c).removeClass('diff-highlight');
+
+        $('#status-box').text('');
     });
 });
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 7ed56e4..64eba0a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -32,6 +32,11 @@ def words(s):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
+
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
     for word in uniseg.wordbreak.words(s):
@@ -42,10 +47,20 @@ def words(s):
 
 
 def words_normalized(s):
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
     return words(unicodedata.normalize('NFC', s))
 
 
 def word_error_rate_n(reference, compared) -> Tuple[float, int]:
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
+    if isinstance(compared, ExtractedText):
+        compared = compared.text
     if isinstance(reference, str):
         reference_seq = list(words_normalized(reference))
         compared_seq = list(words_normalized(compared))
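
Note on the new data flow: gen_diff_report now tracks a running character position per side (g_pos / o_pos, advanced by the length of each grapheme cluster) and asks the ExtractedText objects which segment covers that position via segment_id_for_pos(); the id is emitted as a data-segment-id attribute and displayed in the #status-box by report.html.js. The following is a minimal, hypothetical sketch of such a position-to-segment lookup, not dinglehopper's actual ExtractedText implementation; the names Segment and ExtractedTextSketch are illustrative only, and it assumes segment texts are joined with a fixed joiner (as page_extract() does with '\n') and that positions falling on the joiner map to no segment.

    from collections import namedtuple

    Segment = namedtuple('Segment', 'id text')


    class ExtractedTextSketch:
        """Illustrative stand-in: segments joined by a joiner, with position lookup."""

        def __init__(self, segments, joiner):
            self.segments = list(segments)
            self.joiner = joiner

        @property
        def text(self):
            return self.joiner.join(s.text for s in self.segments)

        def segment_id_for_pos(self, pos):
            # Walk the segments; positions that fall on the joiner belong to no segment.
            cursor = 0
            for i, s in enumerate(self.segments):
                if i > 0:
                    if pos < cursor + len(self.joiner):
                        return None
                    cursor += len(self.joiner)
                if pos < cursor + len(s.text):
                    return s.id
                cursor += len(s.text)
            return None


    # Example: two "regions" joined by '\n'.
    et = ExtractedTextSketch([Segment('r1', 'Foo'), Segment('r2', 'Bar')], '\n')
    assert et.text == 'Foo\nBar'
    assert et.segment_id_for_pos(0) == 'r1'   # inside the first region
    assert et.segment_id_for_pos(3) is None   # on the joiner
    assert et.segment_id_for_pos(4) == 'r2'   # inside the second region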