🚧 dinglehopper: Display segment id when hovering over a character difference

pull/38/head
Gerber, Mike 5 years ago
parent 1f6538b44c
commit 48ad340428

@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
:return: character error rate and length of the reference :return: character error rate and length of the reference
""" """
d = distance(reference, compared) d = distance(reference, compared)
# XXX
from .cli import ExtractedText
if isinstance(reference, ExtractedText):
reference = reference.text
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
if d == 0: if d == 0:

@ -8,11 +8,11 @@ from markupsafe import escape
from qurator.dinglehopper import * from qurator.dinglehopper import *
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
gtx = '' gtx = ''
ocrx = '' ocrx = ''
def format_thing(t, css_classes=None): def format_thing(t, css_classes=None, id_=None):
if t is None: if t is None:
html_t = none html_t = none
css_classes += ' ellipsis' css_classes += ' ellipsis'
@ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
else: else:
html_t = escape(t) html_t = escape(t)
html_custom_attrs = ""
# XXX must sanitize id_ or do we trust the XML?
if id_:
html_custom_attrs = 'data-segment-id="{}"'.format(id_)
if css_classes: if css_classes:
return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t) return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
else: else:
return '{html_t}'.format(html_t=html_t) return '{html_t}'.format(html_t=html_t)
for k, (g, o) in enumerate(align(gt_things, ocr_things)): if isinstance(gt_in, ExtractedText):
if g == o: print(gt_in.text)
css_classes = None if not isinstance(ocr_in, ExtractedText):
else: raise TypeError()
# XXX splitting should be done in ExtractedText
gt_things = list(grapheme_clusters(gt_in.text))
ocr_things = list(grapheme_clusters(ocr_in.text))
else:
gt_things = gt_in
ocr_things = ocr_in
g_pos = 0
o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
css_classes = None
gt_id = None
ocr_id = None
if g != o:
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
if isinstance(gt_in, ExtractedText):
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
# XXX note that deletions and inserts only produce one id + None, UI must
# support this, i.e. display for the one id produced
# XXX otherwise, it should always display for BOTH ids
gtx += joiner + format_thing(g, css_classes, gt_id)
ocrx += joiner + format_thing(o, css_classes, ocr_id)
if g is not None:
g_pos += len(g)
if o is not None:
o_pos += len(o)
gtx += joiner + format_thing(g, css_classes)
ocrx += joiner + format_thing(o, css_classes)
return \ return \
''' '''
@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True):
Click on a wrapper. Click on a wrapper.
""" """
gt_text = text(gt) gt_text = extract(gt)
ocr_text = text(ocr) ocr_text = extract(ocr)
gt_text = substitute_equivalences(gt_text) # FIXME
ocr_text = substitute_equivalences(ocr_text) #gt_text = substitute_equivalences(gt_text)
#ocr_text = substitute_equivalences(ocr_text)
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text)
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
gt_words = words_normalized(gt_text) gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text) ocr_words = words_normalized(ocr_text)
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='', align=seq_align) word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='')
def json_float(value): def json_float(value):
"""Convert a float value to a JSON float. """Convert a float value to a JSON float.

@ -8,6 +8,7 @@ import numpy as np
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
def levenshtein_matrix(seq1: Sequence, seq2: Sequence): def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
"""Compute the matrix commonly computed to produce the Levenshtein distance. """Compute the matrix commonly computed to produce the Levenshtein distance.
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
@ -75,6 +76,12 @@ def distance(s1, s2):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings. clusters. This should be the correct way to compare two Unicode strings.
""" """
# XXX
from .cli import ExtractedText
if isinstance(s1, ExtractedText):
s1 = s1.text
if isinstance(s2, ExtractedText):
s2 = s2.text
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2) return levenshtein(s1, s2)

@ -138,7 +138,7 @@ def page_extract(tree):
# XXX Does a file have to have regions etc.? region vs lines etc. # XXX Does a file have to have regions etc.? region vs lines etc.
# Filter empty region texts # Filter empty region texts
regions = (r for r in regions if r.text is not None) regions = [r for r in regions if r.text is not None]
return ExtractedText(regions, '\n') return ExtractedText(regions, '\n')
# FIXME needs to handle normalization # FIXME needs to handle normalization

@ -26,12 +26,23 @@
border: 2px solid; border: 2px solid;
border-radius: 5px; border-radius: 5px;
} }
#status-box {
position: fixed;
background: grey;
color: white;
width: 100%;
height: 2em;
}
</style> </style>
</head> </head>
<body> <body>
<div id="status-box"> foo</div>
<div class="container"> <div class="container">
{{ gt }}<br> {{ gt }}<br>

@ -4,11 +4,16 @@ function find_diff_class(classes) {
$(document).ready(function() { $(document).ready(function() {
$('.diff').mouseover(function() { $('.diff').mouseover(function() {
let c = find_diff_class($(this).attr('class')) let c = find_diff_class($(this).attr('class'));
$('.' + c).addClass('diff-highlight') $('.' + c).addClass('diff-highlight');
segment_id = $(this).attr('data-segment-id');
$('#status-box').text(segment_id);
}); });
$('.diff').mouseout(function() { $('.diff').mouseout(function() {
let c = find_diff_class($(this).attr('class')) let c = find_diff_class($(this).attr('class'));
$('.' + c).removeClass('diff-highlight') $('.' + c).removeClass('diff-highlight');
$('#status-box').text('');
}); });
}); });

@ -32,6 +32,11 @@ def words(s):
cat = subcat[0] cat = subcat[0]
return cat in unwanted_categories or subcat in unwanted_subcategories return cat in unwanted_categories or subcat in unwanted_subcategories
# XXX
from .cli import ExtractedText
if isinstance(s, ExtractedText):
s = s.text
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s): for word in uniseg.wordbreak.words(s):
@ -42,10 +47,20 @@ def words(s):
def words_normalized(s): def words_normalized(s):
# XXX
from .cli import ExtractedText
if isinstance(s, ExtractedText):
s = s.text
return words(unicodedata.normalize('NFC', s)) return words(unicodedata.normalize('NFC', s))
def word_error_rate_n(reference, compared) -> Tuple[float, int]: def word_error_rate_n(reference, compared) -> Tuple[float, int]:
# XXX
from .cli import ExtractedText
if isinstance(reference, ExtractedText):
reference = reference.text
if isinstance(compared, ExtractedText):
compared = compared.text
if isinstance(reference, str): if isinstance(reference, str):
reference_seq = list(words_normalized(reference)) reference_seq = list(words_normalized(reference))
compared_seq = list(words_normalized(compared)) compared_seq = list(words_normalized(compared))

Loading…
Cancel
Save