mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
🚧 dinglehopper: Display segment id when hovering over a character difference
This commit is contained in:
parent
1f6538b44c
commit
48ad340428
7 changed files with 97 additions and 21 deletions
|
@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||||
:return: character error rate and length of the reference
|
:return: character error rate and length of the reference
|
||||||
"""
|
"""
|
||||||
d = distance(reference, compared)
|
d = distance(reference, compared)
|
||||||
|
# XXX
|
||||||
|
from .cli import ExtractedText
|
||||||
|
if isinstance(reference, ExtractedText):
|
||||||
|
reference = reference.text
|
||||||
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
|
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
|
||||||
|
|
||||||
if d == 0:
|
if d == 0:
|
||||||
|
|
|
@ -8,11 +8,11 @@ from markupsafe import escape
|
||||||
from qurator.dinglehopper import *
|
from qurator.dinglehopper import *
|
||||||
|
|
||||||
|
|
||||||
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
||||||
gtx = ''
|
gtx = ''
|
||||||
ocrx = ''
|
ocrx = ''
|
||||||
|
|
||||||
def format_thing(t, css_classes=None):
|
def format_thing(t, css_classes=None, id_=None):
|
||||||
if t is None:
|
if t is None:
|
||||||
html_t = none
|
html_t = none
|
||||||
css_classes += ' ellipsis'
|
css_classes += ' ellipsis'
|
||||||
|
@ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
||||||
else:
|
else:
|
||||||
html_t = escape(t)
|
html_t = escape(t)
|
||||||
|
|
||||||
|
html_custom_attrs = ""
|
||||||
|
# XXX must sanitize id_ or do we trust the XML?
|
||||||
|
if id_:
|
||||||
|
html_custom_attrs = 'data-segment-id="{}"'.format(id_)
|
||||||
|
|
||||||
if css_classes:
|
if css_classes:
|
||||||
return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
|
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
|
||||||
else:
|
else:
|
||||||
return '{html_t}'.format(html_t=html_t)
|
return '{html_t}'.format(html_t=html_t)
|
||||||
|
|
||||||
for k, (g, o) in enumerate(align(gt_things, ocr_things)):
|
if isinstance(gt_in, ExtractedText):
|
||||||
if g == o:
|
print(gt_in.text)
|
||||||
css_classes = None
|
if not isinstance(ocr_in, ExtractedText):
|
||||||
|
raise TypeError()
|
||||||
|
# XXX splitting should be done in ExtractedText
|
||||||
|
gt_things = list(grapheme_clusters(gt_in.text))
|
||||||
|
ocr_things = list(grapheme_clusters(ocr_in.text))
|
||||||
else:
|
else:
|
||||||
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
|
gt_things = gt_in
|
||||||
|
ocr_things = ocr_in
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
g_pos = 0
|
||||||
|
o_pos = 0
|
||||||
|
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
|
||||||
|
css_classes = None
|
||||||
|
gt_id = None
|
||||||
|
ocr_id = None
|
||||||
|
if g != o:
|
||||||
|
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
|
||||||
|
if isinstance(gt_in, ExtractedText):
|
||||||
|
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
|
||||||
|
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
|
||||||
|
# XXX note that deletions and inserts only produce one id + None, UI must
|
||||||
|
# support this, i.e. display for the one id produced
|
||||||
|
# XXX otherwise, it should always display for BOTH ids
|
||||||
|
|
||||||
|
gtx += joiner + format_thing(g, css_classes, gt_id)
|
||||||
|
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
||||||
|
|
||||||
|
if g is not None:
|
||||||
|
g_pos += len(g)
|
||||||
|
if o is not None:
|
||||||
|
o_pos += len(o)
|
||||||
|
|
||||||
gtx += joiner + format_thing(g, css_classes)
|
|
||||||
ocrx += joiner + format_thing(o, css_classes)
|
|
||||||
|
|
||||||
return \
|
return \
|
||||||
'''
|
'''
|
||||||
|
@ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True):
|
||||||
Click on a wrapper.
|
Click on a wrapper.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
gt_text = text(gt)
|
gt_text = extract(gt)
|
||||||
ocr_text = text(ocr)
|
ocr_text = extract(ocr)
|
||||||
|
|
||||||
gt_text = substitute_equivalences(gt_text)
|
# FIXME
|
||||||
ocr_text = substitute_equivalences(ocr_text)
|
#gt_text = substitute_equivalences(gt_text)
|
||||||
|
#ocr_text = substitute_equivalences(ocr_text)
|
||||||
|
|
||||||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||||
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
||||||
|
|
||||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
|
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
|
||||||
|
|
||||||
gt_words = words_normalized(gt_text)
|
gt_words = words_normalized(gt_text)
|
||||||
ocr_words = words_normalized(ocr_text)
|
ocr_words = words_normalized(ocr_text)
|
||||||
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
|
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
|
||||||
|
|
||||||
def json_float(value):
|
def json_float(value):
|
||||||
"""Convert a float value to a JSON float.
|
"""Convert a float value to a JSON float.
|
||||||
|
|
|
@ -8,6 +8,7 @@ import numpy as np
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
||||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||||
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
||||||
|
@ -75,6 +76,12 @@ def distance(s1, s2):
|
||||||
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
|
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
|
||||||
clusters. This should be the correct way to compare two Unicode strings.
|
clusters. This should be the correct way to compare two Unicode strings.
|
||||||
"""
|
"""
|
||||||
|
# XXX
|
||||||
|
from .cli import ExtractedText
|
||||||
|
if isinstance(s1, ExtractedText):
|
||||||
|
s1 = s1.text
|
||||||
|
if isinstance(s2, ExtractedText):
|
||||||
|
s2 = s2.text
|
||||||
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
||||||
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
||||||
return levenshtein(s1, s2)
|
return levenshtein(s1, s2)
|
||||||
|
|
|
@ -138,7 +138,7 @@ def page_extract(tree):
|
||||||
|
|
||||||
# XXX Does a file have to have regions etc.? region vs lines etc.
|
# XXX Does a file have to have regions etc.? region vs lines etc.
|
||||||
# Filter empty region texts
|
# Filter empty region texts
|
||||||
regions = (r for r in regions if r.text is not None)
|
regions = [r for r in regions if r.text is not None]
|
||||||
|
|
||||||
return ExtractedText(regions, '\n')
|
return ExtractedText(regions, '\n')
|
||||||
# FIXME needs to handle normalization
|
# FIXME needs to handle normalization
|
||||||
|
|
|
@ -26,12 +26,23 @@
|
||||||
border: 2px solid;
|
border: 2px solid;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
}
|
}
|
||||||
|
#status-box {
|
||||||
|
position: fixed;
|
||||||
|
background: grey;
|
||||||
|
color: white;
|
||||||
|
width: 100%;
|
||||||
|
height: 2em;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div id="status-box"> foo</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<div class="container">
|
<div class="container">
|
||||||
|
|
||||||
{{ gt }}<br>
|
{{ gt }}<br>
|
||||||
|
|
|
@ -4,11 +4,16 @@ function find_diff_class(classes) {
|
||||||
|
|
||||||
$(document).ready(function() {
|
$(document).ready(function() {
|
||||||
$('.diff').mouseover(function() {
|
$('.diff').mouseover(function() {
|
||||||
let c = find_diff_class($(this).attr('class'))
|
let c = find_diff_class($(this).attr('class'));
|
||||||
$('.' + c).addClass('diff-highlight')
|
$('.' + c).addClass('diff-highlight');
|
||||||
|
|
||||||
|
segment_id = $(this).attr('data-segment-id');
|
||||||
|
$('#status-box').text(segment_id);
|
||||||
});
|
});
|
||||||
$('.diff').mouseout(function() {
|
$('.diff').mouseout(function() {
|
||||||
let c = find_diff_class($(this).attr('class'))
|
let c = find_diff_class($(this).attr('class'));
|
||||||
$('.' + c).removeClass('diff-highlight')
|
$('.' + c).removeClass('diff-highlight');
|
||||||
|
|
||||||
|
$('#status-box').text('');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -32,6 +32,11 @@ def words(s):
|
||||||
cat = subcat[0]
|
cat = subcat[0]
|
||||||
return cat in unwanted_categories or subcat in unwanted_subcategories
|
return cat in unwanted_categories or subcat in unwanted_subcategories
|
||||||
|
|
||||||
|
# XXX
|
||||||
|
from .cli import ExtractedText
|
||||||
|
if isinstance(s, ExtractedText):
|
||||||
|
s = s.text
|
||||||
|
|
||||||
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
|
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
|
||||||
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
|
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
|
||||||
for word in uniseg.wordbreak.words(s):
|
for word in uniseg.wordbreak.words(s):
|
||||||
|
@ -42,10 +47,20 @@ def words(s):
|
||||||
|
|
||||||
|
|
||||||
def words_normalized(s):
|
def words_normalized(s):
|
||||||
|
# XXX
|
||||||
|
from .cli import ExtractedText
|
||||||
|
if isinstance(s, ExtractedText):
|
||||||
|
s = s.text
|
||||||
return words(unicodedata.normalize('NFC', s))
|
return words(unicodedata.normalize('NFC', s))
|
||||||
|
|
||||||
|
|
||||||
def word_error_rate_n(reference, compared) -> Tuple[float, int]:
|
def word_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||||
|
# XXX
|
||||||
|
from .cli import ExtractedText
|
||||||
|
if isinstance(reference, ExtractedText):
|
||||||
|
reference = reference.text
|
||||||
|
if isinstance(compared, ExtractedText):
|
||||||
|
compared = compared.text
|
||||||
if isinstance(reference, str):
|
if isinstance(reference, str):
|
||||||
reference_seq = list(words_normalized(reference))
|
reference_seq = list(words_normalized(reference))
|
||||||
compared_seq = list(words_normalized(compared))
|
compared_seq = list(words_normalized(compared))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue