Correct report for fca

As the fca (flexible character accuracy) implementation already knows the
editing operations for each segment, we use a different sequence alignment method.
pull/47/head
Benjamin Rosemann 5 years ago
parent 750ad00d1b
commit 1bc7ef6c8b

@ -8,6 +8,14 @@ def align(t1, t2):
return seq_align(s1, s2)
def seq_align_linewise(s1, s2, ops):
    """Align two lists of lines linewise.

    The line at each index of ``s1`` is aligned with the line at the same
    index of ``s2`` by delegating to ``seq_align``, passing the entry of
    ``ops`` at that index as its ``ops`` argument.  The results for all
    lines are yielded one after another.

    ``s1``, ``s2`` and ``ops`` must support ``len()`` and have the same
    length.

    Raises AssertionError if the lengths differ.  As this is a generator,
    the check runs when iteration starts, not when the function is called.
    """
    # Raise explicitly instead of using ``assert`` statements: asserts are
    # stripped under ``python -O`` and ``zip`` would then silently drop the
    # surplus lines of the longer input.
    if len(s1) != len(s2):
        raise AssertionError(
            "s1 and s2 differ in length: {} != {}".format(len(s1), len(s2))
        )
    if len(s2) != len(ops):
        raise AssertionError(
            "s2 and ops differ in length: {} != {}".format(len(s2), len(ops))
        )
    for l1, l2, line_ops in zip(s1, s2, ops):
        yield from seq_align(l1, l2, ops=line_ops)
def seq_align(s1, s2, ops=None):
"""Align general sequences."""
s1 = list(s1)

@ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters
from .character_error_rate import character_error_rate_n
from .flexible_character_accuracy import flexible_character_accuracy, split_matches
from .word_error_rate import word_error_rate_n, words_normalized
from .align import seq_align
from .align import seq_align, seq_align_linewise
from .extracted_text import ExtractedText
from .ocr_files import extract
from .config import Config
@ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
return "{html_t}".format(html_t=html_t)
ops, ocr_ids = None, None
seq_align_fun = seq_align
if matches:
seq_align_fun = seq_align_linewise
gt_things, ocr_things, ops = split_matches(matches)
# we have to reconstruct the order of the ocr because we mixed it for fca
ocr_lines = [match.ocr for match in matches]
@ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
g_pos = 0
o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
css_classes = None
gt_id = None
ocr_id = None
@ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
# support this, i.e. display for the one id produced
gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
if ocr_ids:
ocr_id = ocr_ids[o_pos]
ocr_id = ocr_ids.get(o_pos, None)
else:
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None

Loading…
Cancel
Save