mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-08 19:30:01 +02:00
Correct report for fca
As the fca implementation already knows the editing operations for each segment, we use a different sequence alignment method.
This commit is contained in:
parent
750ad00d1b
commit
1bc7ef6c8b
2 changed files with 13 additions and 3 deletions
|
@ -8,6 +8,14 @@ def align(t1, t2):
|
||||||
return seq_align(s1, s2)
|
return seq_align(s1, s2)
|
||||||
|
|
||||||
|
|
||||||
|
def seq_align_linewise(s1, s2, ops):
    """Align two lists of lines linewise.

    The line of ``s1`` and the line of ``s2`` at the same index are passed to
    ``seq_align`` together with the entry of ``ops`` at that index (as the
    ``ops`` keyword argument). Whatever ``seq_align`` yields for each line is
    yielded here, one line after the other.

    This is a generator, so the length checks run when iteration starts, not
    when the function is called.

    Raises:
        ValueError: If ``s1``, ``s2`` and ``ops`` do not all have the same
            length.
    """
    # Explicit checks instead of ``assert``: assertions are stripped under
    # ``python -O``, and ``zip`` would then silently drop the surplus lines.
    if len(s1) != len(s2):
        raise ValueError(
            "s1 and s2 must have the same number of lines, "
            "got {} and {}".format(len(s1), len(s2))
        )
    if len(s2) != len(ops):
        raise ValueError(
            "ops must have one entry per line, "
            "got {} entries for {} lines".format(len(ops), len(s2))
        )
    for l1, l2, line_ops in zip(s1, s2, ops):
        yield from seq_align(l1, l2, ops=line_ops)
|
||||||
|
|
||||||
|
|
||||||
def seq_align(s1, s2, ops=None):
|
def seq_align(s1, s2, ops=None):
|
||||||
"""Align general sequences."""
|
"""Align general sequences."""
|
||||||
s1 = list(s1)
|
s1 = list(s1)
|
||||||
|
|
|
@ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters
|
||||||
from .character_error_rate import character_error_rate_n
|
from .character_error_rate import character_error_rate_n
|
||||||
from .flexible_character_accuracy import flexible_character_accuracy, split_matches
|
from .flexible_character_accuracy import flexible_character_accuracy, split_matches
|
||||||
from .word_error_rate import word_error_rate_n, words_normalized
|
from .word_error_rate import word_error_rate_n, words_normalized
|
||||||
from .align import seq_align
|
from .align import seq_align, seq_align_linewise
|
||||||
from .extracted_text import ExtractedText
|
from .extracted_text import ExtractedText
|
||||||
from .ocr_files import extract
|
from .ocr_files import extract
|
||||||
from .config import Config
|
from .config import Config
|
||||||
|
@ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
|
||||||
return "{html_t}".format(html_t=html_t)
|
return "{html_t}".format(html_t=html_t)
|
||||||
|
|
||||||
ops, ocr_ids = None, None
|
ops, ocr_ids = None, None
|
||||||
|
seq_align_fun = seq_align
|
||||||
if matches:
|
if matches:
|
||||||
|
seq_align_fun = seq_align_linewise
|
||||||
gt_things, ocr_things, ops = split_matches(matches)
|
gt_things, ocr_things, ops = split_matches(matches)
|
||||||
# we have to reconstruct the order of the ocr because we mixed it for fca
|
# we have to reconstruct the order of the ocr because we mixed it for fca
|
||||||
ocr_lines = [match.ocr for match in matches]
|
ocr_lines = [match.ocr for match in matches]
|
||||||
|
@ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
|
||||||
|
|
||||||
g_pos = 0
|
g_pos = 0
|
||||||
o_pos = 0
|
o_pos = 0
|
||||||
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
|
for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
|
||||||
css_classes = None
|
css_classes = None
|
||||||
gt_id = None
|
gt_id = None
|
||||||
ocr_id = None
|
ocr_id = None
|
||||||
|
@ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
|
||||||
# support this, i.e. display for the one id produced
|
# support this, i.e. display for the one id produced
|
||||||
gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
|
gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
|
||||||
if ocr_ids:
|
if ocr_ids:
|
||||||
ocr_id = ocr_ids[o_pos]
|
ocr_id = ocr_ids.get(o_pos, None)
|
||||||
else:
|
else:
|
||||||
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None
|
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue