Correct report for fca

As the fca (flexible character accuracy) implementation already knows the editing operations for each segment, we use a different sequence alignment method.
2026-07-29 15:02:33 +02:00 · 2020-11-12 16:23:04 +01:00 · 2020-11-12 16:23:04 +01:00 · 1bc7ef6c8b
commit 1bc7ef6c8b
parent 750ad00d1b
2 changed files with 13 additions and 3 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -8,6 +8,14 @@ def align(t1, t2):
    return seq_align(s1, s2)


+def seq_align_linewise(s1, s2, ops):
+    """Align two lists of lines linewise."""
+    assert len(s1) == len(s2)
+    assert len(s2) == len(ops)
+    for l1, l2, line_ops in zip(s1, s2, ops):
+        yield from seq_align(l1, l2, ops=line_ops)
+
+
 def seq_align(s1, s2, ops=None):
    """Align general sequences."""
    s1 = list(s1)
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters
 from .character_error_rate import character_error_rate_n
 from .flexible_character_accuracy import flexible_character_accuracy, split_matches
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
+from .align import seq_align, seq_align_linewise
 from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config
@@ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
            return "{html_t}".format(html_t=html_t)

    ops, ocr_ids = None, None
+    seq_align_fun = seq_align
    if matches:
+        seq_align_fun = seq_align_linewise
        gt_things, ocr_things, ops = split_matches(matches)
        # we have to reconstruct the order of the ocr because we mixed it for fca
        ocr_lines = [match.ocr for match in matches]
@@ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):

    g_pos = 0
    o_pos = 0
-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
+    for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
        css_classes = None
        gt_id = None
        ocr_id = None
@@ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
                # support this, i.e. display for the one id produced
                gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
                if ocr_ids:
-                    ocr_id = ocr_ids[o_pos]
+                    ocr_id = ocr_ids.get(o_pos, None)
                else:
                    ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None