From 1bc7ef6c8b06daa169f7e4e61043b5fc060f7d7e Mon Sep 17 00:00:00 2001
From: Benjamin Rosemann <benjamin.rosemann@la-bw.de>
Date: Thu, 12 Nov 2020 16:23:04 +0100
Subject: [PATCH] Correct report for fca

As the fca implementation already knows the editing operations for each
segment, we use a different, linewise sequence alignment method for its
report.
---
 qurator/dinglehopper/align.py | 8 ++++++++
 qurator/dinglehopper/cli.py   | 8 +++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index ede75f4..08bb3f5 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -8,6 +8,14 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
+def seq_align_linewise(s1, s2, ops):
+    """Align two lists of lines linewise."""
+    assert len(s1) == len(s2)  # every line of s1 needs a counterpart in s2
+    assert len(s2) == len(ops)  # ... and one ops entry per line pair
+    for l1, l2, line_ops in zip(s1, s2, ops):
+        yield from seq_align(l1, l2, ops=line_ops)  # results chained in line order
+
+
 def seq_align(s1, s2, ops=None):
     """Align general sequences."""
     s1 = list(s1)
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 46fc0b0..9a2a837 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters
 from .character_error_rate import character_error_rate_n
 from .flexible_character_accuracy import flexible_character_accuracy, split_matches
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
+from .align import seq_align, seq_align_linewise
 from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config
@@ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
             return "{html_t}".format(html_t=html_t)
 
     ops, ocr_ids = None, None
+    seq_align_fun = seq_align
     if matches:
+        seq_align_fun = seq_align_linewise
         gt_things, ocr_things, ops = split_matches(matches)
         # we have to reconstruct the order of the ocr because we mixed it for fca
         ocr_lines = [match.ocr for match in matches]
@@ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
 
     g_pos = 0
     o_pos = 0
-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
+    for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
         css_classes = None
         gt_id = None
         ocr_id = None
@@ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
                 # support this, i.e. display for the one id produced
                 gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
                 if ocr_ids:
-                    ocr_id = ocr_ids[o_pos]
+                    ocr_id = ocr_ids.get(o_pos, None)
                 else:
                     ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None