From 0ef7810dd0967ba86a94b559e125fbe6fda664db Mon Sep 17 00:00:00 2001
From: Benjamin Rosemann <benjamin.rosemann@la-bw.de>
Date: Fri, 13 Nov 2020 11:45:55 +0100
Subject: [PATCH] Reduce number of splits for short (one char) elements

---
 qurator/dinglehopper/flexible_character_accuracy.py    | 10 ++++++----
 .../tests/test_flexible_character_accuracy.py          |  4 +++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py
index ed72764..884bf1b 100644
--- a/qurator/dinglehopper/flexible_character_accuracy.py
+++ b/qurator/dinglehopper/flexible_character_accuracy.py
@@ -145,9 +145,11 @@ def match_longest_gt_lines(
         score = 0 if not match else character_accuracy(match.dist)
         if score > best_score:
             best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line
+        # early breaking: we only need one perfect fit
+        if best_score >= 1:
+            break
 
     # Step 4 of the flexible character accuracy algorithm.
-    # Remove on full match or split.
     if best_match:
         remove_or_split(best_gt, best_match.gt, gt_lines)
         remove_or_split(best_ocr, best_match.ocr, ocr_lines)
@@ -168,7 +170,7 @@ def match_gt_line(
     """
     min_penalty = float("inf")
     best_match, best_ocr = None, None
-    for ocr_line in [*ocr_lines]:
+    for ocr_line in ocr_lines:
         match = match_lines(gt_line, ocr_line)
         if match:
             penalty = calculate_penalty(gt_line, ocr_line, match, coef)
@@ -233,7 +235,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
         for j, ocr_part in ocr_parts:
             match = distance(gt_part, ocr_part)
             edit_dist = score_edit_distance(match)
-            if edit_dist < min_edit_dist:
+            if edit_dist < min_edit_dist and match.dist.replace < min_length:
                 min_edit_dist = edit_dist
                 best_match = match
                 best_i, best_j = i, j
@@ -247,7 +249,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
                 ocr_line.substring(rel_start=best_j, rel_end=best_j + k),
             )
             edit_dist = score_edit_distance(match)
-            if edit_dist < min_edit_dist:
+            if edit_dist < min_edit_dist and match.dist.replace < min_length:
                 min_edit_dist = edit_dist
                 best_match = match
     # is delete a better option?
diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
index 9529c87..6f30b71 100644
--- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
+++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
@@ -26,6 +26,7 @@ SIMPLE_CASES = [
     ("bbb", "aaa\nbbb\nccc", 1, 1 - 6 / 3),
     ("a", "a\nbb\nccc", 1, 1 - 5 / 1),
     ("bb", "a\nbb\nccc", 1, 1 - 4 / 2),
+    ("abcd", "ab\ne", 1, 1 - 3 / 4),
 ]
 
 COMPLEX_CASES = [
@@ -135,7 +136,6 @@ def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_sco
     assert score == pytest.approx(all_line_score)
 
 
-@pytest.mark.xfail(reason="Need to adapt performance details.")
 @pytest.mark.parametrize(
     "config,ocr",
     [
@@ -273,6 +273,8 @@ def test_remove_or_split(original, match, expected_lines):
     [
         *SIMPLE_EDITS,
         (Part(text="a"), Part(text="b"), Distance(delete=1)),
+        (Part(text="ab"), Part(text="c"), Distance(delete=2)),
+        (Part(text="abc"), Part(text="d"), Distance(delete=3)),
         (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)),
         (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)),
         (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)),