From 5277593bdbd50f2dee361087b26c0a0580e6ac28 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 10 Nov 2020 12:33:49 +0100 Subject: [PATCH] Fix some special cases --- .../flexible_character_accuracy.py | 37 ++++++-- .../tests/test_flexible_character_accuracy.py | 86 ++++++++++++------- 2 files changed, 83 insertions(+), 40 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 3e4bb93..2b9a56f 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -117,6 +117,7 @@ def match_longest_gt_lines(gt_lines: List["Part"], if best_match and best_gt: splitted = remove_or_split(best_gt, best_match.gt, gt_lines) if splitted: + # according to the paper the match is not put back, we deviate... gt_lines.append(best_match.gt) best_match = None if best_match and best_ocr: @@ -134,13 +135,12 @@ def match_gt_line(gt_line: "Part", Reference: contains steps 3 of the flexible character accuracy algorithm. TODO: Make penalty function configurable? - TODO: Add empty ocr line to avoid having nonesense one character alignments? :return: Match object and the matched ocr line. """ min_penalty = float('inf') best_match, best_ocr = None, None - for ocr_line in ocr_lines: + for ocr_line in [*ocr_lines]: match = match_lines(gt_line, ocr_line) penalty = calculate_penalty(gt_line, ocr_line, match, coef) if penalty < min_penalty: @@ -177,20 +177,42 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: Reference: see figure 2 in the paper. TODO: make distance function configurable? + TODO: rethink @lru_cache :return: Match object if one is found. """ min_length = min(gt_line.length, ocr_line.length) best_match = None + best_i, best_j = 0, 0 if min_length == 0: return best_match length_diff = gt_line.length - ocr_line.length min_edit_dist = float('inf') - # TODO: handle deletes and replacements by extending the length. - for i in range(0, max(1, length_diff + 1)): - for j in range(0, max(1, -1 * length_diff + 1)): - match = distance(gt_line.substring(rel_start=i, rel_end=i + min_length), - ocr_line.substring(rel_start=j, rel_end=j + min_length)) + + gt_parts = [(i, gt_line.substring(rel_start=i, rel_end=i + min_length)) + for i in range(0, max(1, length_diff + 1))] + ocr_parts = [(j, ocr_line.substring(rel_start=j, rel_end=j + min_length)) + for j in range(0, max(1, -1 * length_diff + 1))] + + # add full line and empty line match + gt_parts = [*gt_parts, (0, gt_line), (0, gt_line)] + ocr_parts = [*ocr_parts, (0, ocr_line), + (0, Part(text="", line=gt_line.line, start=gt_line.start))] + + for i, gt_part in gt_parts: + for j, ocr_part in ocr_parts: + match = distance(gt_part, ocr_part) + edit_dist = score_edit_distance(match) + if edit_dist < min_edit_dist: + min_edit_dist = edit_dist + best_match = match + best_i, best_j = i, j + if best_match and (best_match.dist.delete or best_match.dist.replace): + part_length = best_match.gt.length + additional_length = best_match.dist.delete + best_match.dist.replace + for k in range(part_length + 1, part_length + additional_length + 1): + match = distance(gt_line.substring(rel_start=best_i, rel_end=best_i + k), + ocr_line.substring(rel_start=best_j, rel_end=best_j + k)) edit_dist = score_edit_distance(match) if edit_dist < min_edit_dist: min_edit_dist = edit_dist @@ -205,6 +227,7 @@ def distance(gt: "Part", ocr: "Part") -> "Match": Using the already available `editops()` function with the Levenshtein distance. TODO: replace with @cache annotation in Python 3.9 + TODO: rethink @lru_cache :return: Match object containing the lines and the editing operations. """ diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 6393e2f..0126696 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -33,6 +33,7 @@ COMPLEX_CASES = [ ] EXTENDED_CASES = [ + # See figure 4 in 10.1016/j.patrec.2020.02.003 # A: No errors ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), @@ -70,14 +71,18 @@ EXTENDED_CASES = [ EDIT_ARGS = "gt,ocr,expected_dist" SIMPLE_EDITS = [ - (Part(text="a").substring(), Part(text="a"), Distance(match=1)), - (Part(text="a").substring(), Part(text="b"), Distance(replace=1)), - (Part(text="abcd").substring(), Part(text="beed"), + (Part(text="a"), Part(text="a"), Distance(match=1)), + (Part(text="aaa"), Part(text="aaa"), Distance(match=3)), + (Part(text="abcd"), Part(text="beed"), Distance(match=2, replace=1, insert=1, delete=1)), ] def extended_case_to_text(gt, ocr): + """Generate sentence from reading order encoding. + + See figure 4 in 10.1016/j.patrec.2020.02.003 + """ sentence = ("Eight", "happy", "frogs", "scuba", "dived", "Jenny", "chick", "flaps", "white", "wings", "", "\n") @@ -93,38 +98,45 @@ def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_ assert score == pytest.approx(all_line_score) -@pytest.mark.xfail -@pytest.mark.parametrize("ocr", [ - "1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"\nAlberto\nEmstein", - "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"", - "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" +@pytest.mark.parametrize("config,ocr", [ + ("Config I", + "1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"\nAlberto\nEmstein" + ), + ("Config II", + "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"" + ), + ("Config III", + "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" + ), ]) -def test_flexible_character_accuracy(ocr): - """Tests from figure 3 in the paper. - - TODO: We have a 2 percent deviation from the original because of redistributed - one character alignments (e.g. the y-insert replaces the y-delete). - """ - gt = """"I have -no special -talent. -I am only -passionately -curious." -Albert -Einstein -""" - replacements = 3 - inserts = 5 - deletes = 7 +def test_flexible_character_accuracy(config, ocr): + """Tests from figure 3 in the paper.""" + gt = "\"I have\nno special\ntalent." \ + "\nI am only\npassionately\ncurious.\"" \ + "\nAlbert\nEinstein" + replacements, inserts, deletes = 3, 5, 7 chars = len(gt) - gt.count("\n") - assert replacements + inserts + deletes == 15 - edits = Distance(match=chars - deletes - replacements, replace=replacements, - insert=inserts, delete=deletes) - expected = character_accuracy(edits) - assert expected == pytest.approx(0.779, abs=0.0005) + assert chars == 68 + + # We consider whitespace as error and in Config II two additional + # whitespaces have been introduced. One will be counted as insert. + # The other whitespace will be counted as replacement, + # additionally reducing the number of deletes. + if config == "Config II": + inserts += 1 + replacements += 1 + deletes -= 1 + + expected_dist = Distance(match=chars - deletes - replacements, replace=replacements, + insert=inserts, delete=deletes) + expected_score = character_accuracy(expected_dist) + result, matches = flexible_character_accuracy(gt, ocr) - assert result == pytest.approx(expected, abs=0.0005) + agg = reduce(lambda acc, match: acc + Counter(match.dist._asdict()), + matches, Counter()) + dist = Distance(**agg) + assert dist == expected_dist + assert result == pytest.approx(expected_score, abs=0.0005) @pytest.mark.parametrize(CASE_ARGS, EXTENDED_CASES) @@ -191,9 +203,15 @@ def test_remove_or_split(original, match, expected_lines): @pytest.mark.parametrize(EDIT_ARGS, [ *SIMPLE_EDITS, + (Part(text="a"), Part(text="b"), Distance(delete=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)), (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), - (Part(text=""), Part(text=""), None) + (Part(text=""), Part(text=""), None), + (Part(text="abcd"), Part(text="acd"), Distance(match=3, delete=1)), + (Part(text="abc"), Part(text="abdc"), Distance(match=3, insert=1)), + (Part(text="aaabbbaaaddd"), Part(text="aaabcbaaa"), Distance(match=8, replace=1)), + (Part(text="aaabbbccc"), Part(text="aaabbbdddccc"), Distance(match=9, insert=3)), ]) def test_match_lines(gt, ocr, expected_dist): match = match_lines(gt, ocr) @@ -210,6 +228,8 @@ def test_match_lines(gt, ocr, expected_dist): (Part(text="").substring(), Part(text=""), Distance()), (Part(text="ab").substring(), Part("a"), Distance(match=1, delete=1)), (Part(text="a").substring(), Part("ab"), Distance(match=1, insert=1)), + (Part(text="a"), Part(text="b"), Distance(replace=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(replace=3)), ]) def test_distance(gt, ocr, expected_dist): match = distance(gt, ocr)