diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 607cd39..f44c114 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -46,8 +46,8 @@ def flexible_character_accuracy( Reference: contains steps 1-7 of the flexible character accuracy algorithm. - :param gt: The ground truth text. - :param ocr: The text to compare the ground truth with. + :param gt: The ground truth ExtractedText object. + :param ocr: The ExtractedText object to compare the ground truth with. :return: Score between 0 and 1 and match objects. """ return flexible_character_accuracy(gt.text, ocr.text) @@ -66,11 +66,11 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: best_score = -float("inf") best_matches = [] - # TODO: this should be configurable + # TODO: should this be configurable? combinations = product( range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) ) - # TODO: place to parallelize the algorithm + # TODO: place to parallelize the algorithm? for (edit_dist, length_diff, offset, length) in combinations: coef = Coefficients( edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length @@ -89,7 +89,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]: - """Match ground truth with ocr and considers a given set of coefficients. + """Match ground truth with ocr and consider a given set of coefficients. Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. @@ -128,7 +128,8 @@ def match_longest_gt_lines( """Find the best match for the longest line(s) in ground truth. The longest lines in ground truth are matched against lines in ocr to find the - best matching pair. This pair is then either considered a match on full line + best matching pair. 
This pair is then either considered a match on a full line + or the line(s) is split and the non-matching parts are added back to the list. Reference: contains steps 3 and 4 of the flexible character accuracy algorithm. @@ -139,11 +140,12 @@ def match_longest_gt_lines( return best_match # Step 3 of the flexible character accuracy algorithm (variation). - # Instead of the longest line we take all longest lines with equal length. - length = min(gt_lines[0].length, ocr_lines[0].length) - for gt_line in takewhile(lambda line: line.length >= length, gt_lines): + # We do not only take the longest line from ground truth but decide on a length + # threshold and take all lines from ground truth bigger than the threshold. + length_threshold = min(gt_lines[0].length, ocr_lines[0].length) - 1 + for gt_line in takewhile(lambda line: line.length > length_threshold, gt_lines): match, ocr_line = match_gt_line(gt_line, ocr_lines, coef) - score = 0 if not match else character_accuracy(match.dist) + score = -float("inf") if not match else character_accuracy(match.dist) if score > best_score: best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line # early breaking: we only need one perfect fit @@ -191,34 +193,17 @@ def match_gt_line( return best_match, best_ocr -def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: - """Removes the matched line or splits it into parts. - - Reference: contains step 4 of the flexible character accuracy algorithm. - - :return: True if line was splitted. 
- """ - splitted = False - del lines[lines.index(original)] - if match.length < original.length: - lines.extend(original.split(match)) - # sorting for ocr is not mentioned in the paper, but is used as tie breaking =) - lines.sort(key=lambda x: x.length, reverse=True) - splitted = True - return splitted - - -@lru_cache(maxsize=1000000) +@lru_cache(maxsize=10000) def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: - """Matches two lines searching for a local alignment. + """Matches two lines searching for a naive local alignment. The shorter line is moved along the longer line until the editing distance is minimized. - Reference: see figure 2 in the paper. + Reference: see figure 2 in doi:10.1016/j.patrec.2020.02.003. TODO: make distance function configurable? - TODO: rethink @lru_cache + TODO: use @cache annotation in Python 3.9? :return: Match object if one is found. """ @@ -273,14 +258,14 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: return best_match -@lru_cache(maxsize=1000000) +@lru_cache(maxsize=10000) def distance(gt: "Part", ocr: "Part") -> Match: """Calculate the editing distance between the two lines. Using the already available `editops()` function with the Levenshtein distance. - TODO: replace with @cache annotation in Python 3.9 - TODO: rethink @lru_cache + TODO: use @cache annotation in Python 3.9? + TODO: wait for qurator-spk/dinglehopper#48 for efficient editops. :return: Match object containing the lines and the editing operations. """ @@ -300,7 +285,7 @@ def score_edit_distance(dist: Distance) -> int: return dist.delete + dist.insert + 2 * dist.replace -@lru_cache(1000000) +@lru_cache(10000) def calculate_penalty( gt_length: int, ocr_length: int, @@ -336,7 +321,6 @@ def character_accuracy_for_matches(matches: List[Match]) -> float: """Character accuracy of a full text represented by a list of matches. See other `character_accuracy` for details. 
- """ agg = reduce( lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() ) @@ -355,7 +339,7 @@ def character_accuracy(edits: Distance) -> float: Errors are replacements, deletes and inserts. - Note that is is possible to have more errors than characters in which case the + Note that it is possible to have more errors than characters in which case the character accuracy turns negative. Comparing two empty strings (having no edits) results in a character accuracy of 1. @@ -391,10 +375,30 @@ def initialize_lines(text: str) -> List["Part"]: return lines -def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]: +def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: + """Removes the matched line or splits it into parts. + + Reference: contains step 4 of the flexible character accuracy algorithm. + + :return: True if the line was split. + """ + splitted = False + del lines[lines.index(original)] + if match.length < original.length: + lines.extend(original.split(match)) + # sorting for ocr is not mentioned in the paper, but is used as tie breaking =) + lines.sort(key=lambda x: x.length, reverse=True) + splitted = True + return splitted + + +def split_matches( + matches: List[Match], linesep="\n" +) -> Tuple[List[str], List[str], List[List]]: """Extracts text segments and editing operations in separate lists. :param matches: List of match objects. + :param linesep: Character(s) used for line separation. :return: List of ground truth segments, ocr segments and editing operations. 
""" matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000) @@ -402,9 +406,9 @@ def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List gt, ocr, ops = [], [], [] for match in matches: if match.gt.line > line: - gt.append("\n") - ocr.append("\n") - ops.append([]) + gt.append(linesep) + ocr.append(linesep) + ops.extend([[]] * len(linesep)) line = match.gt.line gt.append(match.gt.text) ocr.append(match.ocr.text) diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 2cbfdbd..ad62798 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -80,7 +80,7 @@ SIMPLE_EDITS = [ def extended_case_to_text(gt, ocr): """Generate sentence from reading order encoding. - See figure 4 in 10.1016/j.patrec.2020.02.003 + See figure 4 in 10.1016/j.patrec.2020.02.003. """ sentence = ( "Eight", @@ -159,7 +159,7 @@ def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_sco ], ) def test_flexible_character_accuracy(config, ocr): - """Tests from figure 3 in the paper.""" + """Tests from figure 3 in doi:10.1016/j.patrec.2020.02.003.""" gt = ( '"I have\nno special\ntalent.\n' 'I am only\npassionately\ncurious."\n' @@ -199,7 +199,7 @@ def test_flexible_character_accuracy(config, ocr): def test_flexible_character_accuracy_extended( gt, ocr, first_line_score, all_line_score ): - """Tests from figure 4 in the paper.""" + """Tests from figure 4 in doi:10.1016/j.patrec.2020.02.003.""" gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) assert result == pytest.approx(all_line_score, abs=0.001) diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py index dec0799..3b3ecba 100644 --- 
a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -6,14 +6,13 @@ from lxml import etree as ET from .. import distance, page_text, extract from .. import flexible_character_accuracy, split_matches -data_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "data", "table-order" -) - @pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"]) @pytest.mark.integration def test_fac_ignoring_reading_order(file): + data_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "data", "table-order" + ) expected = "1\n2\n3\n4\n5\n6\n7\n8\n9" gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml"))) @@ -42,6 +41,9 @@ def test_fac_ignoring_reading_order(file): ) @pytest.mark.integration def test_reading_order_settings(file, expected_text): + data_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "data", "table-order" + ) if "table-unordered.xml" == file: with pytest.raises(NotImplementedError): page_text(ET.parse(os.path.join(data_dir, file)))