Small corrections

2025-12-14 06:34:16 +01:00 · 2020-11-23 09:18:22 +01:00 · 2020-11-23 09:18:22 +01:00 · 0dd5fc0ee5
commit 0dd5fc0ee5
parent b24d8d5664
3 changed files with 54 additions and 48 deletions
--- a/qurator/dinglehopper/flexible_character_accuracy.py
+++ b/qurator/dinglehopper/flexible_character_accuracy.py
@ -46,8 +46,8 @@ def flexible_character_accuracy(

    Reference: contains steps 1-7 of the flexible character accuracy algorithm.

-    :param gt: The ground truth text.
-    :param ocr: The text to compare the ground truth with.
+    :param gt: The ground truth ExtractedText object.
+    :param ocr: The ExtractedText object to compare the ground truth with.
    :return: Score between 0 and 1 and match objects.
    """
    return flexible_character_accuracy(gt.text, ocr.text)
@ -66,11 +66,11 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]:

    best_score = -float("inf")
    best_matches = []
-    # TODO: this should be configurable
+    # TODO: should this be configurable?
    combinations = product(
        range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1)
    )
-    # TODO: place to parallelize the algorithm
+    # TODO: place to parallelize the algorithm?
    for (edit_dist, length_diff, offset, length) in combinations:
        coef = Coefficients(
            edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length
@ -89,7 +89,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]:


 def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]:
-    """Match ground truth with ocr and considers a given set of coefficients.
+    """Match ground truth with ocr and consider a given set of coefficients.

    Reference: contains steps 1 - 6 of the flexible character accuracy algorithm.

@ -128,7 +128,8 @@ def match_longest_gt_lines(
    """Find the best match for the longest line(s) in ground truth.

    The longest lines in ground truth are matched against lines in ocr to find the
-    best matching pair. This pair is then either considered a match on full line
+    best matching pair. This pair is then either considered a match on a full line
+    or the line(s) is splitted and the non matching parts are added back to the list.

    Reference: contains steps 3 and 4 of the flexible character accuracy algorithm.

@ -139,11 +140,12 @@ def match_longest_gt_lines(
        return best_match

    # Step 3 of the flexible character accuracy algorithm (variation).
-    # Instead of the longest line we take all longest lines with equal length.
-    length = min(gt_lines[0].length, ocr_lines[0].length)
-    for gt_line in takewhile(lambda line: line.length >= length, gt_lines):
+    # We do not only take the longest line from ground truth but decide on a length
+    # threshold and take all lines from ground truth bigger than the threshold.
+    length_threshold = min(gt_lines[0].length, ocr_lines[0].length) - 1
+    for gt_line in takewhile(lambda line: line.length > length_threshold, gt_lines):
        match, ocr_line = match_gt_line(gt_line, ocr_lines, coef)
-        score = 0 if not match else character_accuracy(match.dist)
+        score = -float("inf") if not match else character_accuracy(match.dist)
        if score > best_score:
            best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line
        # early breaking: we only need one perfect fit
@ -191,34 +193,17 @@ def match_gt_line(
    return best_match, best_ocr


-def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool:
-    """Removes the matched line or splits it into parts.
-
-    Reference: contains step 4 of the flexible character accuracy algorithm.
-
-    :return: True if line was splitted.
-    """
-    splitted = False
-    del lines[lines.index(original)]
-    if match.length < original.length:
-        lines.extend(original.split(match))
-        # sorting for ocr is not mentioned in the paper, but is used as tie breaking =)
-        lines.sort(key=lambda x: x.length, reverse=True)
-        splitted = True
-    return splitted
-
-
-@lru_cache(maxsize=1000000)
+@lru_cache(maxsize=10000)
 def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
-    """Matches two lines searching for a local alignment.
+    """Matches two lines searching for a naive local alignment.

    The shorter line is moved along the longer line
    until the editing distance is minimized.

-    Reference: see figure 2 in the paper.
+    Reference: see figure 2 in the doi:10.1016/j.patrec.2020.02.003.

    TODO: make distance function configurable?
-    TODO: rethink @lru_cache
+    TODO: use @cache annotation in Python 3.9?

    :return: Match object if one is found.
    """
@ -273,14 +258,14 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
    return best_match


-@lru_cache(maxsize=1000000)
+@lru_cache(maxsize=10000)
 def distance(gt: "Part", ocr: "Part") -> Match:
    """Calculate the editing distance between the two lines.

    Using the already available `editops()` function with the Levenshtein distance.

-    TODO: replace with @cache annotation in Python 3.9
-    TODO: rethink @lru_cache
+    TODO: use @cache annotation in Python 3.9?
+    TODO: wait for qurator-spk/dinglehopper#48 for efficient editops.

    :return: Match object containing the lines and the editing operations.
    """
@ -300,7 +285,7 @@ def score_edit_distance(dist: Distance) -> int:
    return dist.delete + dist.insert + 2 * dist.replace


-@lru_cache(1000000)
+@lru_cache(10000)
 def calculate_penalty(
    gt_length: int,
    ocr_length: int,
@ -336,7 +321,6 @@ def character_accuracy_for_matches(matches: List[Match]) -> float:
    """Character accuracy of a full text represented by a list of matches.

    See other `character_accuracy` for details.
-
    """
    agg = reduce(
        lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter()
@ -355,7 +339,7 @@ def character_accuracy(edits: Distance) -> float:

    Errors are replacements, deletes and inserts.

-    Note that is is possible to have more errors than characters in which case the
+    Note that it is possible to have more errors than characters in which case the
    character accuracy turns negative.

    Comparing two empty strings (having no edits) results in a character accuracy of 1.
@ -391,10 +375,30 @@ def initialize_lines(text: str) -> List["Part"]:
    return lines


-def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]:
+def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool:
+    """Removes the matched line or splits it into parts.
+
+    Reference: contains step 4 of the flexible character accuracy algorithm.
+
+    :return: True if line was splitted.
+    """
+    splitted = False
+    del lines[lines.index(original)]
+    if match.length < original.length:
+        lines.extend(original.split(match))
+        # sorting for ocr is not mentioned in the paper, but is used as tie breaking =)
+        lines.sort(key=lambda x: x.length, reverse=True)
+        splitted = True
+    return splitted
+
+
+def split_matches(
+    matches: List[Match], linesep="\n"
+) -> Tuple[List[str], List[str], List[List]]:
    """Extracts text segments and editing operations in separate lists.

    :param matches: List of match objects.
+    :param linesep: Character(s) or line separation.
    :return: List of ground truth segments, ocr segments and editing operations.
    """
    matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000)
@ -402,9 +406,9 @@ def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List
    gt, ocr, ops = [], [], []
    for match in matches:
        if match.gt.line > line:
-            gt.append("\n")
-            ocr.append("\n")
-            ops.append([])
+            gt.append(linesep)
+            ocr.append(linesep)
+            ops.extend([[]] * len(linesep))
        line = match.gt.line
        gt.append(match.gt.text)
        ocr.append(match.ocr.text)
--- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
+++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
@ -80,7 +80,7 @@ SIMPLE_EDITS = [
 def extended_case_to_text(gt, ocr):
    """Generate sentence from reading order encoding.

-    See figure 4 in 10.1016/j.patrec.2020.02.003
+    See figure 4 in 10.1016/j.patrec.2020.02.003.
    """
    sentence = (
        "Eight",
@ -159,7 +159,7 @@ def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_sco
    ],
 )
 def test_flexible_character_accuracy(config, ocr):
-    """Tests from figure 3 in the paper."""
+    """Tests from figure 3 in the 10.1016/j.patrec.2020.02.003."""
    gt = (
        '"I have\nno special\ntalent.\n'
        'I am only\npassionately\ncurious."\n'
@ -199,7 +199,7 @@ def test_flexible_character_accuracy(config, ocr):
 def test_flexible_character_accuracy_extended(
    gt, ocr, first_line_score, all_line_score
 ):
-    """Tests from figure 4 in the paper."""
+    """Tests from figure 4 in the 10.1016/j.patrec.2020.02.003."""
    gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr)
    result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence)
    assert result == pytest.approx(all_line_score, abs=0.001)
--- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py
+++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py
@ -6,14 +6,13 @@ from lxml import etree as ET
 from .. import distance, page_text, extract
 from .. import flexible_character_accuracy, split_matches

-data_dir = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
-)
-

@pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"])
@pytest.mark.integration
 def test_fac_ignoring_reading_order(file):
+    data_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
+    )
    expected = "1\n2\n3\n4\n5\n6\n7\n8\n9"

    gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml")))
@ -42,6 +41,9 @@ def test_fac_ignoring_reading_order(file):
 )
@pytest.mark.integration
 def test_reading_order_settings(file, expected_text):
+    data_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
+    )
    if "table-unordered.xml" == file:
        with pytest.raises(NotImplementedError):
            page_text(ET.parse(os.path.join(data_dir, file)))