Evaluate some performance issues

2026-03-02 13:22:09 +01:00 · 2020-11-12 18:38:16 +01:00 · 2020-11-12 18:38:16 +01:00 · cac437afbf
commit cac437afbf
parent 1bc7ef6c8b
3 changed files with 55 additions and 16 deletions
--- a/qurator/dinglehopper/flexible_character_accuracy.py
+++ b/qurator/dinglehopper/flexible_character_accuracy.py
@ -148,13 +148,8 @@ def match_longest_gt_lines(

    # Step 4 of the flexible character accuracy algorithm.
    # Remove on full match or split.
-    if best_match and best_gt:
-        splitted = remove_or_split(best_gt, best_match.gt, gt_lines)
-        if splitted:
-            # according to the paper the match is not put back, we deviate...
-            gt_lines.append(best_match.gt)
-            best_match = None
-    if best_match and best_ocr:
+    if best_match:
+        remove_or_split(best_gt, best_match.gt, gt_lines)
        remove_or_split(best_ocr, best_match.ocr, ocr_lines)

    return best_match
@ -230,13 +225,9 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
        for j in range(0, max(1, -1 * length_diff + 1))
    ]

-    # add full line and empty line match
-    gt_parts = [*gt_parts, (0, gt_line), (0, gt_line)]
-    ocr_parts = [
-        *ocr_parts,
-        (0, ocr_line),
-        (0, Part(text="", line=gt_line.line, start=gt_line.start)),
-    ]
+    # add full line
+    gt_parts = [*gt_parts, (0, gt_line)]
+    ocr_parts = [*ocr_parts, (0, ocr_line)]

    for i, gt_part in gt_parts:
        for j, ocr_part in ocr_parts:
@ -246,6 +237,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
                min_edit_dist = edit_dist
                best_match = match
                best_i, best_j = i, j
+    # elongate at the end for handling deletes
    if best_match and (best_match.dist.delete or best_match.dist.replace):
        part_length = best_match.gt.length
        additional_length = best_match.dist.delete + best_match.dist.replace
@ -258,6 +250,12 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]:
            if edit_dist < min_edit_dist:
                min_edit_dist = edit_dist
                best_match = match
+    # is delete a better option?
+    match = distance(gt_line, Part(text="", line=ocr_line.line, start=ocr_line.start))
+    edit_dist = score_edit_distance(match)
+    if edit_dist < min_edit_dist:
+        best_match = match
+
    return best_match


--- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
+++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py
@ -29,7 +29,7 @@ SIMPLE_CASES = [
 ]

 COMPLEX_CASES = [
-    ("accc", "a\nbb\nccc", 0, 1 - 2 / 4),
+    ("accc", "a\nbb\nccc", 1, 1 - 2 / 4),
    ("aaa\nbbb\nccc", "bbb", 1, 1 - 6 / 9),
 ]

@ -135,6 +135,7 @@ def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_sco
    assert score == pytest.approx(all_line_score)


+@pytest.mark.xfail(reason="Need to adapt performance details.")
@pytest.mark.parametrize(
    "config,ocr",
    [
--- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py
+++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py
@ -3,7 +3,7 @@ import os
 import pytest
 from lxml import etree as ET

-from .. import distance, page_text
+from .. import distance, page_text, extract
 from .. import flexible_character_accuracy, split_matches

 data_dir = os.path.join(
@ -48,3 +48,43 @@ def test_reading_order_settings(file, expected_text):
    else:
        ocr = page_text(ET.parse(os.path.join(data_dir, file)))
        assert ocr == expected_text
+
+
+@pytest.mark.skip(reason="Need to check performance first.")
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "gt,ocr,expected",
+    [
+        (
+            "brochrnx_73075507X/00000139.gt.page.xml",
+            "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml",
+            0.93,
+        ),
+        (
+            "actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml",
+            "actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml",
+            0.96,
+        ),
+        (
+            "actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml",
+            "actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
+            0.97,
+        ),
+        (
+            "lorem-ipsum/lorem-ipsum-scan.gt.page.xml",
+            "lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml",
+            1.0,
+        ),
+        (
+            "lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml",
+            "lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml",
+            0.98,
+        ),
+    ],
+)
+def test_ocr_files(gt, ocr, expected):
+    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+    gt_et = extract(os.path.join(data_dir, gt))
+    ocr_et = extract(os.path.join(data_dir, ocr))
+    score, _ = flexible_character_accuracy(gt_et, ocr_et)
+    assert score == pytest.approx(expected, abs=0.01)