Include fca as parameter and add some tests

pull/47/head
Benjamin Rosemann 5 years ago
parent 9b76539936
commit 53064bf833

@@ -35,19 +35,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
their text and falls back to plain text if no ALTO or PAGE is detected.

The files GT and OCR are usually a ground truth document and the result of
-an OCR software, but you may use dinglehopper to compare two OCR results.
-In that case, use --no-metrics to disable the then meaningless metrics and
-also change the color scheme from green/red to blue.
+an OCR software, but you may use dinglehopper to compare two OCR results. In
+that case, use --metrics='' to disable the then meaningless metrics and also
+change the color scheme from green/red to blue.

The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-$REPORT_PREFIX defaults to "report". The reports include the character
-error rate (CER) and the word error rate (WER).
+$REPORT_PREFIX defaults to "report". Depending on your configuration the
+reports include the character error rate (CER), the word error rate (WER)
+and the flexible character accuracy (FCA).
+The metrics can be chosen via a comma separated combination of their acronyms
+like "--metrics=cer,wer,fca".

By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.

Options:
-  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --metrics                 Enable different metrics like cer, wer and fca.
   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
   --progress                Show progress bar
   --help                    Show this message and exit.
@@ -80,12 +84,12 @@ The OCR-D processor has these parameters:
| Parameter                 | Meaning                                                              |
| ------------------------- | -------------------------------------------------------------------- |
-| `-P metrics false`        | Disable metrics and the green-red color scheme (default: enabled)    |
+| `-P metrics cer,wer`      | Enable character error rate and word error rate (default)            |
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level)  |

For example:

~~~
-ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
+ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
~~~

Developer information

@@ -3,3 +3,4 @@ from .extracted_text import *
from .character_error_rate import *
from .word_error_rate import *
from .align import *
+from .flexible_character_accuracy import flexible_character_accuracy, split_matches

@@ -8,10 +8,11 @@ def align(t1, t2):
    return seq_align(s1, s2)


-def seq_align(s1, s2):
+def seq_align(s1, s2, ops=None):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    if not ops:
+        ops = seq_editops(s1, s2)
    i = 0
    j = 0
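A brief usage sketch of the extended signature (the package import path and the exact shape of the yielded pairs are assumptions, not shown in this diff): when `ops` is omitted, `seq_align()` derives the edit operations itself via `seq_editops()`; a caller that already knows them, such as the FCA diff report further down, can pass them in so the rendered alignment matches the FCA matching.

~~~python
# Hedged sketch; import path and output shape are assumed, not taken from this diff.
from qurator.dinglehopper import seq_align

pairs = list(seq_align("abcd", "abed"))  # ops computed internally via seq_editops()
# e.g. [('a', 'a'), ('b', 'b'), ('c', 'e'), ('d', 'd')] -- illustrative only
~~~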

@@ -6,6 +6,7 @@ from markupsafe import escape
from uniseg.graphemecluster import grapheme_clusters

from .character_error_rate import character_error_rate_n
+from .flexible_character_accuracy import flexible_character_accuracy, split_matches
from .word_error_rate import word_error_rate_n, words_normalized
from .align import seq_align
from .extracted_text import ExtractedText
@@ -13,7 +14,7 @@ from .ocr_files import extract
from .config import Config


-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, ops=None):
    gtx = ""
    ocrx = ""
@@ -53,7 +54,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    g_pos = 0
    o_pos = 0

-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
        css_classes = None
        gt_id = None
        ocr_id = None
@@ -83,28 +84,43 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    )


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
    """Check OCR result against GT.

-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions,
+    so we keep this undecorated version and use Click on a wrapper.
    """
+    cer, char_diff_report, n_characters = None, None, None
+    wer, word_diff_report, n_words = None, None, None
+    fca, fca_diff_report = None, None
+
    gt_text = extract(gt, textequiv_level=textequiv_level)
    ocr_text = extract(ocr, textequiv_level=textequiv_level)

-    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    wer, n_words = word_error_rate_n(gt_text, ocr_text)
-
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
-    )
-
-    gt_words = words_normalized(gt_text)
-    ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none=""
-    )
+    if "cer" in metrics or not metrics:
+        cer, n_characters = character_error_rate_n(gt_text, ocr_text)
+        char_diff_report = gen_diff_report(
+            gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+        )
+    if "wer" in metrics:
+        gt_words = words_normalized(gt_text)
+        ocr_words = words_normalized(ocr_text)
+        wer, n_words = word_error_rate_n(gt_text, ocr_text)
+        word_diff_report = gen_diff_report(
+            gt_words, ocr_words, css_prefix="w", joiner=" ", none=""
+        )
+    if "fca" in metrics:
+        fca, fca_matches = flexible_character_accuracy(gt_text.text, ocr_text.text)
+        fca_gt_segments, fca_ocr_segments, ops = split_matches(fca_matches)
+        fca_diff_report = gen_diff_report(
+            fca_gt_segments,
+            fca_ocr_segments,
+            css_prefix="c",
+            joiner="",
+            none="·",
+            ops=ops,
+        )

    def json_float(value):
        """Convert a float value to an JSON float.
@@ -137,8 +153,10 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
            n_characters=n_characters,
            wer=wer,
            n_words=n_words,
+            fca=fca,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
+            fca_diff_report=fca_diff_report,
            metrics=metrics,
        ).dump(out_fn)
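A usage sketch of the reworked keyword argument (the `qurator.dinglehopper.cli` import path is an assumption based on the relative imports above): the comma-separated string decides which metrics and diff reports are computed, and therefore which keys end up in report.html and report.json.

~~~python
# Hedged usage sketch; the package path is assumed from the relative imports above.
from qurator.dinglehopper.cli import process

# Computes CER and FCA plus their diff reports; WER stays None and is
# therefore left out of the rendered reports.
process("gt.txt", "ocr.txt", "report", metrics="cer,fca")
~~~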
@@ -148,7 +166,9 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
-    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+    "--metrics",
+    default="cer,wer",
+    help="Enable different metrics like cer, wer and fca.",
)
@click.option(
    "--textequiv-level",
@@ -166,12 +186,16 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    The files GT and OCR are usually a ground truth document and the result of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
-    that case, use --no-metrics to disable the then meaningless metrics and also
+    that case, use --metrics='' to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-    $REPORT_PREFIX defaults to "report". The reports include the character error
-    rate (CER) and the word error rate (WER).
+    $REPORT_PREFIX defaults to "report". Depending on your configuration the
+    reports include the character error rate (CER), the word error rate (WER)
+    and the flexible character accuracy (FCA).
+    The metrics can be chosen via a comma separated combination of their acronyms
+    like "--metrics=cer,wer,fca".

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.

@@ -270,7 +270,9 @@ def score_edit_distance(match: Match) -> int:
    return match.dist.delete + match.dist.insert + 2 * match.dist.replace


-def calculate_penalty(gt: "Part", ocr: "Part", match: Match, coef: Coefficients) -> float:
+def calculate_penalty(
+    gt: "Part", ocr: "Part", match: Match, coef: Coefficients
+) -> float:
    """Calculate the penalty for a given match.

    For details and discussion see Section 3 in doi:10.1016/j.patrec.2020.02.003.
@@ -325,6 +327,8 @@ def character_accuracy(edits: Distance) -> float:
    if not chars and not errors:
        # comparison of empty strings is considered a full match
        score = 1.0
+    elif not chars:
+        score = -errors
    else:
        score = 1.0 - errors / chars
    return score
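The new branch gives comparisons against an empty ground truth a negative score instead of failing on a division by zero. A small sketch of the scoring rule, assuming `chars` counts ground-truth characters (match + replace + delete) and `errors` counts edit operations (delete + insert + replace), which is consistent with the parametrized test values further down:

~~~python
# Sketch only; the chars/errors definitions are assumptions matching the tests below.
def character_accuracy_sketch(match=0, replace=0, delete=0, insert=0):
    chars = match + replace + delete
    errors = delete + insert + replace
    if not chars and not errors:
        return 1.0      # two empty strings count as a full match
    if not chars:
        return -errors  # pure insertions against an empty ground truth
    return 1.0 - errors / chars

assert character_accuracy_sketch(insert=1) == -1
assert character_accuracy_sketch(match=2, insert=1) == 0.5
~~~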
@@ -349,25 +353,25 @@ def initialize_lines(text: str) -> List["Part"]:
    return lines


-def combine_lines(matches: List[Match]) -> Tuple[str, str]:
-    """Combines the matches to aligned texts.
-
-    TODO: just hacked, needs tests and refinement. Also missing insert/delete marking.
+def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]:
+    """Extracts text segments and editing operations in separate lists.

    :param matches: List of match objects.
-    :return: the aligned ground truth and ocr as texts.
+    :return: List of ground truth segments, ocr segments and editing operations.
    """
-    matches.sort(key=lambda x: x.gt.line + x.gt.start / 10000)
+    matches = sorted(matches, key=lambda x: x.gt.line + x.gt.start / 10000)
    line = 0
-    gt, ocr = "", ""
+    gt, ocr, ops = [], [], []
    for match in matches:
        if match.gt.line > line:
-            gt += "\n"
-            ocr += "\n"
-            line += 1
-        gt += match.gt.text
-        ocr += match.ocr.text
-    return gt, ocr
+            gt.append("\n")
+            ocr.append("\n")
+            ops.append([])
+            line = match.gt.line
+        gt.append(match.gt.text)
+        ocr.append(match.ocr.text)
+        ops.append(match.ops)
+    return gt, ocr, ops


class Part(PartVersionSpecific):
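A short usage sketch mirroring how process() and the new table-order test consume the three return values (the package-level imports are assumed from the __init__ change above): joining the segments reproduces the FCA-aligned texts, while `ops` keeps the per-segment edit operations for the diff report.

~~~python
# Hedged sketch; imports assumed from the package __init__ change above.
from qurator.dinglehopper import flexible_character_accuracy, split_matches

fca, matches = flexible_character_accuracy("Five plus six", "Five plus sixx")
gt_segments, ocr_segments, ops = split_matches(matches)

aligned_gt = "".join(gt_segments)    # ground truth text in FCA match order
aligned_ocr = "".join(ocr_segments)  # OCR text in the same order
# ops[i] holds the edit operations for segment i ([] when the segment matched exactly)
~~~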

@@ -19,9 +19,10 @@
    ],
    "parameters": {
      "metrics": {
-        "type": "boolean",
-        "default": true,
-        "description": "Enable/disable metrics and green/red"
+        "type": "string",
+        "enum": ["", "cer", "wer", "fca", "cer,wer", "cer,fca", "wer,fca", "cer,wer,fca"],
+        "default": "cer,wer",
+        "description": "Enable different metrics like cer, wer and fca."
      },
      "textequiv_level": {
        "type": "string",

@@ -40,16 +40,31 @@
{% if metrics %}
<h2>Metrics</h2>
-<p>CER: {{ cer|round(4) }}</p>
-<p>WER: {{ wer|round(4) }}</p>
+{% if cer %}
+<p>CER: {{ cer|round(4) }}</p>
+{% endif %}
+{% if wer %}
+<p>WER: {{ wer|round(4) }}</p>
+{% endif %}
+{% if fca %}
+<p>FCA: {{ fca|round(4) }}</p>
+{% endif %}
{% endif %}

+{% if char_diff_report %}
<h2>Character differences</h2>
{{ char_diff_report }}
+{% endif %}

+{% if word_diff_report %}
<h2>Word differences</h2>
{{ word_diff_report }}
+{% endif %}

+{% if fca_diff_report %}
+<h2>Flexible character accuracy differences</h2>
+{{ fca_diff_report }}
+{% endif %}

</div>

@@ -1,10 +1,11 @@
{
-"gt": "{{ gt }}",
-"ocr": "{{ ocr }}",
{% if metrics %}
-"cer": {{ cer|json_float }},
-"wer": {{ wer|json_float }},
+{% if cer %}"cer": {{ cer|json_float }},{% endif %}
+{% if wer %}"wer": {{ wer|json_float }},{% endif %}
+{% if fca %}"fca": {{ fca|json_float }},{% endif %}
+{% if n_characters %}"n_characters": {{ n_characters }},{% endif %}
+{% if n_words %}"n_words": {{ n_words }},{% endif %}
{% endif %}
-"n_characters": {{ n_characters }},
-"n_words": {{ n_words }}
+"gt": "{{ gt }}",
+"ocr": "{{ ocr }}"
}
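A hedged illustration of what the conditional template produces, using the expected values from the integration test below for metrics="cer,fca" (the key order and the n_characters value for the 5-character test input are assumptions):

~~~python
import json

# Roughly what report.json contains for metrics="cer,fca" on the test files below.
example = json.loads(
    '{"cer": 0.2, "fca": 0.8, "n_characters": 5, "gt": "gt.txt", "ocr": "ocr.txt"}'
)
assert "wer" not in example  # wer was not requested, so its key is omitted
~~~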

@@ -117,13 +117,13 @@ def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_
        ),
        (
            "Config II",
-            '1 hav\nnospecial\ntalents. Alberto\n'
+            "1 hav\nnospecial\ntalents. Alberto\n"
            'I am one Emstein\npassionate\ncuriousity."',
        ),
        (
            "Config III",
-            'Alberto\nEmstein\n'
-            '1 hav\nnospecial\ntalents.\n'
+            "Alberto\nEmstein\n"
+            "1 hav\nnospecial\ntalents.\n"
            'I am one\npassionate\ncuriousity."',
        ),
    ],
@@ -323,6 +323,8 @@ def test_character_accuracy_matches(matches, expected_dist):
        (Distance(), 1),
        (Distance(match=1), 1),
        (Distance(replace=1), 0),
+        (Distance(delete=1), 0),
+        (Distance(insert=1), -1),
        (Distance(match=1, insert=1), 0),
        (Distance(match=1, insert=2), 1 - 2 / 1),
        (Distance(match=2, insert=1), 0.5),
@@ -377,9 +379,42 @@ def test_initialize_lines():
    assert lines == [line3, line1, line2]


-@pytest.mark.xfail
-def test_combine_lines():
-    assert False
+@pytest.mark.parametrize(
+    "matches,expected_gt,expected_ocr,expected_ops",
+    [
+        ([], [], [], []),
+        (
+            [Match(gt=Part(text="aaa"), ocr=Part(text="aaa"), dist=Distance(), ops=[])],
+            ["aaa"],
+            ["aaa"],
+            [[]],
+        ),
+        (
+            [
+                Match(
+                    gt=Part(text="aaa", line=1),
+                    ocr=Part(text="aaa"),
+                    dist=Distance(),
+                    ops=[],
+                ),
+                Match(
+                    gt=Part(text="bbb", line=2),
+                    ocr=Part(text="bbc"),
+                    dist=Distance(),
+                    ops=[["replace", 2]],
+                ),
+            ],
+            ["\n", "aaa", "\n", "bbb"],
+            ["\n", "aaa", "\n", "bbc"],
+            [[], [], [], [["replace", 2]]],
+        ),
+    ],
+)
+def test_split_matches(matches, expected_gt, expected_ocr, expected_ops):
+    gt_segments, ocr_segments, ops = split_matches(matches)
+    assert gt_segments == expected_gt
+    assert ocr_segments == expected_ocr
+    assert ops == expected_ops


@pytest.mark.parametrize(

@@ -1,4 +1,5 @@
import json
+from itertools import combinations

import pytest

from .util import working_directory
@@ -7,9 +8,19 @@ from ..cli import process


@pytest.mark.integration
-def test_cli_json(tmp_path):
+@pytest.mark.parametrize(
+    "metrics",
+    [
+        *(("",), ("cer",), ("wer",), ("fca",)),
+        *combinations(("cer", "wer", "fca"), 2),
+        ("cer", "wer", "fca"),
+    ],
+)
+def test_cli_json(metrics, tmp_path):
    """Test that the cli/process() yields a loadable JSON report"""
+    expected_values = {"cer": 0.2, "wer": 1.0, "fca": 0.8}
    with working_directory(str(tmp_path)):
        with open("gt.txt", "w") as gtf:
            gtf.write("AAAAA")
@@ -18,12 +29,18 @@ def test_cli_json(tmp_path):
        with open("gt.txt", "r") as gtf:
            print(gtf.read())

-        process("gt.txt", "ocr.txt", "report")
+        process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics))

        with open("report.json", "r") as jsonf:
            print(jsonf.read())

        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
-            assert j["cer"] == pytest.approx(0.2)
+            for metric, expected_value in expected_values.items():
+                if metric in metrics:
+                    assert j[metric] == pytest.approx(expected_values[metric])
+                else:
+                    assert metric not in j.keys()


@pytest.mark.integration
@@ -36,7 +53,8 @@ def test_cli_json_cer_is_infinity(tmp_path):
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("Not important")

-        process("gt.txt", "ocr.txt", "report")
+        process("gt.txt", "ocr.txt", "report", metrics="cer,wer,fca")

        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j["cer"] == pytest.approx(float("inf"))
+            assert j["fca"] == pytest.approx(-13)

@@ -0,0 +1,50 @@
+import os
+
+import pytest
+from lxml import etree as ET
+
+from .. import distance, page_text
+from .. import flexible_character_accuracy, split_matches
+
+data_dir = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "data", "table-order"
+)
+
+
+@pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"])
+@pytest.mark.integration
+def test_fac_ignoring_reading_order(file):
+    expected = "1\n2\n3\n4\n5\n6\n7\n8\n9"
+
+    gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml")))
+    assert gt == expected
+
+    ocr = page_text(ET.parse(os.path.join(data_dir, file)))
+    assert distance(gt, ocr) > 0
+
+    fac, matches = flexible_character_accuracy(gt, ocr)
+    assert fac == pytest.approx(1.0)
+
+    gt_segments, ocr_segments, ops = split_matches(matches)
+    assert not any(ops)
+    assert "".join(gt_segments) == expected
+    assert "".join(ocr_segments) == expected
+
+
+@pytest.mark.parametrize(
+    "file,expected_text",
+    [
+        ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
+        ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
+        ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
+        ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
+    ],
+)
+@pytest.mark.integration
+def test_reading_order_settings(file, expected_text):
+    if "table-unordered.xml" == file:
+        with pytest.raises(NotImplementedError):
+            page_text(ET.parse(os.path.join(data_dir, file)))
+    else:
+        ocr = page_text(ET.parse(os.path.join(data_dir, file)))
+        assert ocr == expected_text