Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-30 22:19:57 +02:00
Merge branch 'feat/compare-line-texts'
Some checks reported errors
continuous-integration/drone/push Build encountered an error
commit 195354c6d4
6 changed files with 194 additions and 20 deletions

README.md

@@ -61,6 +61,15 @@ This generates `report.html` and `report.json`.
 
+### dinglehopper-line-dirs
+
+You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
+with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
+CLI interface:
+
+~~~
+dinglehopper-line-dirs gt/ ocr/
+~~~
+
 ### dinglehopper-extract
 
 The tool `dinglehopper-extract` extracts the text of the given input file on
 stdout, for example:
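
The new command also takes an optional report prefix argument and a `--metrics/--no-metrics` switch (see the click declarations in `cli_line_dirs.py` below); a usage sketch:

~~~
dinglehopper-line-dirs --no-metrics gt/ ocr/ some-report-prefix
~~~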

qurator/dinglehopper/cli.py

@@ -84,6 +84,19 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     )
 
 
+def json_float(value):
+    """Convert a float value to a JSON float.
+
+    This is here so that float('inf') yields "Infinity", not "inf".
+    """
+    if value == float("inf"):
+        return "Infinity"
+    elif value == float("-inf"):
+        return "-Infinity"
+    else:
+        return str(value)
+
+
 def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     """Check OCR result against GT.
 

@@ -107,18 +120,6 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
         gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
     )
 
-    def json_float(value):
-        """Convert a float value to a JSON float.
-
-        This is here so that float('inf') yields "Infinity", not "inf".
-        """
-        if value == float("inf"):
-            return "Infinity"
-        elif value == float("-inf"):
-            return "-Infinity"
-        else:
-            return str(value)
-
     env = Environment(
         loader=FileSystemLoader(
             os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
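
These hunks move `json_float` from a nested helper inside `process()` to module level, so that `cli_line_dirs.py` can import it and register it as a Jinja filter (`env.filters["json_float"] = json_float`). A minimal sketch of its behaviour, assuming the package from this branch is importable:

~~~
from qurator.dinglehopper.cli import json_float

print(json_float(0.25))           # "0.25"
print(json_float(float("inf")))   # "Infinity" rather than Python's "inf"
print(json_float(float("-inf")))  # "-Infinity"
~~~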

qurator/dinglehopper/cli_line_dirs.py (new file, 142 lines)

@@ -0,0 +1,142 @@
import os
import sys
import itertools

import click
from jinja2 import Environment, FileSystemLoader
from markupsafe import escape
from uniseg.graphemecluster import grapheme_clusters
from ocrd_utils import initLogging

from .character_error_rate import character_error_rate_n
from .word_error_rate import word_error_rate_n, words_normalized
from .align import seq_align
from .extracted_text import ExtractedText
from .ocr_files import plain_extract
from .config import Config
from .cli import gen_diff_report, json_float


def all_equal(iterable):
    g = itertools.groupby(iterable)
    return next(g, True) and not next(g, False)


def common_prefix(its):
    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]


def common_suffix(its):
    return reversed(common_prefix(reversed(it) for it in its))


def removesuffix(text, suffix):
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))

    cer = None
    n_characters = None
    char_diff_report = ""
    wer = None
    n_words = None
    word_diff_report = ""

    for k, gt in enumerate(os.listdir(gt_dir)):
        # Find a match by replacing the suffix
        ocr = removesuffix(gt, gt_suffix) + ocr_suffix

        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
        ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)

        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
        if cer is None:
            cer, n_characters = l_cer, l_n_characters
        else:
            # Rolling update
            cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters)
            n_characters = n_characters + l_n_characters

        # Compute WER
        l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
        if wer is None:
            wer, n_words = l_wer, l_n_words
        else:
            # Rolling update
            wer = (wer * n_words + l_wer * l_n_words) / (n_words + l_n_words)
            n_words = n_words + l_n_words

        # Generate diff reports
        char_diff_report += gen_diff_report(
            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
        )
        gt_words = words_normalized(gt_text)
        ocr_words = words_normalized(ocr_text)
        word_diff_report += gen_diff_report(
            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
        )

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
        out_fn = report_prefix + report_suffix

        template = env.get_template(template_fn)
        template.stream(
            gt=gt_dir,  # Note: directory
            ocr=ocr_dir,  # Note: directory
            cer=cer,
            n_characters=n_characters,
            wer=wer,
            n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
        ).dump(out_fn)


@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains text files with a common
    suffix like ".gt.txt", and the OCR line text directory contains text files with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
    in the OCR lines directory.

    The GT and OCR directories are usually ground truth line texts and the results
    of OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).
    """
    initLogging()
    process(gt, ocr, report_prefix, metrics=metrics)


if __name__ == "__main__":
    main()
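
The suffix helpers above (`common_suffix`, `removesuffix`) are what `process()` uses to pair files from the two directories. A small sketch with hypothetical file listings, assuming this branch is installed:

~~~
from qurator.dinglehopper.cli_line_dirs import common_suffix, removesuffix

gt_files = ["line0001.gt.txt", "line0002.gt.txt"]               # hypothetical listing
ocr_files = ["line0001.some-ocr.txt", "line0002.some-ocr.txt"]  # hypothetical listing

gt_suffix = "".join(common_suffix(gt_files))     # ".gt.txt"
ocr_suffix = "".join(common_suffix(ocr_files))   # ".some-ocr.txt"

# process() pairs each GT file with its OCR counterpart by swapping the suffix:
print(removesuffix(gt_files[0], gt_suffix) + ocr_suffix)  # "line0001.some-ocr.txt"
~~~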
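
The rolling CER update in `process()` is a character-weighted mean, equivalent to total errors divided by total characters over all lines seen so far (the WER update works the same way, weighted by words). A worked sketch with hypothetical per-line values:

~~~
lines = [(0.10, 100), (0.25, 20)]  # hypothetical (l_cer, l_n_characters) per line

cer, n_characters = None, None
for l_cer, l_n_characters in lines:
    if cer is None:
        cer, n_characters = l_cer, l_n_characters
    else:
        # Same rolling update as in process()
        cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters)
        n_characters = n_characters + l_n_characters

print(cer)  # 0.125 == (0.10 * 100 + 0.25 * 20) / (100 + 20)
~~~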

qurator/dinglehopper/ocr_files.py

@@ -1,8 +1,9 @@
 from __future__ import division, print_function
 
+import os
+import sys
 from typing import Iterator
 from warnings import warn
-import sys
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError

@@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
     return page_extract(tree, textequiv_level=textequiv_level).text
 
 
-def plain_extract(filename):
+def plain_extract(filename, include_filename_in_id=False):
+    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
     with open(filename, "r") as f:
         return ExtractedText(
             None,
             [
-                ExtractedText("line %d" % no, None, None, normalize_sbb(line))
+                ExtractedText(
+                    id_template.format(filename=os.path.basename(filename), no=no),
+                    None, None, normalize_sbb(line))
                 for no, line in enumerate(f.readlines())
             ],
             "\n",
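
The new `include_filename_in_id` flag only changes the IDs given to the per-line `ExtractedText` segments; the extracted text itself is unchanged. A standalone sketch of the ID logic (`line_id` is a hypothetical helper, not part of the module):

~~~
import os

def line_id(filename, no, include_filename_in_id=False):
    # Mirrors the id_template logic in plain_extract()
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
    return id_template.format(filename=os.path.basename(filename), no=no)

print(line_id("gt/line0001.gt.txt", 0))                               # "line 0"
print(line_id("gt/line0001.gt.txt", 0, include_filename_in_id=True))  # "line0001.gt.txt - line 0"
~~~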

qurator/dinglehopper/word_error_rate.py

@@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein
 from . import ExtractedText
 
 
-@multimethod
-def words(s: str):
-    """Extract words from a string"""
-
-    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
-    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
+# Did we patch uniseg.wordbreak.word_break already?
+word_break_patched = False
+
+
+def patch_word_break():
+    """
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.
+
+    See also
+    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
+    """
     old_word_break = uniseg.wordbreak.word_break
 
     def new_word_break(c, index=0):

@@ -25,6 +30,18 @@ def words(s: str):
         return old_word_break(c, index)
 
     uniseg.wordbreak.word_break = new_word_break
+    global word_break_patched
+    word_break_patched = True
+
+
+@multimethod
+def words(s: str):
+    """Extract words from a string"""
+
+    global word_break_patched
+    if not word_break_patched:
+        patch_word_break()
+
 
     # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
     def unwanted(c):
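
The word-break monkey-patch is now applied lazily: a module-level flag records whether `uniseg.wordbreak.word_break` has been replaced, and `words()` calls `patch_word_break()` only on its first use. A generic sketch of that idiom, with hypothetical names:

~~~
_patched = False

def patch_dependency():
    """Stand-in for patch_word_break(): install a replacement function once."""
    global _patched
    # ... monkey-patch the third-party function here ...
    _patched = True

def words(s):
    global _patched
    if not _patched:        # patch lazily, on the first call only
        patch_dependency()
    return s.split()        # placeholder for the real word segmentation

print(words("foo bar"))     # first call triggers patch_dependency()
print(words("baz"))         # later calls skip it
~~~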

setup.py (1 line changed)

@@ -26,6 +26,7 @@ setup(
     entry_points={
         "console_scripts": [
            "dinglehopper=qurator.dinglehopper.cli:main",
+           "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
            "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
            "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
        ]