From 36b36f69861d0dd12550a9eada54286061a65202 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 13 Dec 2021 19:26:21 +0100
Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?=
 =?UTF-8?q?ne=20text=20directories=20(WIP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py | 150 ++++++++++++++++++++++++++
 setup.py                              |   1 +
 2 files changed, 151 insertions(+)
 create mode 100644 qurator/dinglehopper/cli_line_dirs.py

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
new file mode 100644
index 0000000..1b77cdb
--- /dev/null
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -0,0 +1,150 @@
+import os
+import sys
+import itertools
+
+import click
+from jinja2 import Environment, FileSystemLoader
+from markupsafe import escape
+from uniseg.graphemecluster import grapheme_clusters
+from ocrd_utils import initLogging
+
+from .character_error_rate import character_error_rate_n
+from .word_error_rate import word_error_rate_n, words_normalized
+from .align import seq_align
+from .extracted_text import ExtractedText
+from .ocr_files import plain_extract
+from .config import Config
+from .cli import gen_diff_report
+
+
+def all_equal(iterable):
+    g = itertools.groupby(iterable)
+    return next(g, True) and not next(g, False)
+
+
+def common_prefix(its):
+    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
+
+
+def common_suffix(its):
+    return reversed(common_prefix(reversed(it) for it in its))
+
+
+def removesuffix(text, suffix):
+    if suffix and text.endswith(suffix):
+        return text[:-len(suffix)]
+    return text
+
+
+def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
+    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
+    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
+
+    cer = None
+    n_characters = None
+    char_diff_report = ""
+
+    for gt in os.listdir(gt_dir):
+        # Find a match by replacing the suffix
+        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+
+        gt_text = plain_extract(os.path.join(gt_dir, gt))
+        ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
+
+        # Compute CER
+        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
+        if cer is None:
+            cer, n_characters = l_cer, l_n_characters
+        else:
+            # Rolling update
+            cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters)
+            n_characters = n_characters + l_n_characters
+
+        # Compute WER
+        # TODO wer, n_words = word_error_rate_n(gt_text, ocr_text)
+        wer = 9999; n_words = 0
+
+        char_diff_report += gen_diff_report(
+             gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+        )
+
+        # TODO
+        #  gt_words = words_normalized(gt_text)
+        #  ocr_words = words_normalized(ocr_text)
+        #  word_diff_report = gen_diff_report(
+        #      gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+        #  )
+        word_diff_report = "TODO"
+
+
+    # XXX this is a copy from cli.py
+    def json_float(value):
+        """Convert a float value to an JSON float.
+
+        This is here so that float('inf') yields "Infinity", not "inf".
+        """
+        if value == float("inf"):
+            return "Infinity"
+        elif value == float("-inf"):
+            return "-Infinity"
+        else:
+            return str(value)
+
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float
+
+    for report_suffix in (".html", ".json"):
+        template_fn = "report" + report_suffix + ".j2"
+        out_fn = report_prefix + report_suffix
+
+        template = env.get_template(template_fn)
+        template.stream(
+            gt=gt,
+            ocr=ocr,
+            cer=cer,
+            n_characters=n_characters,
+            wer=wer,
+            n_words=n_words,
+            char_diff_report=char_diff_report,
+            word_diff_report=word_diff_report,
+            metrics=metrics,
+        ).dump(out_fn)
+
+
+@click.command()
+@click.argument("gt", type=click.Path(exists=True))
+@click.argument("ocr", type=click.Path(exists=True))
+@click.argument("report_prefix", type=click.Path(), default="report")
+@click.option(
+    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+)
+def main(gt, ocr, report_prefix, metrics):
+    """
+    Compare the GT line text directory against the OCR line text directory.
+
+    This assumes that the GT line text directory contains textfiles with a common
+    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
+    a common suffix like ".some-ocr.txt". The text files also need to be paired,
+    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
+    in the OCR lines directory.
+
+    The GT and OCR directories are usually ground truth line texts and the results of
+    an OCR software, but you may use dinglehopper to compare two OCR results. In
+    that case, use --no-metrics to disable the then meaningless metrics and also
+    change the color scheme from green/red to blue.
+
+    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    $REPORT_PREFIX defaults to "report". The reports include the character error
+    rate (CER) and the word error rate (WER).
+
+    """
+    initLogging()
+    process(gt, ocr, report_prefix, metrics=metrics)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 1551c2d..be17cc6 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@ setup(
     entry_points={
         "console_scripts": [
             "dinglehopper=qurator.dinglehopper.cli:main",
+            "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
             "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
             "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
         ]

From a018006f98e96c413da2fd96bf6d79916ed9c588 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 13 Dec 2021 19:32:55 +0100
Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?=
 =?UTF-8?q?ne=20text=20directories=20(WIP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 1b77cdb..94dcee4 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -103,8 +103,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 
         template = env.get_template(template_fn)
         template.stream(
-            gt=gt,
-            ocr=ocr,
+            gt=gt_dir,  # Note: directory
+            ocr=ocr_dir,  # Note: directory
             cer=cer,
             n_characters=n_characters,
             wer=wer,

From dbb660615a61da06e8831569b7558020366e1f47 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 13 Dec 2021 20:02:18 +0100
Subject: [PATCH 3/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?=
 =?UTF-8?q?ne=20text=20directories=20(WIP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py |  8 ++++----
 qurator/dinglehopper/ocr_files.py     | 10 +++++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 94dcee4..f3d1f84 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_characters = None
     char_diff_report = ""
 
-    for gt in os.listdir(gt_dir):
+    for k, gt in enumerate(os.listdir(gt_dir)):
         # Find a match by replacing the suffix
         ocr = removesuffix(gt, gt_suffix) + ocr_suffix
 
-        gt_text = plain_extract(os.path.join(gt_dir, gt))
-        ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
+        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
+        ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
 
         # Compute CER
         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
         wer = 9999; n_words = 0
 
         char_diff_report += gen_diff_report(
-             gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
         )
 
         # TODO
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 5271727..69f4df7 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -1,8 +1,9 @@
 from __future__ import division, print_function
 
+import os
+import sys
 from typing import Iterator
 from warnings import warn
-import sys
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
@@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
     return page_extract(tree, textequiv_level=textequiv_level).text
 
 
-def plain_extract(filename):
+def plain_extract(filename, include_filename_in_id=False):
+    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
     with open(filename, "r") as f:
         return ExtractedText(
             None,
             [
-                ExtractedText("line %d" % no, None, None, normalize_sbb(line))
+                ExtractedText(
+                    id_template.format(filename=os.path.basename(filename), no=no),
+                    None, None, normalize_sbb(line))
                 for no, line in enumerate(f.readlines())
             ],
             "\n",

From cb2be96179543dba6ac069c92b842c1f56c198ec Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 14 Dec 2021 18:20:04 +0100
Subject: [PATCH 4/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Add=20word?=
 =?UTF-8?q?=20differences=20in=20line-dirs=20report?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index f3d1f84..5c877f2 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -43,6 +43,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     cer = None
     n_characters = None
     char_diff_report = ""
+    word_diff_report = ""
 
     for k, gt in enumerate(os.listdir(gt_dir)):
         # Find a match by replacing the suffix
@@ -65,16 +66,14 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
         wer = 9999; n_words = 0
 
         char_diff_report += gen_diff_report(
-             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
+            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
         )
 
-        # TODO
-        #  gt_words = words_normalized(gt_text)
-        #  ocr_words = words_normalized(ocr_text)
-        #  word_diff_report = gen_diff_report(
-        #      gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
-        #  )
-        word_diff_report = "TODO"
+        gt_words = words_normalized(gt_text)
+        ocr_words = words_normalized(ocr_text)
+        word_diff_report += gen_diff_report(
+            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
+        )
 
 
     # XXX this is a copy from cli.py

From 5b394649a7777f95932ab74c1e26743e8e180849 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 14 Dec 2021 18:33:20 +0100
Subject: [PATCH 5/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compute=20WE?=
 =?UTF-8?q?R=20in=20line-dirs=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 5c877f2..48b86d2 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -43,6 +43,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     cer = None
     n_characters = None
     char_diff_report = ""
+    wer = None
+    n_words = None
     word_diff_report = ""
 
     for k, gt in enumerate(os.listdir(gt_dir)):
@@ -62,13 +64,18 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             n_characters = n_characters + l_n_characters
 
         # Compute WER
-        # TODO wer, n_words = word_error_rate_n(gt_text, ocr_text)
-        wer = 9999; n_words = 0
+        l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
+        if wer is None:
+            wer, n_words = l_wer, l_n_words
+        else:
+            # Rolling update
+            wer = (wer * n_words + l_wer * l_n_words) / (n_words + l_n_words)
+            n_words = n_words + l_n_words
 
+        # Generate diff reports
         char_diff_report += gen_diff_report(
             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
         )
-
         gt_words = words_normalized(gt_text)
         ocr_words = words_normalized(ocr_text)
         word_diff_report += gen_diff_report(

From f77ce857b233df3705d264435fe4f5bd2f07cdf0 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 14 Dec 2021 18:37:07 +0100
Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Share=20json?=
 =?UTF-8?q?=5Ffloat=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli.py           | 25 +++++++++++++------------
 qurator/dinglehopper/cli_line_dirs.py | 16 +---------------
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 32e159f..72d428d 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -84,6 +84,19 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     )
 
 
+def json_float(value):
+    """Convert a float value to a JSON float.
+
+    This is here so that float('inf') yields "Infinity", not "inf".
+    """
+    if value == float("inf"):
+        return "Infinity"
+    elif value == float("-inf"):
+        return "-Infinity"
+    else:
+        return str(value)
+
+
 def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     """Check OCR result against GT.
 
@@ -107,18 +120,6 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
         gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
     )
 
-    def json_float(value):
-        """Convert a float value to an JSON float.
-
-        This is here so that float('inf') yields "Infinity", not "inf".
-        """
-        if value == float("inf"):
-            return "Infinity"
-        elif value == float("-inf"):
-            return "-Infinity"
-        else:
-            return str(value)
-
     env = Environment(
         loader=FileSystemLoader(
             os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 48b86d2..4c07ce5 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -14,7 +14,7 @@ from .align import seq_align
 from .extracted_text import ExtractedText
 from .ocr_files import plain_extract
 from .config import Config
-from .cli import gen_diff_report
+from .cli import gen_diff_report, json_float
 
 
 def all_equal(iterable):
@@ -82,20 +82,6 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
         )
 
-
-    # XXX this is a copy from cli.py
-    def json_float(value):
-        """Convert a float value to an JSON float.
-
-        This is here so that float('inf') yields "Infinity", not "inf".
-        """
-        if value == float("inf"):
-            return "Infinity"
-        elif value == float("-inf"):
-            return "-Infinity"
-        else:
-            return str(value)
-
     env = Environment(
         loader=FileSystemLoader(
             os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")

From b6bde2b7ec702652cd15fb2298baec6feff29509 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 15 Dec 2021 11:16:40 +0100
Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=93=9D=20dinglehopper:=20Document=20d?=
 =?UTF-8?q?inglehopper-line-dirs=20in=20the=20README?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 6d82541..e7b3c7b 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,15 @@ This generates `report.html` and `report.json`.
 
 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
 
+### dinglehopper-line-dirs
+You may also want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
+with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) using a separate
+command-line interface:
+
+~~~
+dinglehopper-line-dirs gt/ ocr/
+~~~
+
 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on
 stdout, for example:

From 8a3f5e48c2eac3e6d67f84e87409b8c69a1e150b Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 24 Jan 2022 18:44:30 +0100
Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Patch=20word?=
 =?UTF-8?q?=5Fbreak=20only=20once?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, we (accidentally) patched uniseg's word_break on every call
to words(). Do it only once.
---
 qurator/dinglehopper/word_error_rate.py | 27 ++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 64b40d2..0eb94a7 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein
 from . import ExtractedText
 
 
-@multimethod
-def words(s: str):
-    """Extract words from a string"""
+# Did we patch uniseg.wordbreak.word_break already?
+word_break_patched = False
 
-    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
-    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
+
+def patch_word_break():
+    """
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.
+
+    See also
+    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
+    """
     old_word_break = uniseg.wordbreak.word_break
 
     def new_word_break(c, index=0):
@@ -25,6 +30,18 @@ def words(s: str):
             return old_word_break(c, index)
 
     uniseg.wordbreak.word_break = new_word_break
+    global word_break_patched
+    word_break_patched = True
+
+
+@multimethod
+def words(s: str):
+    """Extract words from a string"""
+
+    global word_break_patched
+    if not word_break_patched:
+        patch_word_break()
+
 
     # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
     def unwanted(c):