From 36b36f69861d0dd12550a9eada54286061a65202 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 13 Dec 2021 19:26:21 +0100 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?= =?UTF-8?q?ne=20text=20directories=20(WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 150 ++++++++++++++++++++++++++ setup.py | 1 + 2 files changed, 151 insertions(+) create mode 100644 qurator/dinglehopper/cli_line_dirs.py diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py new file mode 100644 index 0000000..1b77cdb --- /dev/null +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -0,0 +1,150 @@ +import os +import sys +import itertools + +import click +from jinja2 import Environment, FileSystemLoader +from markupsafe import escape +from uniseg.graphemecluster import grapheme_clusters +from ocrd_utils import initLogging + +from .character_error_rate import character_error_rate_n +from .word_error_rate import word_error_rate_n, words_normalized +from .align import seq_align +from .extracted_text import ExtractedText +from .ocr_files import plain_extract +from .config import Config +from .cli import gen_diff_report + + +def all_equal(iterable): + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False) + + +def common_prefix(its): + return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] + + +def common_suffix(its): + return reversed(common_prefix(reversed(it) for it in its)) + + +def removesuffix(text, suffix): + if suffix and text.endswith(suffix): + return text[:-len(suffix)] + return text + + +def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): + gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) + ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) + + cer = None + n_characters = None + char_diff_report = "" + + for gt in os.listdir(gt_dir): + # Find a match by replacing the suffix + 
ocr = removesuffix(gt, gt_suffix) + ocr_suffix + + gt_text = plain_extract(os.path.join(gt_dir, gt)) + ocr_text = plain_extract(os.path.join(ocr_dir, ocr)) + + # Compute CER + l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) + if cer is None: + cer, n_characters = l_cer, l_n_characters + else: + # Rolling update + cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters) + n_characters = n_characters + l_n_characters + + # Compute WER + # TODO wer, n_words = word_error_rate_n(gt_text, ocr_text) + wer = 9999; n_words = 0 + + char_diff_report += gen_diff_report( + gt_text, ocr_text, css_prefix="c", joiner="", none="·" + ) + + # TODO + # gt_words = words_normalized(gt_text) + # ocr_words = words_normalized(ocr_text) + # word_diff_report = gen_diff_report( + # gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" + # ) + word_diff_report = "TODO" + + + # XXX this is a copy from cli.py + def json_float(value): + """Convert a float value to an JSON float. + + This is here so that float('inf') yields "Infinity", not "inf". 
+ """ + if value == float("inf"): + return "Infinity" + elif value == float("-inf"): + return "-Infinity" + else: + return str(value) + + env = Environment( + loader=FileSystemLoader( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") + ) + ) + env.filters["json_float"] = json_float + + for report_suffix in (".html", ".json"): + template_fn = "report" + report_suffix + ".j2" + out_fn = report_prefix + report_suffix + + template = env.get_template(template_fn) + template.stream( + gt=gt, + ocr=ocr, + cer=cer, + n_characters=n_characters, + wer=wer, + n_words=n_words, + char_diff_report=char_diff_report, + word_diff_report=word_diff_report, + metrics=metrics, + ).dump(out_fn) + + +@click.command() +@click.argument("gt", type=click.Path(exists=True)) +@click.argument("ocr", type=click.Path(exists=True)) +@click.argument("report_prefix", type=click.Path(), default="report") +@click.option( + "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" +) +def main(gt, ocr, report_prefix, metrics): + """ + Compare the GT line text directory against the OCR line text directory. + + This assumes that the GT line text directory contains textfiles with a common + suffix like ".gt.txt", and the OCR line text directory contains textfiles with + a common suffix like ".some-ocr.txt". The text files also need to be paired, + i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" + in the OCT lines directory. + + The GT and OCR directories are usually round truth line texts and the results of + an OCR software, but you may use dinglehopper to compare two OCR results. In + that case, use --no-metrics to disable the then meaningless metrics and also + change the color scheme from green/red to blue. + + The comparison report will be written to $REPORT_PREFIX.{html,json}, where + $REPORT_PREFIX defaults to "report". The reports include the character error + rate (CER) and the word error rate (WER). 
+ + """ + initLogging() + process(gt, ocr, report_prefix, metrics=metrics) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 1551c2d..be17cc6 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ setup( entry_points={ "console_scripts": [ "dinglehopper=qurator.dinglehopper.cli:main", + "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main", "dinglehopper-extract=qurator.dinglehopper.cli_extract:main", "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper", ] From a018006f98e96c413da2fd96bf6d79916ed9c588 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 13 Dec 2021 19:32:55 +0100 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?= =?UTF-8?q?ne=20text=20directories=20(WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 1b77cdb..94dcee4 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -103,8 +103,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): template = env.get_template(template_fn) template.stream( - gt=gt, - ocr=ocr, + gt=gt_dir, # Note: directory + ocr=ocr_dir, # Note: directory cer=cer, n_characters=n_characters, wer=wer, From dbb660615a61da06e8831569b7558020366e1f47 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 13 Dec 2021 20:02:18 +0100 Subject: [PATCH 3/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compare=20li?= =?UTF-8?q?ne=20text=20directories=20(WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 8 ++++---- qurator/dinglehopper/ocr_files.py | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py 
b/qurator/dinglehopper/cli_line_dirs.py index 94dcee4..f3d1f84 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_characters = None char_diff_report = "" - for gt in os.listdir(gt_dir): + for k, gt in enumerate(os.listdir(gt_dir)): # Find a match by replacing the suffix ocr = removesuffix(gt, gt_suffix) + ocr_suffix - gt_text = plain_extract(os.path.join(gt_dir, gt)) - ocr_text = plain_extract(os.path.join(ocr_dir, ocr)) + gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) + ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) @@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): wer = 9999; n_words = 0 char_diff_report += gen_diff_report( - gt_text, ocr_text, css_prefix="c", joiner="", none="·" + gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" ) # TODO diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 5271727..69f4df7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,8 +1,9 @@ from __future__ import division, print_function +import os +import sys from typing import Iterator from warnings import warn -import sys from lxml import etree as ET from lxml.etree import XMLSyntaxError @@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"): return page_extract(tree, textequiv_level=textequiv_level).text -def plain_extract(filename): +def plain_extract(filename, include_filename_in_id=False): + id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" with open(filename, "r") as f: return ExtractedText( None, [ - ExtractedText("line %d" % no, None, None, normalize_sbb(line)) + ExtractedText( + 
id_template.format(filename=os.path.basename(filename), no=no), + None, None, normalize_sbb(line)) for no, line in enumerate(f.readlines()) ], "\n", From cb2be96179543dba6ac069c92b842c1f56c198ec Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 14 Dec 2021 18:20:04 +0100 Subject: [PATCH 4/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Add=20word?= =?UTF-8?q?=20differences=20in=20line-dirs=20report?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index f3d1f84..5c877f2 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -43,6 +43,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): cer = None n_characters = None char_diff_report = "" + word_diff_report = "" for k, gt in enumerate(os.listdir(gt_dir)): # Find a match by replacing the suffix @@ -65,16 +66,14 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): wer = 9999; n_words = 0 char_diff_report += gen_diff_report( - gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" + gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" ) - # TODO - # gt_words = words_normalized(gt_text) - # ocr_words = words_normalized(ocr_text) - # word_diff_report = gen_diff_report( - # gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" - # ) - word_diff_report = "TODO" + gt_words = words_normalized(gt_text) + ocr_words = words_normalized(ocr_text) + word_diff_report += gen_diff_report( + gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯" + ) # XXX this is a copy from cli.py From 5b394649a7777f95932ab74c1e26743e8e180849 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 14 Dec 2021 18:33:20 +0100 Subject: [PATCH 5/8] 
=?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compute=20WE?= =?UTF-8?q?R=20in=20line-dirs=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 5c877f2..48b86d2 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -43,6 +43,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): cer = None n_characters = None char_diff_report = "" + wer = None + n_words = None word_diff_report = "" for k, gt in enumerate(os.listdir(gt_dir)): @@ -62,13 +64,18 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_characters = n_characters + l_n_characters # Compute WER - # TODO wer, n_words = word_error_rate_n(gt_text, ocr_text) - wer = 9999; n_words = 0 + l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text) + if wer is None: + wer, n_words = l_wer, l_n_words + else: + # Rolling update + wer = (wer * n_words + l_wer * l_n_words) / (n_words + l_n_words) + n_words = n_words + l_n_words + # Generate diff reports char_diff_report += gen_diff_report( gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" ) - gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) word_diff_report += gen_diff_report( From f77ce857b233df3705d264435fe4f5bd2f07cdf0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 14 Dec 2021 18:37:07 +0100 Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Share=20json?= =?UTF-8?q?=5Ffloat=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli.py | 25 +++++++++++++------------ qurator/dinglehopper/cli_line_dirs.py | 16 +--------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/qurator/dinglehopper/cli.py 
b/qurator/dinglehopper/cli.py index 32e159f..72d428d 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -84,6 +84,19 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): ) +def json_float(value): + """Convert a float value to an JSON float. + + This is here so that float('inf') yields "Infinity", not "inf". + """ + if value == float("inf"): + return "Infinity" + elif value == float("-inf"): + return "-Infinity" + else: + return str(value) + + def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): """Check OCR result against GT. @@ -107,18 +120,6 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" ) - def json_float(value): - """Convert a float value to an JSON float. - - This is here so that float('inf') yields "Infinity", not "inf". - """ - if value == float("inf"): - return "Infinity" - elif value == float("-inf"): - return "-Infinity" - else: - return str(value) - env = Environment( loader=FileSystemLoader( os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 48b86d2..4c07ce5 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -14,7 +14,7 @@ from .align import seq_align from .extracted_text import ExtractedText from .ocr_files import plain_extract from .config import Config -from .cli import gen_diff_report +from .cli import gen_diff_report, json_float def all_equal(iterable): @@ -82,20 +82,6 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯" ) - - # XXX this is a copy from cli.py - def json_float(value): - """Convert a float value to an JSON float. - - This is here so that float('inf') yields "Infinity", not "inf". 
- """ - if value == float("inf"): - return "Infinity" - elif value == float("-inf"): - return "-Infinity" - else: - return str(value) - env = Environment( loader=FileSystemLoader( os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") From b6bde2b7ec702652cd15fb2298baec6feff29509 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Dec 2021 11:16:40 +0100 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=93=9D=20dinglehopper:=20Document=20d?= =?UTF-8?q?inglehopper-line-dirs=20in=20the=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 6d82541..e7b3c7b 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,15 @@ This generates `report.html` and `report.json`. ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) +### dinglehopper-line-dirs +You also may want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`) +with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate +CLI interface: + +~~~ +dinglehopper-line-dirs gt/ ocr/ +~~~ + ### dinglehopper-extract The tool `dinglehopper-extract` extracts the text of the given input file on stdout, for example: From 8a3f5e48c2eac3e6d67f84e87409b8c69a1e150b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 24 Jan 2022 18:44:30 +0100 Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Patch=20word?= =?UTF-8?q?=5Fbreak=20only=20once?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, we (accidentally) patched uniseg's word_break on every call to words(). Do it only once. 
--- qurator/dinglehopper/word_error_rate.py | 27 ++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 64b40d2..0eb94a7 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein from . import ExtractedText -@multimethod -def words(s: str): - """Extract words from a string""" +# Did we patch uniseg.wordbreak.word_break already? +word_break_patched = False - # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also - # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt + +def patch_word_break(): + """ + Patch uniseg.wordbreak.word_break to deal with our private use characters. + + See also + https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt + """ old_word_break = uniseg.wordbreak.word_break def new_word_break(c, index=0): @@ -25,6 +30,18 @@ def words(s: str): return old_word_break(c, index) uniseg.wordbreak.word_break = new_word_break + global word_break_patched + word_break_patched = True + + +@multimethod +def words(s: str): + """Extract words from a string""" + + global word_break_patched + if not word_break_patched: + patch_word_break() + # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar def unwanted(c):