From 4024e350f7f5379bfffe81d45ba31bf376a4f4db Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:32:07 +0100 Subject: [PATCH 01/20] =?UTF-8?q?=F0=9F=9A=A7=20Test=20new=20flexible=20li?= =?UTF-8?q?ne=20dirs=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/line_dirs_test.py | 148 ++++++++++++++++++ .../line_dirs_test/basic/gt/a.gt.txt | 1 + .../line_dirs_test/basic/gt/b.gt.txt | 1 + .../line_dirs_test/basic/ocr/a.some-ocr.txt | 1 + .../line_dirs_test/basic/ocr/b.some-ocr.txt | 1 + .../line_dirs_test/merged/a/a.dummy.jpg | 0 .../line_dirs_test/merged/a/a.gt.txt | 1 + .../line_dirs_test/merged/a/a.some-ocr.txt | 1 + .../line_dirs_test/merged/b/b.dummy.jpg | 0 .../line_dirs_test/merged/b/b.gt.txt | 1 + .../line_dirs_test/merged/b/b.some-ocr.txt | 1 + .../line_dirs_test/subdirs/gt/a/a.gt.txt | 1 + .../line_dirs_test/subdirs/gt/b/b.gt.txt | 1 + .../subdirs/ocr/a/a.some-ocr.txt | 1 + .../subdirs/ocr/b/b.some-ocr.txt | 1 + 15 files changed, 160 insertions(+) create mode 100644 src/dinglehopper/line_dirs_test.py create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt create mode 
100644 src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py new file mode 100644 index 0000000..676fe22 --- /dev/null +++ b/src/dinglehopper/line_dirs_test.py @@ -0,0 +1,148 @@ +import os.path +import itertools +from typing import Iterator, Tuple + +def is_hidden(filepath): + filename = os.path.basename(os.path.abspath(filepath)) + return filename.startswith(".") + +def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + """ + Find all files in dir_, returning filenames + + If pred is given, pred(filename) must be True for the filename. + + Does not return hidden files by default. + """ + for root, _, filenames in os.walk(dir_): + for fn in filenames: + if not return_hidden and is_hidden(fn): + continue + if pred and not pred(fn): + continue + yield os.path.join(root, fn) + + +def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + """ + Find GT files and matching OCR files. + + Returns pairs of GT and OCR files. + """ + for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): + ocr_fn = os.path.join( + ocr_dir, + os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + + ocr_suffix, + ) + if not os.path.exists(ocr_fn): + raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") + + yield gt_fn, ocr_fn + +def all_equal(iterable): + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False) + +def common_prefix(its): + return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] + + +def common_suffix(its): + return reversed(common_prefix(reversed(it) for it in its)) + + +def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): + """ + Find GT files and matching OCR files, autodetect suffixes. 
+ + This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) + files with a common suffix. Currently the files must have a suffix, e.g. + ".gt.txt" (e.g. ".ocr.txt"). + + Returns pairs of GT and OCR files. + """ + + # Autodetect suffixes + gt_files = find_all_files(gt_dir) + gt_suffix = "".join(common_suffix(gt_files)) + if len(gt_suffix) == 0: + raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + ocr_files = find_all_files(ocr_dir) + ocr_suffix = "".join(common_suffix(ocr_files)) + if len(ocr_suffix) == 0: + raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + + yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) + + +def test_basic(): + """Test the dumb method: User gives directories and suffixes.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/basic/gt", + ".gt.txt", + "line_dirs_test/basic/ocr", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + +def test_basic_autodetect(): + """Test the autodetect method: User gives directories, suffixes are autodetected if possible""" + pairs = list( + find_gt_and_ocr_files_autodetect( + "line_dirs_test/basic/gt", + "line_dirs_test/basic/ocr", + ) + ) + + assert len(pairs) == 2 + + +def test_subdirs(): + """Test the dumb method: Should also work when subdirectories are involved.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/subdirs/gt", + ".gt.txt", + "line_dirs_test/subdirs/ocr", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + + +def test_subdirs_autodetect(): + """Test the autodetect method: Should also work when subdirectories are involved.""" + pairs = list( + find_gt_and_ocr_files_autodetect( + "line_dirs_test/subdirs/gt", + "line_dirs_test/subdirs/ocr", + ) + ) + + assert len(pairs) == 2 + +def test_merged(): + """Test the dumb method: Should also work when GT and OCR texts are in the same directories.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/merged", + 
".gt.txt", + "line_dirs_test/merged", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + +if __name__ == "__main__": + test_basic() + test_subdirs() + test_merged() + + test_basic_autodetect() + test_subdirs_autodetect() diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt @@ -0,0 +1 @@ +This is a test. diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg new file mode 100644 index 0000000..e69de29 diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt @@ -0,0 +1 @@ +This is a test. 
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg new file mode 100644 index 0000000..e69de29 diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt @@ -0,0 +1 @@ +This is a test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. 
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. From ad8e6de36bf376a830af29e31cefa43066e5baff Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:34:08 +0100 Subject: [PATCH 02/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?= =?UTF-8?q?=20character=20diff=20reports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 03bf374..01fd585 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -81,7 +81,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): joiner="", none="·", score_hint=score_hint(l_cer, l_n_characters), - ) + )[0] word_diff_report += gen_diff_report( gt_words, ocr_words, @@ -89,7 +89,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): joiner=" ", none="⋯", score_hint=score_hint(l_wer, l_n_words), - ) + )[0] env = Environment( loader=FileSystemLoader( From 2bf2529c380f028e59953584aa2aa26dc3a828b5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:50:14 +0100 Subject: [PATCH 03/20] =?UTF-8?q?=F0=9F=9A=A7=20Port=20new=20line=20dir=20?= =?UTF-8?q?functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 83 +++++++++++++++++++++++++----- src/dinglehopper/line_dirs_test.py | 71 ------------------------- 2 files changed, 69 insertions(+), 85 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 01fd585..43e4f1a 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ 
b/src/dinglehopper/cli_line_dirs.py @@ -1,5 +1,6 @@ import itertools import os +from typing import Iterator, Tuple import click from jinja2 import Environment, FileSystemLoader @@ -12,11 +13,36 @@ from .ocr_files import plain_extract from .word_error_rate import word_error_rate_n, words_normalized +def removesuffix(text, suffix): + if suffix and text.endswith(suffix): + return text[: -len(suffix)] + return text + +def is_hidden(filepath): + filename = os.path.basename(os.path.abspath(filepath)) + return filename.startswith(".") + +def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + """ + Find all files in dir_, returning filenames + + If pred is given, pred(filename) must be True for the filename. + + Does not return hidden files by default. + """ + for root, _, filenames in os.walk(dir_): + for fn in filenames: + if not return_hidden and is_hidden(fn): + continue + if pred and not pred(fn): + continue + yield os.path.join(root, fn) + + def all_equal(iterable): g = itertools.groupby(iterable) return next(g, True) and not next(g, False) - def common_prefix(its): return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] @@ -24,16 +50,49 @@ def common_prefix(its): def common_suffix(its): return reversed(common_prefix(reversed(it) for it in its)) +def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + """ + Find GT files and matching OCR files. -def removesuffix(text, suffix): - if suffix and text.endswith(suffix): - return text[: -len(suffix)] - return text + Returns pairs of GT and OCR files. 
+ """ + for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): + ocr_fn = os.path.join( + ocr_dir, + os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + + ocr_suffix, + ) + if not os.path.exists(ocr_fn): + raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") + + yield gt_fn, ocr_fn + + +def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): + """ + Find GT files and matching OCR files, autodetect suffixes. + + This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) + files with a common suffix. Currently the files must have a suffix, e.g. + ".gt.txt" (e.g. ".ocr.txt"). + + Returns pairs of GT and OCR files. + """ + + # Autodetect suffixes + gt_files = find_all_files(gt_dir) + gt_suffix = "".join(common_suffix(gt_files)) + if len(gt_suffix) == 0: + raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + ocr_files = find_all_files(ocr_dir) + ocr_suffix = "".join(common_suffix(ocr_files)) + if len(ocr_suffix) == 0: + raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + + yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): - gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) - ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) cer = None n_characters = None @@ -42,14 +101,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_words = None word_diff_report = "" - for k, gt in enumerate(os.listdir(gt_dir)): - # Find a match by replacing the suffix - ocr = removesuffix(gt, gt_suffix) + ocr_suffix + for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)): - gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) - ocr_text = plain_extract( - os.path.join(ocr_dir, ocr), include_filename_in_id=True - ) + gt_text = plain_extract(gt_fn, include_filename_in_id=True) + ocr_text = plain_extract(ocr_fn, 
include_filename_in_id=True) gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py index 676fe22..9827f01 100644 --- a/src/dinglehopper/line_dirs_test.py +++ b/src/dinglehopper/line_dirs_test.py @@ -2,78 +2,7 @@ import os.path import itertools from typing import Iterator, Tuple -def is_hidden(filepath): - filename = os.path.basename(os.path.abspath(filepath)) - return filename.startswith(".") - -def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: - """ - Find all files in dir_, returning filenames - - If pred is given, pred(filename) must be True for the filename. - - Does not return hidden files by default. - """ - for root, _, filenames in os.walk(dir_): - for fn in filenames: - if not return_hidden and is_hidden(fn): - continue - if pred and not pred(fn): - continue - yield os.path.join(root, fn) - - -def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: - """ - Find GT files and matching OCR files. - - Returns pairs of GT and OCR files. - """ - for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): - ocr_fn = os.path.join( - ocr_dir, - os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) - + ocr_suffix, - ) - if not os.path.exists(ocr_fn): - raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") - - yield gt_fn, ocr_fn - -def all_equal(iterable): - g = itertools.groupby(iterable) - return next(g, True) and not next(g, False) - -def common_prefix(its): - return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] - - -def common_suffix(its): - return reversed(common_prefix(reversed(it) for it in its)) - - -def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): - """ - Find GT files and matching OCR files, autodetect suffixes. - - This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) - files with a common suffix. 
Currently the files must have a suffix, e.g. - ".gt.txt" (e.g. ".ocr.txt"). - - Returns pairs of GT and OCR files. - """ - - # Autodetect suffixes - gt_files = find_all_files(gt_dir) - gt_suffix = "".join(common_suffix(gt_files)) - if len(gt_suffix) == 0: - raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") - ocr_files = find_all_files(ocr_dir) - ocr_suffix = "".join(common_suffix(ocr_files)) - if len(ocr_suffix) == 0: - raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") - yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) def test_basic(): From 6980d7a2526380833ffd4d964e1f1b4c58bfed8a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 13:21:49 +0100 Subject: [PATCH 04/20] =?UTF-8?q?=F0=9F=9A=A7=20Use=20our=20own=20removesu?= =?UTF-8?q?ffix()=20as=20we=20still=20support=20Python=203.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 43e4f1a..30b2be1 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -14,6 +14,11 @@ from .word_error_rate import word_error_rate_n, words_normalized def removesuffix(text, suffix): + """ + Remove suffix from text. + + Can be replaced with str.removesuffix when we only support Python >= 3.9. 
+ """ if suffix and text.endswith(suffix): return text[: -len(suffix)] return text @@ -59,7 +64,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): ocr_fn = os.path.join( ocr_dir, - os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, ) if not os.path.exists(ocr_fn): From 73ee16fe5181c29a06f7460ed1fb1dadd84d6cc2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 13:59:55 +0100 Subject: [PATCH 05/20] =?UTF-8?q?=F0=9F=9A=A7=20Support=20'merged'=20GT+OC?= =?UTF-8?q?R=20line=20directories?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 30b2be1..44305d6 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): +def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None): cer = None n_characters = None @@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_words = None word_diff_report = "" - for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)): + if gt_suffix is not None and ocr_suffix is not None: + gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) + else: + gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) + for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): gt_text = plain_extract(gt_fn, include_filename_in_id=True) ocr_text = 
plain_extract(ocr_fn, include_filename_in_id=True) gt_words = words_normalized(gt_text) @@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): @click.option( "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" ) -def main(gt, ocr, report_prefix, metrics): +@click.option("--gt-suffix", help="Suffix of GT line text files") +@click.option("--ocr-suffix", help="Suffix of OCR line text files") +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): """ Compare the GT line text directory against the OCR line text directory. This assumes that the GT line text directory contains textfiles with a common suffix like ".gt.txt", and the OCR line text directory contains textfiles with a common suffix like ".some-ocr.txt". The text files also need to be paired, - i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" - in the OCT lines directory. + i.e. the GT filename "line001.gt.txt" needs to match a filename + "line001.some-ocr.txt" in the OCR lines directory. - The GT and OCR directories are usually round truth line texts and the results of + GT and OCR directories may contain line text files in matching subdirectories, + e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt". + + GT and OCR directories can also be the same directory, but in this case you need + to give --gt-suffix and --ocr-suffix explicitly. + + The GT and OCR directories are usually ground truth line texts and the results of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. 
@@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics): """ initLogging() - process(gt, ocr, report_prefix, metrics=metrics) + process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix) if __name__ == "__main__": From 68344e48f870968a92c6c51afb759c1fa47dea2b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 14:49:40 +0100 Subject: [PATCH 06/20] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20cli=5Fline=5Fdi?= =?UTF-8?q?rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 34 +++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 44305d6..9e806a1 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -23,11 +23,13 @@ def removesuffix(text, suffix): return text[: -len(suffix)] return text + def is_hidden(filepath): filename = os.path.basename(os.path.abspath(filepath)) return filename.startswith(".") -def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + +def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]: """ Find all files in dir_, returning filenames @@ -48,6 +50,7 @@ def all_equal(iterable): g = itertools.groupby(iterable) return next(g, True) and not next(g, False) + def common_prefix(its): return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] @@ -55,7 +58,10 @@ def common_prefix(its): def common_suffix(its): return reversed(common_prefix(reversed(it) for it in its)) -def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + +def find_gt_and_ocr_files( + gt_dir, gt_suffix, ocr_dir, ocr_suffix +) -> Iterator[Tuple[str, str]]: """ Find GT files and matching OCR files. 
@@ -64,8 +70,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): ocr_fn = os.path.join( ocr_dir, - removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) - + ocr_suffix, + removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, ) if not os.path.exists(ocr_fn): raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") @@ -88,16 +93,22 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): gt_files = find_all_files(gt_dir) gt_suffix = "".join(common_suffix(gt_files)) if len(gt_suffix) == 0: - raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + raise RuntimeError( + f"Files in GT directory {gt_dir} do not have a common suffix" + ) ocr_files = find_all_files(ocr_dir) ocr_suffix = "".join(common_suffix(ocr_files)) if len(ocr_suffix) == 0: - raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + raise RuntimeError( + f"Files in OCR directory {ocr_dir} do not have a common suffix" + ) yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None): +def process( + gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None +): cer = None n_characters = None @@ -216,7 +227,14 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): """ initLogging() - process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix) + process( + gt, + ocr, + report_prefix, + metrics=metrics, + gt_suffix=gt_suffix, + ocr_suffix=ocr_suffix, + ) if __name__ == "__main__": From 9414a92f9f31760a694c44f06069f7677e679078 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 15:19:37 +0100 Subject: [PATCH 07/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Type-?= =?UTF-8?q?annotate=20functions?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 9e806a1..2cd4fe6 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Iterator, Tuple +from typing import Callable, Iterator, Optional, Tuple import click from jinja2 import Environment, FileSystemLoader @@ -29,7 +29,9 @@ def is_hidden(filepath): return filename.startswith(".") -def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]: +def find_all_files( + dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False +) -> Iterator[str]: """ Find all files in dir_, returning filenames @@ -60,7 +62,7 @@ def common_suffix(its): def find_gt_and_ocr_files( - gt_dir, gt_suffix, ocr_dir, ocr_suffix + gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str ) -> Iterator[Tuple[str, str]]: """ Find GT files and matching OCR files. From c37316da097d18b74f0da2398b53b64ab712495f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 19:57:12 +0100 Subject: [PATCH 08/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?= =?UTF-8?q?=20word=20differences=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the time of generation of the section, the {gt,ocr}_words generators were drained. Fix by using a list. Fixes gh-124. 
--- src/dinglehopper/cli_line_dirs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 2cd4fe6..2861d6f 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Callable, Iterator, Optional, Tuple +from typing import Callable, Iterator, Optional, Tuple, List import click from jinja2 import Environment, FileSystemLoader @@ -127,8 +127,8 @@ def process( for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): gt_text = plain_extract(gt_fn, include_filename_in_id=True) ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) - gt_words = words_normalized(gt_text) - ocr_words = words_normalized(ocr_text) + gt_words: List[str] = list(words_normalized(gt_text)) + ocr_words: List[str] = list(words_normalized(ocr_text)) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) From 322faeb26c2c60d8d777ab6132b9af397d0fd510 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 09:21:09 +0100 Subject: [PATCH 09/20] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 2861d6f..5cd1bfa 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Callable, Iterator, Optional, Tuple, List +from typing import Callable, Iterator, List, Optional, Tuple import click from jinja2 import Environment, FileSystemLoader From 3b16c14c16dd00500574b74031107768d5cbb465 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 09:50:24 +0100 Subject: [PATCH 10/20] =?UTF-8?q?=E2=9C=94=20=20Properly=20test=20line=20d?= 
=?UTF-8?q?ir=20finding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .../data/line_dirs}/basic/gt/a.gt.txt | 0 .../data/line_dirs}/basic/gt/b.gt.txt | 0 .../data/line_dirs}/basic/ocr/a.some-ocr.txt | 0 .../data/line_dirs}/basic/ocr/b.some-ocr.txt | 0 .../data/line_dirs}/merged/a/a.dummy.jpg | 0 .../data/line_dirs}/merged/a/a.gt.txt | 0 .../data/line_dirs}/merged/a/a.some-ocr.txt | 0 .../data/line_dirs}/merged/b/b.dummy.jpg | 0 .../data/line_dirs}/merged/b/b.gt.txt | 0 .../data/line_dirs}/merged/b/b.some-ocr.txt | 0 .../data/line_dirs}/subdirs/gt/a/a.gt.txt | 0 .../data/line_dirs}/subdirs/gt/b/b.gt.txt | 0 .../line_dirs}/subdirs/ocr/a/a.some-ocr.txt | 0 .../line_dirs}/subdirs/ocr/b/b.some-ocr.txt | 0 .../test_line_dirs.py} | 40 ++++++++----------- 16 files changed, 18 insertions(+), 23 deletions(-) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/b.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.dummy.jpg (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.dummy.jpg (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/a/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/b/b.gt.txt (100%) 
rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/a/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/b/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test.py => tests/test_line_dirs.py} (52%) diff --git a/.gitignore b/.gitignore index d931831..66d66bc 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ dmypy.json # User-specific stuff .idea +.*.swp # Build artifacts /build diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt 
b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt 
b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/tests/test_line_dirs.py similarity index 52% rename from src/dinglehopper/line_dirs_test.py rename to src/dinglehopper/tests/test_line_dirs.py index 9827f01..03966e1 100644 --- a/src/dinglehopper/line_dirs_test.py +++ b/src/dinglehopper/tests/test_line_dirs.py @@ -1,29 +1,30 @@ -import os.path -import itertools -from typing import Iterator, Tuple +import os +from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") def test_basic(): """Test the dumb method: User gives directories and suffixes.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/basic/gt", + os.path.join(data_dir, "line_dirs/basic/gt"), ".gt.txt", - "line_dirs_test/basic/ocr", + os.path.join(data_dir, "line_dirs/basic/ocr"), ".some-ocr.txt", ) ) assert len(pairs) == 2 + def test_basic_autodetect(): - """Test the autodetect method: User gives directories, suffixes are autodetected if possible""" + """Test autodetect: User gives directories, suffixes are autodetected if possible""" pairs = list( find_gt_and_ocr_files_autodetect( - "line_dirs_test/basic/gt", - "line_dirs_test/basic/ocr", + os.path.join(data_dir, "line_dirs/basic/gt"), + os.path.join(data_dir, "line_dirs/basic/ocr"), ) ) @@ -34,9 +35,9 @@ def test_subdirs(): """Test the dumb method: Should also work when subdirectories 
are involved.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/subdirs/gt", + os.path.join(data_dir, "line_dirs/subdirs/gt"), ".gt.txt", - "line_dirs_test/subdirs/ocr", + os.path.join(data_dir, "line_dirs/subdirs/ocr"), ".some-ocr.txt", ) ) @@ -48,30 +49,23 @@ def test_subdirs_autodetect(): """Test the autodetect method: Should also work when subdirectories are involved.""" pairs = list( find_gt_and_ocr_files_autodetect( - "line_dirs_test/subdirs/gt", - "line_dirs_test/subdirs/ocr", + os.path.join(data_dir, "line_dirs/subdirs/gt"), + os.path.join(data_dir, "line_dirs/subdirs/ocr"), ) ) assert len(pairs) == 2 + def test_merged(): - """Test the dumb method: Should also work when GT and OCR texts are in the same directories.""" + """Test the dumb method: GT and OCR texts are in the same directories.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/merged", + os.path.join(data_dir, "line_dirs/merged"), ".gt.txt", - "line_dirs_test/merged", + os.path.join(data_dir, "line_dirs/merged"), ".some-ocr.txt", ) ) assert len(pairs) == 2 - -if __name__ == "__main__": - test_basic() - test_subdirs() - test_merged() - - test_basic_autodetect() - test_subdirs_autodetect() From f1a586cff1d306d3fbef95c8110af74d3941a894 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 10:36:58 +0100 Subject: [PATCH 11/20] =?UTF-8?q?=E2=9C=94=20=20Test=20line=20dirs=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_integ_cli_line_dirs.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 src/dinglehopper/tests/test_integ_cli_line_dirs.py diff --git a/src/dinglehopper/tests/test_integ_cli_line_dirs.py b/src/dinglehopper/tests/test_integ_cli_line_dirs.py new file mode 100644 index 0000000..90cbabf --- /dev/null +++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py @@ -0,0 +1,61 @@ +import json +import os.path +import re + +import pytest + +from ..cli_line_dirs import 
process +from .util import working_directory + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + + +@pytest.mark.integration +def test_cli_line_dirs_basic(tmp_path): + """Test that the cli/process() produces a good report""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") + ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") + process(gt_dir, ocr_dir, "report") + with open("report.json", "r") as jsonf: + print(jsonf.read()) + with open("report.json", "r") as jsonf: + j = json.load(jsonf) + assert j["cer"] == pytest.approx(0.1071429) + assert j["wer"] == pytest.approx(0.5) + + +@pytest.mark.integration +def test_cli_line_dirs_basic_report_diff(tmp_path): + """Test that the cli/process() produces a report with char+word diff""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") + ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") + process(gt_dir, ocr_dir, "report") + + with open("report.html", "r") as htmlf: + html_report = htmlf.read() + + # Counting GT lines in the diff + assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2 + assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2 + + +@pytest.mark.integration +def test_cli_line_dirs_merged(tmp_path): + """Test that the cli/process() produces a good report""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/merged") + ocr_dir = os.path.join(data_dir, "line_dirs/merged") + process( + gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt" + ) + with open("report.json", "r") as jsonf: + print(jsonf.read()) + with open("report.json", "r") as jsonf: + j = json.load(jsonf) + assert j["cer"] == pytest.approx(0.1071429) + assert j["wer"] == pytest.approx(0.5) From 480b3cf864ba1ba5c26ed550760b53193b91e93d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 11:14:07 +0100 Subject: [PATCH 12/20] 
=?UTF-8?q?=E2=9C=94=20=20Test=20that=20CLI=20produc?= =?UTF-8?q?es=20a=20complete=20HTML=20report?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...json.py => test_integ_cli_valid_report.py} | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) rename src/dinglehopper/tests/{test_integ_cli_valid_json.py => test_integ_cli_valid_report.py} (64%) diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_report.py similarity index 64% rename from src/dinglehopper/tests/test_integ_cli_valid_json.py rename to src/dinglehopper/tests/test_integ_cli_valid_report.py index 6cbfa0c..fed0d28 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py @@ -1,4 +1,5 @@ import json +import re import pytest @@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path): with open("report.json", "r") as jsonf: j = json.load(jsonf) assert j["cer"] == pytest.approx(float("inf")) + + +@pytest.mark.integration +def test_cli_html(tmp_path): + """Test that the cli/process() yields complete HTML report""" + + with working_directory(tmp_path): + with open("gt.txt", "w") as gtf: + gtf.write("AAAAA") + with open("ocr.txt", "w") as ocrf: + ocrf.write("AAAAB") + + process("gt.txt", "ocr.txt", "report") + + with open("report.html", "r") as htmlf: + html_report = htmlf.read() + print(html_report) + + assert re.search(r"CER: 0\.\d+", html_report) + assert re.search(r"WER: 1\.0", html_report) + assert len(re.findall("gt.*cdiff", html_report)) == 1 + assert len(re.findall("gt.*wdiff", html_report)) == 1 From cf59b951a3a30cd23e36a0bb2e553f2d6abcee20 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 4 Feb 2025 13:54:28 +0100 Subject: [PATCH 13/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?= =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 27 +++++++++++++++++++++++---- src/dinglehopper/ocr_files.py | 22 ++++++++++++++++------ 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 5cd1bfa..4064de0 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): def process( - gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None + gt_dir, + ocr_dir, + report_prefix, + *, + metrics=True, + gt_suffix=None, + ocr_suffix=None, + plain_encoding="autodetect", ): cer = None @@ -125,8 +132,12 @@ def process( gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): - gt_text = plain_extract(gt_fn, include_filename_in_id=True) - ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) + gt_text = plain_extract( + gt_fn, include_filename_in_id=True, encoding=plain_encoding + ) + ocr_text = plain_extract( + ocr_fn, include_filename_in_id=True, encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -202,7 +213,12 @@ def process( ) @click.option("--gt-suffix", help="Suffix of GT line text files") @click.option("--ocr-suffix", help="Suffix of OCR line text files") -def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. "utf-8") of plain text files', +) +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): """ Compare the GT line text directory against the OCR line text directory. @@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): $REPORT_PREFIX defaults to "report". 
The reports include the character error rate (CER) and the word error rate (WER). + It is recommended to specify the encoding of the text files, for example with + --plain-encoding utf-8. If this option is not given, we try to auto-detect it. """ initLogging() process( @@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix, + plain_encoding=plain_encoding, ) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 1593f44..1eecebb 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional import chardet from lxml import etree as ET from lxml.etree import XMLSyntaxError +from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb +log = getLogger("processor.OcrdDinglehopperEvaluate") + def alto_namespace(tree: ET._ElementTree) -> Optional[str]: """Return the ALTO namespace used in the given ElementTree. 
@@ -149,7 +152,7 @@ def detect_encoding(filename): return chardet.detect(open(filename, "rb").read(1024))["encoding"] -def plain_extract(filename, include_filename_in_id=False): +def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"): id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" def make_segment(no, line): @@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False): clusters, ) - fileencoding = detect_encoding(filename) + if encoding == "autodetect": + fileencoding = detect_encoding(filename) + log.warn( + f"Autodetected encoding as '{fileencoding}'" + ", it is recommended to specify it explicitly with --plain-encoding" + ) + else: + fileencoding = encoding with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, @@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False): # XXX hardcoded SBB normalization -def plain_text(filename): - return plain_extract(filename).text +def plain_text(filename, encoding="autodetect"): + return plain_extract(filename, encoding=encoding).text -def extract(filename, *, textequiv_level="region"): +def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. 
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"): try: tree = ET.parse(filename) except (XMLSyntaxError, UnicodeDecodeError): - return plain_extract(filename) + return plain_extract(filename, encoding=plain_encoding) try: return page_extract(tree, textequiv_level=textequiv_level) except ValueError: From 5578ce83a3600bbe6f6a0a2679f2b35c90b34fe4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:39:29 +0100 Subject: [PATCH 14/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?= =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index b67e9cc..5e5e81c 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -114,6 +114,7 @@ def process( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: """Check OCR result against GT. @@ -121,8 +122,12 @@ def process( this undecorated version and use Click on a wrapper. 
""" - gt_text = extract(gt, textequiv_level=textequiv_level) - ocr_text = extract(ocr, textequiv_level=textequiv_level) + gt_text = extract( + gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) + ocr_text = extract( + ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -195,6 +200,7 @@ def process_dir( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: for gt_file in os.listdir(gt): gt_file_path = os.path.join(gt, gt_file) @@ -209,6 +215,7 @@ def process_dir( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) @@ -233,6 +240,11 @@ def process_dir( help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. 
"utf-8") of plain text files', +) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.version_option() def main( @@ -243,6 +255,7 @@ def main( metrics, differences, textequiv_level, + plain_encoding, progress, ): """ @@ -280,6 +293,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: process( @@ -290,6 +304,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) From 9db5b4caf5b6335066e121a231cee1b1298bfbfa Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:48:50 +0100 Subject: [PATCH 15/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter=20?= =?UTF-8?q?for=20plain=20text=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 5 +++++ src/dinglehopper/ocrd_cli.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 43795e1..ae7c9bb 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -25,6 +25,11 @@ "enum": ["region", "line"], "default": "region", "description": "PAGE XML hierarchy level to extract the text from" + }, + "plain_encoding": { + "type": "string", + "default": "autodetect", + "description": "Encoding (e.g. 
\"utf-8\") of plain text files" } } } diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index fa4747f..2d7da8e 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor): assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] + plain_encoding = self.parameter["plain_encoding"] # wrong number of inputs: let fail gt_file, ocr_file = input_files @@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor): self.output_file_grp, metrics=metrics, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) # Add reports to the workspace From 224aa02163b5ba28a4f44569b4cbb04d0dae4188 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:50:21 +0100 Subject: [PATCH 16/20] =?UTF-8?q?=F0=9F=9A=A7=20Fix=20help=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 2 +- src/dinglehopper/cli_line_dirs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 5e5e81c..2d3c075 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -243,7 +243,7 @@ def process_dir( @click.option( "--plain-encoding", default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', + help='Encoding (e.g. "utf-8") of plain text files', ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.version_option() diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 4064de0..0160f87 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -216,7 +216,7 @@ def process( @click.option( "--plain-encoding", default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', + help='Encoding (e.g. 
"utf-8") of plain text files', ) def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): """ From a70260c10edbff774fcae1d3f636b2b5e806d4ae Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 13:56:13 +0200 Subject: [PATCH 17/20] =?UTF-8?q?=F0=9F=90=9B=20Use=20warning()=20to=20fix?= =?UTF-8?q?=20DeprecationWarning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocr_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 1eecebb..fdcaf54 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -168,7 +168,7 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect") if encoding == "autodetect": fileencoding = detect_encoding(filename) - log.warn( + log.warning( f"Autodetected encoding as '{fileencoding}'" ", it is recommended to specify it explicitly with --plain-encoding" ) From 14a4bc56d85bd953153bf64bcb95a92413814efb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 18:24:35 +0200 Subject: [PATCH 18/20] =?UTF-8?q?=F0=9F=90=9B=20Add=20--plain-encoding=20o?= =?UTF-8?q?ption=20to=20dinglehopper-extract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_extract.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py index 9c51d34..5fce032 100644 --- a/src/dinglehopper/cli_extract.py +++ b/src/dinglehopper/cli_extract.py @@ -12,7 +12,12 @@ from .ocr_files import extract help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) -def main(input_file, textequiv_level): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. 
"utf-8") of plain text files', +) +def main(input_file, textequiv_level, plain_encoding): """ Extract the text of the given INPUT_FILE. @@ -23,7 +28,9 @@ def main(input_file, textequiv_level): use "--textequiv-level line" to extract from the level of TextLine tags. """ initLogging() - input_text = extract(input_file, textequiv_level=textequiv_level).text + input_text = extract( + input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ).text print(input_text) From 9fc8937324b8ba2c94ddd865fb8c05fa5f92c49d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 24 Apr 2025 15:13:19 +0200 Subject: [PATCH 19/20] =?UTF-8?q?=E2=9C=92=20=20README:=20Mention=20dingle?= =?UTF-8?q?hopper-line-dirs=20--help?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76fcc5a..a40db79 100644 --- a/README.md +++ b/README.md @@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt. with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate CLI interface: -~~~ +``` dinglehopper-line-dirs gt/ ocr/ -~~~ +``` + +The CLI `dinglehopper-line-dirs` can also work with GT text files in the same +directories as the OCR text files. You should read `dinglehopper-line-dirs --help` +in this case. 
### dinglehopper-extract The tool `dinglehopper-extract` extracts the text of the given input file on From 5639f3db7f12647694c4ef03437af00227f45f58 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 24 Apr 2025 16:44:29 +0200 Subject: [PATCH 20/20] =?UTF-8?q?=E2=9C=94=20=20Add=20a=20tests=20that=20c?= =?UTF-8?q?hecks=20if=20plain=20text=20files=20with=20BOM=20are=20read=20c?= =?UTF-8?q?orrectly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/tests/test_ocr_files.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 342507a..0c2a500 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -182,3 +182,15 @@ def test_plain(tmp_path): result = plain_text("ocr.txt") expected = "First, a line.\nAnd a second line." assert result == expected + + +def test_plain_BOM(tmp_path): + """Test that plain text files with BOM are read correctly.""" + BOM = "\ufeff" + with working_directory(tmp_path): + with open("ocr.txt", "w") as ocrf: + ocrf.write(BOM + "First, a line.\nAnd a second line.\n") + + result = plain_text("ocr.txt") + expected = "First, a line.\nAnd a second line." + assert result == expected