From 2383730a55297be8903d01c1c8e5686a274539ef Mon Sep 17 00:00:00 2001
From: Mike Gerber
Date: Mon, 8 Apr 2024 20:33:03 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=94=20Test=20using=20empty=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test edge cases + empty files, e.g. empty text content and a Unicode BOM character.

See also gh-79.
---
Note: this patch was edited by hand after export (explicit UTF-8 encoding
when writing the test files, docstring added); the blob id in the index
line below therefore no longer matches the resulting file.

 .../tests/test_integ_empty_files.py           | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 src/dinglehopper/tests/test_integ_empty_files.py

diff --git a/src/dinglehopper/tests/test_integ_empty_files.py b/src/dinglehopper/tests/test_integ_empty_files.py
new file mode 100644
index 0000000..5c90ed1
--- /dev/null
+++ b/src/dinglehopper/tests/test_integ_empty_files.py
@@ -0,0 +1,43 @@
+from __future__ import division, print_function
+
+import math
+
+import pytest
+
+from .. import character_error_rate, plain_text
+from .util import working_directory
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "gt_file_content,ocr_file_content,cer_expected",
+    [
+        ("", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "", 1.0),
+        ("\ufeff", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "\ufeff", 1.0),
+        ("", "", 0.0),
+        ("\ufeff", "", 0.0),
+        ("", "\ufeff", 0.0),
+    ],
+)
+def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
+    """Check the CER for files that are empty or contain only a Unicode BOM.
+
+    Each case writes the GT and OCR contents to text files, reads them back
+    with plain_text() and compares character_error_rate() of the two texts
+    with the expected value.
+    """
+    with working_directory(tmp_path):
+
+        # Write as UTF-8 explicitly: the platform default encoding may not be
+        # able to encode "\ufeff" (e.g. cp1252 on Windows).
+        with open("gt.txt", "w", encoding="utf-8") as gtf:
+            gtf.write(gt_file_content)
+        with open("ocr.txt", "w", encoding="utf-8") as ocrf:
+            ocrf.write(ocr_file_content)
+
+        gt_text = plain_text("gt.txt")
+        ocr_text = plain_text("ocr.txt")
+
+        assert character_error_rate(gt_text, ocr_text) == cer_expected