From 0f0819512e4e958a88eaf1fa9a0e9a8f13d7c200 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 2 Mar 2023 10:22:51 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20using=20Black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 10 +++++++--- qurator/dinglehopper/ocr_files.py | 15 +++++++++++---- qurator/dinglehopper/ocrd_cli.py | 2 +- qurator/dinglehopper/tests/test_integ_ocrd_cli.py | 2 +- qurator/dinglehopper/word_error_rate.py | 2 -- setup.py | 2 +- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 4c07ce5..950f668 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -32,7 +32,7 @@ def common_suffix(its): def removesuffix(text, suffix): if suffix and text.endswith(suffix): - return text[:-len(suffix)] + return text[: -len(suffix)] return text @@ -52,7 +52,9 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): ocr = removesuffix(gt, gt_suffix) + ocr_suffix gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) - ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True) + ocr_text = plain_extract( + os.path.join(ocr_dir, ocr), include_filename_in_id=True + ) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) @@ -60,7 +62,9 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): cer, n_characters = l_cer, l_n_characters else: # Rolling update - cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters) + cer = (cer * n_characters + l_cer * l_n_characters) / ( + n_characters + l_n_characters + ) n_characters = n_characters + l_n_characters # Compute WER diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 69f4df7..97e56ed 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -98,14 +98,18 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level): ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children) ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"])) - elif ET.QName(group.tag).localname in ["UnorderedGroup","UnorderedGroupIndexed"]: + elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]: ro_children = list(group) else: raise NotImplementedError - for ro_child in ro_children: - if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup", "UnorderedGroupIndexed"]: + if ET.QName(ro_child.tag).localname in [ + "OrderedGroup", + "OrderedGroupIndexed", + "UnorderedGroup", + "UnorderedGroupIndexed", + ]: regions.extend( extract_texts_from_reading_order_group( ro_child, tree, nsmap, textequiv_level @@ -139,7 +143,10 @@ def plain_extract(filename, include_filename_in_id=False): [ ExtractedText( id_template.format(filename=os.path.basename(filename), no=no), - None, None, normalize_sbb(line)) + None, + None, + normalize_sbb(line), + ) for no, line in enumerate(f.readlines()) ], "\n", diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index 7c513e6..9578a0a 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -33,7 +33,7 @@ class OcrdDinglehopperEvaluate(Processor): textequiv_level = self.parameter["textequiv_level"] gt_grp, ocr_grp = self.input_file_grp.split(",") - input_file_tuples = self.zip_input_files(on_error='abort') + input_file_tuples = self.zip_input_files(on_error="abort") for n, (gt_file, ocr_file) in enumerate(input_file_tuples): if not gt_file or not ocr_file: # file/page was not found in this group diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py index 8aff22d..652b850 100644 --- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -15,7 +15,7 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration -@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix") +@pytest.mark.skipif(sys.platform == "win32", reason="only on unix") def test_ocrd_cli(tmp_path): """Test OCR-D interface""" diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 8f0cc96..64dc36c 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -42,10 +42,8 @@ def words(s: str): if not word_break_patched: patch_word_break() - # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar def unwanted(c): - # See https://www.fileformat.info/info/unicode/category/index.htm # and https://unicodebook.readthedocs.io/unicode.html#categories unwanted_categories = "O", "M", "P", "Z", "S" diff --git a/setup.py b/setup.py index be17cc6..646a50f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import find_packages, setup with open("requirements.txt") as fp: install_requires = fp.read() -with open('requirements-dev.txt') as fp: +with open("requirements-dev.txt") as fp: tests_require = fp.read() setup(