diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 43bfb2e..3f3c835 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -116,8 +116,8 @@ def process( ): """Check OCR result against GT. - The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use - Click on a wrapper. + The @click decorators change the signature of the decorated functions, so we keep + this undecorated version and use Click on a wrapper. """ gt_text = extract(gt, textequiv_level=textequiv_level) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index da7b973..5c4339b 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb def alto_namespace(tree: ET.ElementTree) -> str: """Return the ALTO namespace used in the given ElementTree. - This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not - check if the files uses any valid ALTO namespace. + This relies on the assumption that, in any given ALTO file, the root element has the + local name "alto". We do not check if the file uses any valid ALTO namespace. """ root_name = ET.QName(tree.getroot().tag) if root_name.localname == "alto": @@ -48,8 +48,9 @@ def alto_text(tree): def page_namespace(tree): """Return the PAGE content namespace used in the given ElementTree. - This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We - do not check if the files uses any valid PAGE namespace. + This relies on the assumption that, in any given PAGE content file, the root element + has the local name "PcGts". We do not check if the file uses any valid PAGE + namespace. 
""" root_name = ET.QName(tree.getroot().tag) if root_name.localname == "PcGts": diff --git a/src/dinglehopper/tests/test_align.py b/src/dinglehopper/tests/test_align.py index d8b051b..2c4e23a 100644 --- a/src/dinglehopper/tests/test_align.py +++ b/src/dinglehopper/tests/test_align.py @@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors(): result = list( align( "Über die vielen Sorgen wegen desselben vergaß", - "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab", + "SomeJunk MoreJunk " + + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab", ) ) left, right = unzip(result) diff --git a/src/dinglehopper/tests/test_character_error_rate.py b/src/dinglehopper/tests/test_character_error_rate.py index 39301b4..970f740 100644 --- a/src/dinglehopper/tests/test_character_error_rate.py +++ b/src/dinglehopper/tests/test_character_error_rate.py @@ -36,6 +36,7 @@ def test_character_error_rate_hard(): len(s2) == 7 ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points - # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. + # Both strings have the same length in terms of grapheme clusters. So the CER should + # be symmetrical. assert character_error_rate(s2, s1) == 1 / 6 assert character_error_rate(s1, s2) == 1 / 6 diff --git a/src/dinglehopper/tests/test_integ_align.py b/src/dinglehopper/tests/test_integ_align.py index 74b8c7e..b011ee5 100644 --- a/src/dinglehopper/tests/test_integ_align.py +++ b/src/dinglehopper/tests/test_integ_align.py @@ -15,7 +15,9 @@ def test_align_page_files(): # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # → 2 elements in the alignment should be different, the ligature is # (currently) not counted due to normalization. - # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. 
+ # + # NOTE: In this example, it doesn't matter that we work with "characters", not + # grapheme clusters. gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) diff --git a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 65b2f54..8a57ed2 100644 --- a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration def test_word_error_rate_between_page_files(): - # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, - # the ligature does not count → 2 errors + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # So we have 3 changed words, the ligature does not count → 2 errors gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt_word_count = ( diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 4a1f485..4790c85 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -159,7 +159,8 @@ def test_page_level(): result = page_text(tree, textequiv_level="line") assert ( result - == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-" + == "Hand, Mylord? 
fragte der Graf von Rocheſter.\n" + "Als er einsmals in dem Oberhauſe eine Bill we-" ) diff --git a/src/dinglehopper/tests/test_word_error_rate.py b/src/dinglehopper/tests/test_word_error_rate.py index bc7b91e..311ffff 100644 --- a/src/dinglehopper/tests/test_word_error_rate.py +++ b/src/dinglehopper/tests/test_word_error_rate.py @@ -27,7 +27,8 @@ def test_words(): def test_words_private_use_area(): result = list( words( - "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n" + "ber die vielen Sorgen wegen deelben vergaß Hartkopf, " + "der Frau Amtmnnin das ver⸗\n" "ſproene zu berliefern." ) ) diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index 470bf1f..9bf36b6 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -52,8 +52,9 @@ def words(s: str): cat = subcat[0] return cat in unwanted_categories or subcat in unwanted_subcategories - # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using - # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." + # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on + # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain + # only whitespace, punctuation "or similar characters." for word in uniseg.wordbreak.words(s): if all(unwanted(c) for c in word): pass