From e4431797e638641c5aed021e6b519746db921a39 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 3 Aug 2023 19:46:01 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20comments=20+=20string?=
 =?UTF-8?q?s=20manually=20(not=20auto-fixed=20by=20Black)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py                                  | 4 ++--
 src/dinglehopper/ocr_files.py                            | 9 +++++----
 src/dinglehopper/tests/test_align.py                     | 3 ++-
 src/dinglehopper/tests/test_character_error_rate.py      | 3 ++-
 src/dinglehopper/tests/test_integ_align.py               | 4 +++-
 src/dinglehopper/tests/test_integ_word_error_rate_ocr.py | 4 ++--
 src/dinglehopper/tests/test_ocr_files.py                 | 3 ++-
 src/dinglehopper/tests/test_word_error_rate.py           | 3 ++-
 src/dinglehopper/word_error_rate.py                      | 5 +++--
 9 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 43bfb2e..3f3c835 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -116,8 +116,8 @@ def process(
 ):
     """Check OCR result against GT.
 
-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions, so we keep
+    this undecorated version and use Click on a wrapper.
     """
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index da7b973..5c4339b 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
     """Return the ALTO namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
-    check if the files uses any valid ALTO namespace.
+    This relies on the assumption that, in any given ALTO file, the root element has the
+    local name "alto". We do not check if the file uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "alto":
@@ -48,8 +48,9 @@ def alto_text(tree):
 def page_namespace(tree):
     """Return the PAGE content namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
-    do not check if the files uses any valid PAGE namespace.
+    This relies on the assumption that, in any given PAGE content file, the root element
+    has the local name "PcGts". We do not check if the file uses any valid PAGE
+    namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "PcGts":
diff --git a/src/dinglehopper/tests/test_align.py b/src/dinglehopper/tests/test_align.py
index d8b051b..2c4e23a 100644
--- a/src/dinglehopper/tests/test_align.py
+++ b/src/dinglehopper/tests/test_align.py
@@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors():
     result = list(
         align(
             "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
+            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
         )
     )
     left, right = unzip(result)
diff --git a/src/dinglehopper/tests/test_character_error_rate.py b/src/dinglehopper/tests/test_character_error_rate.py
index 39301b4..970f740 100644
--- a/src/dinglehopper/tests/test_character_error_rate.py
+++ b/src/dinglehopper/tests/test_character_error_rate.py
@@ -36,6 +36,7 @@ def test_character_error_rate_hard():
         len(s2) == 7
     )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
 
-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER should
+    # be symmetrical.
     assert character_error_rate(s2, s1) == 1 / 6
     assert character_error_rate(s1, s2) == 1 / 6
diff --git a/src/dinglehopper/tests/test_integ_align.py b/src/dinglehopper/tests/test_integ_align.py
index 74b8c7e..b011ee5 100644
--- a/src/dinglehopper/tests/test_integ_align.py
+++ b/src/dinglehopper/tests/test_integ_align.py
@@ -15,7 +15,9 @@ def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
     # → 2 elements in the alignment should be different, the ligature is
     # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
+    # NOTE: In this example, it doesn't matter that we work with "characters", not
+    # grapheme clusters.
 
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
diff --git a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py
index 65b2f54..8a57ed2 100644
--- a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
-    # the ligature does not count → 2 errors
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # So we have 3 changed words, the ligature does not count → 2 errors
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
 
     gt_word_count = (
diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py
index 4a1f485..4790c85 100644
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -159,7 +159,8 @@ def test_page_level():
     result = page_text(tree, textequiv_level="line")
     assert (
         result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+        + "Als er einsmals in dem Oberhauſe eine Bill we-"
     )
 
 
diff --git a/src/dinglehopper/tests/test_word_error_rate.py b/src/dinglehopper/tests/test_word_error_rate.py
index bc7b91e..311ffff 100644
--- a/src/dinglehopper/tests/test_word_error_rate.py
+++ b/src/dinglehopper/tests/test_word_error_rate.py
@@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
     result = list(
         words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
+            "der Frau Amtmnnin das ver⸗\n"
             "ſproene zu berliefern."
         )
     )
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index 470bf1f..9bf36b6 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -52,8 +52,9 @@ def words(s: str):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
-    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
+    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
+    # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
+    # only whitespace, punctuation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass