🎨 Reformat comments + strings manually (not auto-fixed by Black)

pull/90/head
Mike Gerber 9 months ago
parent 704e7cca1c
commit e4431797e6

@@ -116,8 +116,8 @@ def process(
 ):
     """Check OCR result against GT.
 
-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions, so we keep
+    this undecorated version and use Click on a wrapper.
     """
     gt_text = extract(gt, textequiv_level=textequiv_level)
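
A minimal sketch of the wrapper pattern this docstring describes, assuming a reduced parameter set (only gt and textequiv_level appear in this hunk; dinglehopper's real CLI has more options):

import click

def process(gt, ocr, textequiv_level="region"):
    """Check OCR result against GT (plain function: easy to call and to test)."""
    print(f"comparing {gt} and {ocr} at {textequiv_level!r} level")  # stand-in body

@click.command()
@click.argument("gt")
@click.argument("ocr")
@click.option("--textequiv-level", default="region")
def main(gt, ocr, textequiv_level):
    # Click turns main into a click.Command object; process() keeps its
    # ordinary Python signature.
    process(gt, ocr, textequiv_level=textequiv_level)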

@@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
     """Return the ALTO namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
-    check if the file uses any valid ALTO namespace.
+    This relies on the assumption that, in any given ALTO file, the root element has the
+    local name "alto". We do not check if the file uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "alto":
@@ -48,8 +48,9 @@ def alto_text(tree):
 def page_namespace(tree):
     """Return the PAGE content namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
-    do not check if the file uses any valid PAGE namespace.
+    This relies on the assumption that, in any given PAGE content file, the root element
+    has the local name "PcGts". We do not check if the file uses any valid PAGE
+    namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "PcGts":
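
alto_namespace and page_namespace follow the same pattern: take the qualified name of the root element and, if its local name matches, return whatever namespace the file actually declares. A minimal sketch, assuming lxml.etree (whose QName exposes .localname and .namespace) and a hypothetical input file:

from lxml import etree as ET

tree = ET.parse("example.page.xml")  # hypothetical file name
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "PcGts":   # "alto" in the ALTO case
    print(root_name.namespace)       # the namespace URI the file really uses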

@@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors():
     result = list(
         align(
             "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
+            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
         )
     )
     left, right = unzip(result)
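
Black does not re-wrap the contents of long string literals, so these test strings are split by hand. The two wrapping styles used in this commit are equivalent: adjacent string literals are concatenated at compile time, and explicit + concatenation of string constants yields the same value. A small check, reusing the literal from this hunk:

explicit = (
    "SomeJunk MoreJunk "
    + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"
)
implicit = (
    "SomeJunk MoreJunk "
    "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"
)
assert explicit == implicit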

@@ -36,6 +36,7 @@ def test_character_error_rate_hard():
         len(s2) == 7
     )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER
+    # should be symmetrical.
     assert character_error_rate(s2, s1) == 1 / 6
     assert character_error_rate(s1, s2) == 1 / 6
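
The strings themselves are outside this hunk; the following stand-ins (an assumption, not the test's actual data) have the code point structure the comments describe, and uniseg's grapheme cluster segmentation shows why the CER is symmetrical:

import unicodedata
from uniseg.graphemecluster import grapheme_clusters

s1 = unicodedata.normalize("NFC", "Schlyñ")  # 6 code points, precomposed ñ
s2 = "Schlym\u0303"                          # 7 code points, m + COMBINING TILDE
assert len(s1) == 6 and len(s2) == 7
# Both strings are 6 grapheme clusters long; one differing cluster out of 6
# gives a CER of 1/6 in either direction.
assert len(list(grapheme_clusters(s1))) == 6
assert len(list(grapheme_clusters(s2))) == 6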

@@ -15,7 +15,9 @@ def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
     # → 2 elements in the alignment should be different, the ligature is
     # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
+    # NOTE: In this example, it doesn't matter that we work with "characters", not
+    # grapheme clusters.
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

@@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
-    # the ligature does not count → 2 errors
+    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # So we have 3 changed words, the ligature does not count → 2 errors
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     gt_word_count = (

@@ -159,7 +159,8 @@ def test_page_level():
     result = page_text(tree, textequiv_level="line")
     assert (
         result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+        + "Als er einsmals in dem Oberhauſe eine Bill we-"
     )

@@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
     result = list(
         words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
+            "der Frau Amtmnnin das ver⸗\n"
             "ſproene zu berliefern."
         )
     )

@@ -52,8 +52,9 @@ def words(s: str):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
-    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
+    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
+    # word boundaries using uniseg.wordbreak.words() and ignore all "words" that
+    # contain only whitespace, punctuation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
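
Put together, the segmentation in this hunk works roughly as follows; the category sets are assumptions, since the hunk does not show dinglehopper's actual unwanted_categories and unwanted_subcategories:

import unicodedata
import uniseg.wordbreak

def words(s: str):
    unwanted_categories = ("P", "Z", "S")  # assumed: punctuation, separators, symbols
    unwanted_subcategories = ("Cc", "Cf")  # assumed: control and format characters

    def unwanted(c):
        subcat = unicodedata.category(c)   # e.g. "Lu", "Zs", "Po"
        cat = subcat[0]
        return cat in unwanted_categories or subcat in unwanted_subcategories

    # Split on UAX #29 word boundaries, then drop "words" consisting entirely
    # of whitespace, punctuation or similar characters.
    for word in uniseg.wordbreak.words(s):
        if all(unwanted(c) for c in word):
            pass
        else:
            yield word

assert list(words("Lorem ipsum, dolor!")) == ["Lorem", "ipsum", "dolor"]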
