🎨 Reformat comments + strings manually (not auto-fixed by Black)

pull/90/head
Mike Gerber 1 year ago
parent 704e7cca1c
commit e4431797e6

@ -116,8 +116,8 @@ def process(
): ):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep
Click on a wrapper. this undecorated version and use Click on a wrapper.
""" """
gt_text = extract(gt, textequiv_level=textequiv_level) gt_text = extract(gt, textequiv_level=textequiv_level)

@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb
def alto_namespace(tree: ET.ElementTree) -> str: def alto_namespace(tree: ET.ElementTree) -> str:
"""Return the ALTO namespace used in the given ElementTree. """Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not This relies on the assumption that, in any given ALTO file, the root element has the
check if the file uses any valid ALTO namespace. local name "alto". We do not check if the file uses any valid ALTO namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "alto": if root_name.localname == "alto":
@ -48,8 +48,9 @@ def alto_text(tree):
def page_namespace(tree): def page_namespace(tree):
"""Return the PAGE content namespace used in the given ElementTree. """Return the PAGE content namespace used in the given ElementTree.
This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We This relies on the assumption that, in any given PAGE content file, the root element
do not check if the file uses any valid PAGE namespace. has the local name "PcGts". We do not check if the file uses any valid PAGE
namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "PcGts": if root_name.localname == "PcGts":

@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors():
result = list( result = list(
align( align(
"Über die vielen Sorgen wegen desselben vergaß", "Über die vielen Sorgen wegen desselben vergaß",
"SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab", "SomeJunk MoreJunk "
+ "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
) )
) )
left, right = unzip(result) left, right = unzip(result)

@ -36,6 +36,7 @@ def test_character_error_rate_hard():
len(s2) == 7 len(s2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. # Both strings have the same length in terms of grapheme clusters. So the CER should
# be symmetrical.
assert character_error_rate(s2, s1) == 1 / 6 assert character_error_rate(s2, s1) == 1 / 6
assert character_error_rate(s1, s2) == 1 / 6 assert character_error_rate(s1, s2) == 1 / 6

@ -15,7 +15,9 @@ def test_align_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# → 2 elements in the alignment should be different, the ligature is # → 2 elements in the alignment should be different, the ligature is
# (currently) not counted due to normalization. # (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. #
# NOTE: In this example, it doesn't matter that we work with "characters", not
# grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_files(): def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# the ligature does not count → 2 errors # So we have 3 changed words, the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
gt_word_count = ( gt_word_count = (

@ -159,7 +159,8 @@ def test_page_level():
result = page_text(tree, textequiv_level="line") result = page_text(tree, textequiv_level="line")
assert ( assert (
result result
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-" == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+ "Als er einsmals in dem Oberhauſe eine Bill we-"
) )

@ -27,7 +27,8 @@ def test_words():
def test_words_private_use_area(): def test_words_private_use_area():
result = list( result = list(
words( words(
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n" "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
"der Frau Amtmnnin das ver⸗\n"
"ſproene zu berliefern." "ſproene zu berliefern."
) )
) )

@ -52,8 +52,9 @@ def words(s: str):
cat = subcat[0] cat = subcat[0]
return cat in unwanted_categories or subcat in unwanted_subcategories return cat in unwanted_categories or subcat in unwanted_subcategories
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
# only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s): for word in uniseg.wordbreak.words(s):
if all(unwanted(c) for c in word): if all(unwanted(c) for c in word):
pass pass

Loading…
Cancel
Save