Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-30 22:19:57 +02:00
🎨 Reformat comments + strings manually (not auto-fixed by Black)

commit e4431797e6 (parent 704e7cca1c)
9 changed files with 23 additions and 15 deletions

@@ -116,8 +116,8 @@ def process(
 ):
     """Check OCR result against GT.
 
-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions, so we keep
+    this undecorated version and use Click on a wrapper.
     """
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
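
The docstring above describes a common Click pattern: keep an undecorated function that tests can call with a normal signature, and attach the @click decorators to a thin wrapper. A minimal sketch of that pattern; the argument and option names here are illustrative assumptions, not dinglehopper's actual CLI:

    import click


    def process(gt, ocr, textequiv_level="region"):
        """Undecorated worker: callable from tests with a plain signature."""
        ...


    @click.command()
    @click.argument("gt")
    @click.argument("ocr")
    @click.option("--textequiv-level", default="region")
    def main(gt, ocr, textequiv_level):
        """Thin wrapper: only this function's signature is changed by Click."""
        process(gt, ocr, textequiv_level)


    if __name__ == "__main__":
        main()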

@@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
     """Return the ALTO namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
-    check if the files uses any valid ALTO namespace.
+    This relies on the assumption that, in any given ALTO file, the root element has the
+    local name "alto". We do not check if the files uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "alto":
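
A quick, self-contained demonstration of the QName mechanics this docstring relies on (assuming ET is lxml.etree, which the .localname attribute suggests; the namespace URL is one example ALTO namespace):

    from lxml import etree as ET

    root = ET.fromstring(b'<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#"/>')
    q = ET.QName(root.tag)  # root.tag is Clark notation: "{namespace}alto"
    assert q.localname == "alto"
    assert q.namespace == "http://www.loc.gov/standards/alto/ns-v3#"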

@@ -48,8 +48,9 @@ def alto_text(tree):
 def page_namespace(tree):
     """Return the PAGE content namespace used in the given ElementTree.
 
-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
-    do not check if the files uses any valid PAGE namespace.
+    This relies on the assumption that, in any given PAGE content file, the root element
+    has the local name "PcGts". We do not check if the files uses any valid PAGE
+    namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "PcGts":

@@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors():
     result = list(
         align(
             "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
+            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
         )
     )
     left, right = unzip(result)
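
Both splitting styles used in this commit produce identical strings, so the reformat cannot change test behavior: explicit + (as here) and implicit adjacent-literal concatenation (as in other hunks) are equivalent for string literals:

    explicit = (
        "SomeJunk MoreJunk "
        + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"
    )
    implicit = (
        "SomeJunk MoreJunk "
        "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"
    )
    assert explicit == implicit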

@@ -36,6 +36,7 @@ def test_character_error_rate_hard():
         len(s2) == 7
     )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
 
-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER should
+    # be symmetrical.
     assert character_error_rate(s2, s1) == 1 / 6
     assert character_error_rate(s1, s2) == 1 / 6
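
A short illustration of the code point vs. grapheme cluster distinction behind this comment, using uniseg as the surrounding code does (the sample word is illustrative, not the test's actual s2):

    import uniseg.graphemecluster

    s = "Schlym̃"  # ends with LATIN SMALL LETTER M + COMBINING TILDE (U+0303)
    print(len(s))                                                  # 7 code points
    print(len(list(uniseg.graphemecluster.grapheme_clusters(s))))  # 6 grapheme clusters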

@@ -15,7 +15,9 @@ def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
     # → 2 elements in the alignment should be different, the ligature is
     # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
+    # NOTE: In this example, it doesn't matter that we work with "characters", not
+    # grapheme clusters.
 
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
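
One way such a ligature disappears, as the comment's "not counted due to normalization" hints, is Unicode compatibility normalization (illustrative; dinglehopper's own normalization may differ in details):

    import unicodedata

    assert len("ﬁ") == 1                               # U+FB01 LATIN SMALL LIGATURE FI
    assert unicodedata.normalize("NFKC", "ﬁ") == "fi"  # decomposes to two characters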

@@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
-    # the ligature does not count → 2 errors
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # So we have 3 changed words, the ligature does not count → 2 errors
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
 
     gt_word_count = (
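
The arithmetic behind "3 changed words … 2 errors" is word-level edit distance divided by the ground-truth word count. A back-of-the-envelope sketch, not dinglehopper's actual implementation, with made-up sample sentences:

    def levenshtein(a, b):
        # Standard dynamic-programming edit distance over sequences.
        prev = list(range(len(b) + 1))
        for i, x in enumerate(a, 1):
            cur = [i]
            for j, y in enumerate(b, 1):
                cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
            prev = cur
        return prev[-1]

    gt = "dieſe Bill im Oberhauſe zu leſen".split()
    ocr = "dieſe Bill im Oberhanſe zu lefen".split()  # 2 changed words
    print(levenshtein(gt, ocr) / len(gt))  # 2 / 6 ≈ 0.33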

@@ -159,7 +159,8 @@ def test_page_level():
     result = page_text(tree, textequiv_level="line")
     assert (
         result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+        + "Als er einsmals in dem Oberhauſe eine Bill we-"
     )
 
 

@@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
     result = list(
         words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
+            "der Frau Amtmnnin das ver⸗\n"
             "ſproene zu berliefern."
         )
     )

@@ -52,8 +52,9 @@ def words(s: str):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
-    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
+    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
+    # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
+    # only whitespace, punctation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
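
As a runnable distillation of the comment above: uniseg.wordbreak.words() yields every UAX #29 segment, including spaces and punctuation, and the unwanted() predicate filters those out. The category choices below are illustrative assumptions; the real words() keeps its own lists:

    import unicodedata

    import uniseg.wordbreak


    def words(s: str):
        def unwanted(c):
            subcat = unicodedata.category(c)  # e.g. "Ll", "Po", "Zs"
            cat = subcat[0]
            # Illustrative: drop marks, punctuation, separators, symbols, controls.
            return cat in ("M", "P", "Z", "S") or subcat in ("Cc", "Cf")

        for word in uniseg.wordbreak.words(s):
            if all(unwanted(c) for c in word):
                pass  # whitespace/punctuation-only segment: not a word
            else:
                yield word


    print(list(words("Dieſe Bill, im Oberhauſe?")))
    # ['Dieſe', 'Bill', 'im', 'Oberhauſe']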