mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-11-04 11:24:17 +01:00 
			
		
		
		
	🎨 Reformat comments + strings manually (not auto-fixed by Black)
This commit is contained in:
		
							parent
							
								
									704e7cca1c
								
							
						
					
					
						commit
						e4431797e6
					
				
					 9 changed files with 23 additions and 15 deletions
				
			
		| 
						 | 
				
			
			@ -116,8 +116,8 @@ def process(
 | 
			
		|||
):
 | 
			
		||||
    """Check OCR result against GT.
 | 
			
		||||
 | 
			
		||||
    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
 | 
			
		||||
    Click on a wrapper.
 | 
			
		||||
    The @click decorators change the signature of the decorated functions, so we keep
 | 
			
		||||
    this undecorated version and use Click on a wrapper.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    gt_text = extract(gt, textequiv_level=textequiv_level)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,8 +14,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 | 
			
		|||
def alto_namespace(tree: ET.ElementTree) -> str:
 | 
			
		||||
    """Return the ALTO namespace used in the given ElementTree.
 | 
			
		||||
 | 
			
		||||
    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
 | 
			
		||||
    check if the files uses any valid ALTO namespace.
 | 
			
		||||
    This relies on the assumption that, in any given ALTO file, the root element has the
 | 
			
		||||
    local name "alto". We do not check if the file uses any valid ALTO namespace.
 | 
			
		||||
    """
 | 
			
		||||
    root_name = ET.QName(tree.getroot().tag)
 | 
			
		||||
    if root_name.localname == "alto":
 | 
			
		||||
| 
						 | 
				
			
			@ -48,8 +48,9 @@ def alto_text(tree):
 | 
			
		|||
def page_namespace(tree):
 | 
			
		||||
    """Return the PAGE content namespace used in the given ElementTree.
 | 
			
		||||
 | 
			
		||||
    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
 | 
			
		||||
    do not check if the files uses any valid PAGE namespace.
 | 
			
		||||
    This relies on the assumption that, in any given PAGE content file, the root element
 | 
			
		||||
    has the local name "PcGts". We do not check if the file uses any valid PAGE
 | 
			
		||||
    namespace.
 | 
			
		||||
    """
 | 
			
		||||
    root_name = ET.QName(tree.getroot().tag)
 | 
			
		||||
    if root_name.localname == "PcGts":
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -72,7 +72,8 @@ def test_with_some_fake_ocr_errors():
 | 
			
		|||
    result = list(
 | 
			
		||||
        align(
 | 
			
		||||
            "Über die vielen Sorgen wegen desselben vergaß",
 | 
			
		||||
            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
 | 
			
		||||
            "SomeJunk MoreJunk "
 | 
			
		||||
            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
    left, right = unzip(result)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -36,6 +36,7 @@ def test_character_error_rate_hard():
 | 
			
		|||
        len(s2) == 7
 | 
			
		||||
    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
 | 
			
		||||
 | 
			
		||||
    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
 | 
			
		||||
    # Both strings have the same length in terms of grapheme clusters. So the CER should
 | 
			
		||||
    # be symmetrical.
 | 
			
		||||
    assert character_error_rate(s2, s1) == 1 / 6
 | 
			
		||||
    assert character_error_rate(s1, s2) == 1 / 6
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -15,7 +15,9 @@ def test_align_page_files():
 | 
			
		|||
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
 | 
			
		||||
    # → 2 elements in the alignment should be different, the ligature is
 | 
			
		||||
    # (currently) not counted due to normalization.
 | 
			
		||||
    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
 | 
			
		||||
    #
 | 
			
		||||
    # NOTE: In this example, it doesn't matter that we work with "characters", not
 | 
			
		||||
    # grapheme clusters.
 | 
			
		||||
 | 
			
		||||
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
 | 
			
		||||
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -12,8 +12,8 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 | 
			
		|||
 | 
			
		||||
@pytest.mark.integration
 | 
			
		||||
def test_word_error_rate_between_page_files():
 | 
			
		||||
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
 | 
			
		||||
    # the ligature does not count → 2 errors
 | 
			
		||||
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
 | 
			
		||||
    # So we have 3 changed words, the ligature does not count → 2 errors
 | 
			
		||||
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
 | 
			
		||||
 | 
			
		||||
    gt_word_count = (
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -159,7 +159,8 @@ def test_page_level():
 | 
			
		|||
    result = page_text(tree, textequiv_level="line")
 | 
			
		||||
    assert (
 | 
			
		||||
        result
 | 
			
		||||
        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
 | 
			
		||||
        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
 | 
			
		||||
        + "Als er einsmals in dem Oberhauſe eine Bill we-"
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -27,7 +27,8 @@ def test_words():
 | 
			
		|||
def test_words_private_use_area():
 | 
			
		||||
    result = list(
 | 
			
		||||
        words(
 | 
			
		||||
            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
 | 
			
		||||
            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
 | 
			
		||||
            "der Frau Amtmnnin das ver⸗\n"
 | 
			
		||||
            "ſproene zu berliefern."
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -52,8 +52,9 @@ def words(s: str):
 | 
			
		|||
        cat = subcat[0]
 | 
			
		||||
        return cat in unwanted_categories or subcat in unwanted_subcategories
 | 
			
		||||
 | 
			
		||||
    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
 | 
			
		||||
    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
 | 
			
		||||
    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
 | 
			
		||||
    # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
 | 
			
		||||
    # only whitespace, punctuation "or similar characters."
 | 
			
		||||
    for word in uniseg.wordbreak.words(s):
 | 
			
		||||
        if all(unwanted(c) for c in word):
 | 
			
		||||
            pass
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue