🎨 dinglehopper: Reformat using black

2025-07-12 11:59:58 +02:00 · 2020-11-10 12:29:55 +01:00 · 2020-11-10 12:29:55 +01:00 · 14421c8e53
commit 14421c8e53
parent 31c63f9e4c
25 changed files with 774 additions and 466 deletions
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText


 def test_text():
-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'bazinga')
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "bazinga"),
+        ],
+        " ",
+        None,
+    )

-    assert test1.text == 'foo bar bazinga'
-    assert test1.segment_id_for_pos(0) == 's0'
+    assert test1.text == "foo bar bazinga"
+    assert test1.segment_id_for_pos(0) == "s0"
    assert test1.segment_id_for_pos(3) is None
-    assert test1.segment_id_for_pos(10) == 's2'
+    assert test1.segment_id_for_pos(10) == "s2"


 def test_normalization_check():
-    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
-        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
-    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
+    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
+        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
+    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))


-AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")


 def test_align():
@@ -39,25 +44,36 @@ def test_align():
    not Python characters.
    """

-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'batzinga')
-    ], ' ', None)
-    test2 = ExtractedText(None, [
-        ExtractedText('x0', None, None, 'foo'),
-        ExtractedText('x1', None, None, 'bar'),
-        # extra .
-        ExtractedText('x2', None, None, '.'),
-        # deletion + different grapheme cluster, m̃ also is two Python characters
-        ExtractedText('x3', None, None, 'bazim̃ga'),
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "batzinga"),
+        ],
+        " ",
+        None,
+    )
+    test2 = ExtractedText(
+        None,
+        [
+            ExtractedText("x0", None, None, "foo"),
+            ExtractedText("x1", None, None, "bar"),
+            # extra .
+            ExtractedText("x2", None, None, "."),
+            # deletion + different grapheme cluster, m̃ also is two Python characters
+            ExtractedText("x3", None, None, "bazim̃ga"),
+        ],
+        " ",
+        None,
+    )

    left_pos = 0
    right_pos = 0
    alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text),
-                                 grapheme_clusters(test2.text)):
+    for left, right in seq_align(
+        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+    ):
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
@@ -67,46 +83,57 @@ def test_align():
        if right is not None:
            right_pos += len(right)

-    print('test1: {}'.format(test1.text))
-    print('test2: {}'.format(test2.text))
+    print("test1: {}".format(test1.text))
+    print("test2: {}".format(test2.text))

-    assert alignment[0] == ('f', 'f', 's0', 'x0')
-    assert alignment[8] == (None, '.', None, 'x2')
-    assert alignment[12] == ('t', None, 's2', None)
-    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
+    assert alignment[0] == ("f", "f", "s0", "x0")
+    assert alignment[8] == (None, ".", None, "x2")
+    assert alignment[12] == ("t", None, "s2", None)
+    assert alignment[15] == ("n", "m̃", "s2", "x3")


-@pytest.mark.parametrize("attributes,expected_index,expected_log", [
-    ([], None, None),
-    (['index="0"'], 0, None),
-    ([''], 0, None),
-    (['conf="0.5"'], 0, None),
-    (['index="1"', 'index="0"'], 1, None),
-    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
-    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['index="0"', ''], 0, "TextEquiv without index"),
-    (['', 'conf="0.4"'], 1,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['', ''], 0, "No index attributes, use first TextEquiv"),
-])
+@pytest.mark.parametrize(
+    "attributes,expected_index,expected_log",
+    [
+        ([], None, None),
+        (['index="0"'], 0, None),
+        ([""], 0, None),
+        (['conf="0.5"'], 0, None),
+        (['index="1"', 'index="0"'], 1, None),
+        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+        (
+            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
+            2,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (['index="0"', ""], 0, "TextEquiv without index"),
+        (
+            ["", 'conf="0.4"'],
+            1,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (["", ""], 0, "No index attributes, use first TextEquiv"),
+    ],
+)
 def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
    caplog.set_level(logging.INFO)
-    xml = "<?xml version=\"1.0\"?>"
+    xml = '<?xml version="1.0"?>'
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

-    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
-             for i, attr in enumerate(attributes)]
+    equiv = [
+        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+        for i, attr in enumerate(attributes)
+    ]

-    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
-    textline = textline.format(xml, ns, ''.join(equiv))
+    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
+    textline = textline.format(xml, ns, "".join(equiv))

    root = ET.fromstring(textline)
-    result = ExtractedText.from_text_segment(root,
-                                             {'page': ns},
-                                             textequiv_level='line').text
+    result = ExtractedText.from_text_segment(
+        root, {"page": ns}, textequiv_level="line"
+    ).text
    if expected_index is None:
        assert not result
    else: