Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-09 11:50:00 +02:00

🎨 dinglehopper: Reformat using black

parent 31c63f9e4c
commit 14421c8e53

25 changed files with 774 additions and 466 deletions
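Every hunk below is a formatting-only change produced by black: string literals are normalized to double quotes, long statements are re-wrapped, and trailing commas are added, with no change in behavior. As a minimal illustration (this snippet is not part of the commit, only an assumption-free demonstration using literals that appear in the diff), the reformatted literals compare equal to the originals:

    import unicodedata

    # Single- and double-quoted literals denote the same string value,
    # so black's quote normalization cannot change program behavior.
    assert 'NFC' == "NFC"
    assert unicodedata.normalize('NFC', 'Schlyñ') == unicodedata.normalize("NFC", "Schlyñ")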
@@ -1,2 +1 @@
-__import__('pkg_resources').declare_namespace(__name__)
-
+__import__("pkg_resources").declare_namespace(__name__)
@@ -3,8 +3,8 @@ from .edit_distance import *

 def align(t1, t2):
     """Align text."""
-    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
     return seq_align(s1, s2)
@@ -27,13 +27,13 @@ def seq_align(s1, s2):
             pass

         if o:
-            if o[0] == 'insert':
+            if o[0] == "insert":
                 yield None, s2[j]
                 j += 1
-            elif o[0] == 'delete':
+            elif o[0] == "delete":
                 yield s1[i], None
                 i += 1
-            elif o[0] == 'replace':
+            elif o[0] == "replace":
                 yield s1[i], s2[j]
                 i += 1
                 j += 1
@@ -19,19 +19,21 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     """

     d = distance(reference, compared)
-    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
+    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))

     if d == 0:
         return 0, n
     if n == 0:
-        return float('inf'), n
-    return d/n, n
+        return float("inf"), n
+    return d / n, n

     # XXX Should we really count newlines here?


 @multimethod
-def character_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+def character_error_rate_n(
+    reference: ExtractedText, compared: ExtractedText
+) -> Tuple[float, int]:
     return character_error_rate_n(reference.text, compared.text)
@@ -12,16 +12,17 @@ from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config

+
 def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
-    gtx = ''
-    ocrx = ''
+    gtx = ""
+    ocrx = ""

     def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
-            css_classes += ' ellipsis'
-        elif t == '\n':
-            html_t = '<br>'
+            css_classes += " ellipsis"
+        elif t == "\n":
+            html_t = "<br>"
         else:
             html_t = escape(t)
@@ -32,9 +33,13 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
             html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)

         if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
+                css_classes=css_classes,
+                html_t=html_t,
+                html_custom_attrs=html_custom_attrs,
+            )
         else:
-            return '{html_t}'.format(html_t=html_t)
+            return "{html_t}".format(html_t=html_t)

     if isinstance(gt_in, ExtractedText):
         if not isinstance(ocr_in, ExtractedText):
@@ -46,8 +51,6 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         gt_things = gt_in
         ocr_things = ocr_in

-
-
     g_pos = 0
     o_pos = 0
     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
@@ -55,7 +58,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         gt_id = None
         ocr_id = None
         if g != o:
-            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
             if isinstance(gt_in, ExtractedText):
                 gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
                 ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
@@ -70,17 +73,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         if o is not None:
             o_pos += len(o)

-    return \
-        '''
+    return """
         <div class="row">
            <div class="col-md-6 gt">{}</div>
            <div class="col-md-6 ocr">{}</div>
         </div>
-        '''.format(gtx, ocrx)
+        """.format(
+        gtx, ocrx
+    )


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
+def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     """Check OCR result against GT.

     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@@ -93,36 +96,47 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)

-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+    )

     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+    word_diff_report = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+    )

     def json_float(value):
         """Convert a float value to an JSON float.

         This is here so that float('inf') yields "Infinity", not "inf".
         """
-        if value == float('inf'):
-            return 'Infinity'
-        elif value == float('-inf'):
-            return '-Infinity'
+        if value == float("inf"):
+            return "Infinity"
+        elif value == float("-inf"):
+            return "-Infinity"
         else:
             return str(value)

-    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
-    env.filters['json_float'] = json_float
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float

-    for report_suffix in ('.html', '.json'):
-        template_fn = 'report' + report_suffix + '.j2'
+    for report_suffix in (".html", ".json"):
+        template_fn = "report" + report_suffix + ".j2"
         out_fn = report_prefix + report_suffix

         template = env.get_template(template_fn)
         template.stream(
-            gt=gt, ocr=ocr,
-            cer=cer, n_characters=n_characters,
-            wer=wer, n_words=n_words,
+            gt=gt,
+            ocr=ocr,
+            cer=cer,
+            n_characters=n_characters,
+            wer=wer,
+            n_words=n_words,
             char_diff_report=char_diff_report,
             word_diff_report=word_diff_report,
             metrics=metrics,
@@ -130,12 +144,19 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):


 @click.command()
-@click.argument('gt', type=click.Path(exists=True))
-@click.argument('ocr', type=click.Path(exists=True))
-@click.argument('report_prefix', type=click.Path(), default='report')
-@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
-@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
+@click.argument("gt", type=click.Path(exists=True))
+@click.argument("ocr", type=click.Path(exists=True))
+@click.argument("report_prefix", type=click.Path(), default="report")
+@click.option(
+    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+)
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
+@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     """
     Compare the PAGE/ALTO/text document GT against the document OCR.
@@ -159,5 +180,5 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -7,8 +7,13 @@ from .ocr_files import extract


 @click.command()
-@click.argument('input_file', type=click.Path(exists=True))
-@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
+@click.argument("input_file", type=click.Path(exists=True))
+@click.option(
+    "--textequiv-level",
+    default="region",
+    help="PAGE TextEquiv level to extract text from",
+    metavar="LEVEL",
+)
 def main(input_file, textequiv_level):
     """
     Extract the text of the given INPUT_FILE.
@@ -23,5 +28,5 @@ def main(input_file, textequiv_level):
     print(input_text)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -48,9 +48,10 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
     for i in tqdm(from_to(1, m), disable=not Config.progress):
         for j in from_to(1, n):
             D[i, j] = min(
-                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
+                D[i - 1, j - 1]
+                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                 D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1  # Deletion
+                D[i - 1, j] + 1,  # Deletion
             )

     return D
@@ -81,8 +82,8 @@ def distance(s1: str, s2: str):
     Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
     clusters. This should be the correct way to compare two Unicode strings.
     """
-    seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
+    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
     return levenshtein(seq1, seq2)
@@ -106,11 +107,17 @@ def seq_editops(seq1, seq2):

     def _tail_backtrace(i, j, accumulator):
         if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
+            )
         if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
+            )
         if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
+            return partial(
+                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
+            )
         if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
             return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
         return accumulator
@@ -132,6 +139,6 @@ def editops(word1, word2):

     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
+    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
+    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
     return seq_editops(word1, word2)
@@ -10,6 +10,7 @@ import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger

+
 class Normalization(enum.Enum):
     NFC = 1
     NFC_MUFI = 2  # TODO
@@ -18,7 +19,7 @@ class Normalization(enum.Enum):

 def normalize(text, normalization):
     if normalization == Normalization.NFC:
-        return unicodedata.normalize('NFC', text)
+        return unicodedata.normalize("NFC", text)
     if normalization == Normalization.NFC_MUFI:
         raise NotImplementedError()
     if normalization == Normalization.NFC_SBB:
@@ -36,31 +37,31 @@ def unjoin_ligatures(s):
     """Unjoin ligatures, i.e. ﬀ becomes ff."""

     equivalences = {
-        '': 'ſſ',
-        "\ueba7": 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
-        '': 'ch',
-        '': 'ck',
-        '': 'll',
-        '': 'ſi',
-        '': 'ſt',
-        'ﬁ': 'fi',
-        'ﬀ': 'ff',
-        'ﬂ': 'fl',
-        'ﬃ': 'ffi',
-        '': 'ct',
-        '': 'tz',  # MUFI: LATIN SMALL LIGATURE TZ
-        '\uf532': 'as',  # eMOP: Latin small ligature as
-        '\uf533': 'is',  # eMOP: Latin small ligature is
-        '\uf534': 'us',  # eMOP: Latin small ligature us
-        '\uf535': 'Qu',  # eMOP: Latin ligature capital Q small u
-        'ĳ': 'ij',  # U+0133 LATIN SMALL LIGATURE IJ
-        '\uE8BF': 'q&',
+        "": "ſſ",
+        "\ueba7": "ſſi",  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
+        "": "ch",
+        "": "ck",
+        "": "ll",
+        "": "ſi",
+        "": "ſt",
+        "ﬁ": "fi",
+        "ﬀ": "ff",
+        "ﬂ": "fl",
+        "ﬃ": "ffi",
+        "": "ct",
+        "": "tz",  # MUFI: LATIN SMALL LIGATURE TZ
+        "\uf532": "as",  # eMOP: Latin small ligature as
+        "\uf533": "is",  # eMOP: Latin small ligature is
+        "\uf534": "us",  # eMOP: Latin small ligature us
+        "\uf535": "Qu",  # eMOP: Latin ligature capital Q small u
+        "ĳ": "ij",  # U+0133 LATIN SMALL LIGATURE IJ
+        "\uE8BF": "q&",
         # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
         # XXX How to replace this correctly?
-        '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
-        'ﬆ': 'st',  # U+FB06 LATIN SMALL LIGATURE ST
+        "\uEBA5": "ſp",  # MUFI: LATIN SMALL LIGATURE LONG S P
+        "ﬆ": "st",  # U+FB06 LATIN SMALL LIGATURE ST
     }
-    s = unicodedata.normalize('NFC', s)
+    s = unicodedata.normalize("NFC", s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
     return s
@@ -70,20 +71,20 @@ def substitute_equivalences(s):
     # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
     # It might make sense to use different rules for GT and for the different OCR
     equivalences = {
-        '': 'ü',
-        '': 'ä',
-        '==': '–',  # → en-dash
-        '—': '–',  # em-dash → en-dash
-        '': 'ö',
-        '’': '\'',
-        '⸗': '-',
-        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
-        '\uF50E': 'q́'  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+        "": "ü",
+        "": "ä",
+        "==": "–",  # → en-dash
+        "—": "–",  # em-dash → en-dash
+        "": "ö",
+        "’": "'",
+        "⸗": "-",
+        "aͤ": "ä",  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        "oͤ": "ö",  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        "uͤ": "ü",  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        "\uF50E": "q́",  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
     }

-    s = unicodedata.normalize('NFC', s)
+    s = unicodedata.normalize("NFC", s)
     s = unjoin_ligatures(s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
@@ -115,13 +116,14 @@ class ExtractedText:
     Objects of this class are guaranteed to be a. always in their normalization
     and b. in NFC.
     """
+
     segment_id = attr.ib(type=Optional[str])

     @segment_id.validator
     def check(self, _, value):
         if value is None:
             return
-        if not re.match(r'[\w\d_-]+', value):
+        if not re.match(r"[\w\d_-]+", value):
             raise ValueError('Malformed segment id "{}"'.format(value))

     # An object contains either
@@ -141,7 +143,7 @@ class ExtractedText:
     def check(self, _, value):
         if value is not None and self.segments is not None:
             raise ValueError("Can't have both segments and text")
-        if value is not None and unicodedata.normalize('NFC', value) != value:
+        if value is not None and unicodedata.normalize("NFC", value) != value:
             raise ValueError('String "{}" is not in NFC.'.format(value))
         if value is not None and normalize(value, self.normalization) != value:
             raise ValueError('String "{}" is not normalized.'.format(value))
@@ -169,31 +171,24 @@ class ExtractedText:
                 seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
                 segment_id_for_pos.extend(seg_ids)
                 segment_id_for_pos.extend(repeat(None, len(self.joiner)))
-            segment_id_for_pos = segment_id_for_pos[:-len(self.joiner)]
+            segment_id_for_pos = segment_id_for_pos[: -len(self.joiner)]

             # This is frozen, so we have to jump through the hoop:
-            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+            object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)
             assert self._segment_id_for_pos

         return self._segment_id_for_pos[pos]

     @classmethod
-    def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
+    def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
         """Build an ExtractedText from a PAGE content text element"""

-        localname_for_textequiv_level = {
-            'region': 'TextRegion',
-            'line': 'TextLine'
-        }
+        localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
         textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
-        children_for_localname = {
-            'TextRegion': 'TextLine'
-        }
-        joiner_for_textequiv_level = {
-            'line': '\n'
-        }
+        children_for_localname = {"TextRegion": "TextLine"}
+        joiner_for_textequiv_level = {"line": "\n"}

-        segment_id = text_segment.attrib['id']
+        segment_id = text_segment.attrib["id"]
         localname = ET.QName(text_segment).localname
         if localname == localname_for_textequiv_level[textequiv_level]:
             segment_text = None
@@ -201,19 +196,20 @@ class ExtractedText:
             segment_text = get_textequiv_unicode(text_segment, nsmap)
             # FIXME hardcoded SBB normalization
             segment_text = normalize_sbb(segment_text)
-            segment_text = segment_text or ''
+            segment_text = segment_text or ""
             return cls(segment_id, None, None, segment_text)
         else:
             # Recurse
             sub_localname = children_for_localname[localname]
             sub_textequiv_level = textequiv_level_for_localname[sub_localname]
             segments = []
-            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
-                                                     namespaces=nsmap):
+            for sub_segment in text_segment.iterfind(
+                "./page:%s" % sub_localname, namespaces=nsmap
+            ):
                 segments.append(
                     ExtractedText.from_text_segment(
-                        sub_segment, nsmap,
-                        textequiv_level=sub_textequiv_level)
+                        sub_segment, nsmap, textequiv_level=sub_textequiv_level
+                    )
                 )
             joiner = joiner_for_textequiv_level[sub_textequiv_level]
             return cls(segment_id, segments, joiner, None)
@@ -231,24 +227,24 @@ def invert_dict(d):

 def get_textequiv_unicode(text_segment, nsmap) -> str:
     """Get the TextEquiv/Unicode text of the given PAGE text element."""
-    segment_id = text_segment.attrib['id']
-    textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
+    segment_id = text_segment.attrib["id"]
+    textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)

     if not textequivs:
-        return ''
+        return ""

     textequiv = get_first_textequiv(textequivs, segment_id)
-    return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
+    return textequiv.find("./page:Unicode", namespaces=nsmap).text or ""


 def get_first_textequiv(textequivs, segment_id):
     """Get the first TextEquiv based on index or conf order if index is not present."""
-    log = getLogger('processor.OcrdDinglehopperEvaluate')
+    log = getLogger("processor.OcrdDinglehopperEvaluate")
     if len(textequivs) == 1:
         return textequivs[0]

     # try ordering by index
-    indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
+    indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
     nan_mask = np.isnan(indices)
     if np.any(~nan_mask):
         if np.any(nan_mask):
@@ -256,10 +252,12 @@ def get_first_textequiv(textequivs, segment_id):
         index = np.nanargmin(indices)
     else:
         # try ordering by conf
-        confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
+        confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
         if np.any(~np.isnan(confidences)):
-            log.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
-                     segment_id)
+            log.info(
+                "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
+                segment_id,
+            )
             index = np.nanargmax(confidences)
         else:
             # fallback to first entry in case of neither index or conf present
@@ -17,24 +17,27 @@ def alto_namespace(tree: ET.ElementTree) -> str:
     check if the files uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
-    if root_name.localname == 'alto':
+    if root_name.localname == "alto":
         return root_name.namespace
     else:
-        raise ValueError('Not an ALTO tree')
+        raise ValueError("Not an ALTO tree")


 def alto_extract_lines(tree: ET.ElementTree) -> Generator[ExtractedText, None, None]:
-    nsmap = {'alto': alto_namespace(tree)}
-    for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap):
-        line_id = line.attrib.get('ID')
-        line_text = ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
+    nsmap = {"alto": alto_namespace(tree)}
+    for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
+        line_id = line.attrib.get("ID")
+        line_text = " ".join(
+            string.attrib.get("CONTENT")
+            for string in line.iterfind("alto:String", namespaces=nsmap)
+        )
         yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
         # FIXME hardcoded SBB normalization


 def alto_extract(tree: ET.ElementTree()) -> ExtractedText:
     """Extract text from the given ALTO ElementTree."""
-    return ExtractedText(None, list(alto_extract_lines(tree)), '\n', None)
+    return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)


 def alto_text(tree):
@@ -48,56 +51,73 @@ def page_namespace(tree):
     do not check if the files uses any valid PAGE namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
-    if root_name.localname == 'PcGts':
+    if root_name.localname == "PcGts":
         return root_name.namespace
     else:
-        raise ValueError('Not a PAGE tree')
+        raise ValueError("Not a PAGE tree")


-def page_extract(tree, *, textequiv_level='region'):
+def page_extract(tree, *, textequiv_level="region"):
     """Extract text from the given PAGE content ElementTree."""

     # Internally, this is just parsing the Reading Order (if it exists) and
     # and leaves reading the TextRegions to ExtractedText.from_text_segment().

-    nsmap = {'page': page_namespace(tree)}
+    nsmap = {"page": page_namespace(tree)}

     regions = []
-    reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
+    reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
     if reading_order is not None:
-        for group in reading_order.iterfind('./*', namespaces=nsmap):
-            if ET.QName(group.tag).localname == 'OrderedGroup':
-                region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
-                for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
-                    region_id = region_ref_indexed.attrib['regionRef']
-                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
+        for group in reading_order.iterfind("./*", namespaces=nsmap):
+            if ET.QName(group.tag).localname == "OrderedGroup":
+                region_ref_indexeds = group.findall(
+                    "./page:RegionRefIndexed", namespaces=nsmap
+                )
+                for region_ref_indexed in sorted(
+                    region_ref_indexeds, key=lambda r: int(r.attrib["index"])
+                ):
+                    region_id = region_ref_indexed.attrib["regionRef"]
+                    region = tree.find(
+                        './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
+                    )
                     if region is not None:
-                        regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+                        regions.append(
+                            ExtractedText.from_text_segment(
+                                region, nsmap, textequiv_level=textequiv_level
+                            )
+                        )
                     else:
                         pass  # Not a TextRegion
             else:
                 raise NotImplementedError
     else:
-        for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
-            regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
+        for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap):
+            regions.append(
+                ExtractedText.from_text_segment(
+                    region, nsmap, textequiv_level=textequiv_level
+                )
+            )

     # Filter empty region texts
-    regions = [r for r in regions if r.text != '']
+    regions = [r for r in regions if r.text != ""]

-    return ExtractedText(None, regions, '\n', None)
+    return ExtractedText(None, regions, "\n", None)


-def page_text(tree, *, textequiv_level='region'):
+def page_text(tree, *, textequiv_level="region"):
     return page_extract(tree, textequiv_level=textequiv_level).text


 def plain_extract(filename):
-    with open(filename, 'r') as f:
+    with open(filename, "r") as f:
         return ExtractedText(
-            None,
-            [ExtractedText('line %d' % no, None, None, line) for no, line in enumerate(f.readlines())],
-            '\n',
-            None
+            None,
+            [
+                ExtractedText("line %d" % no, None, None, line)
+                for no, line in enumerate(f.readlines())
+            ],
+            "\n",
+            None,
         )
@@ -105,7 +125,7 @@ def plain_text(filename):
     return plain_extract(filename).text


-def extract(filename, *, textequiv_level='region'):
+def extract(filename, *, textequiv_level="region"):
     """Extract the text from the given file.

     Supports PAGE, ALTO and falls back to plain text.
@@ -124,5 +144,5 @@ def text(filename):
     return extract(filename).text


-if __name__ == '__main__':
+if __name__ == "__main__":
     print(text(sys.argv[1]))
@@ -10,7 +10,7 @@ from pkg_resources import resource_string
 from .cli import process as cli_process
 from .edit_distance import levenshtein_matrix_cache_clear

-OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))


 @click.command()
@@ -20,20 +20,19 @@ def ocrd_dinglehopper(*args, **kwargs):


 class OcrdDinglehopperEvaluate(Processor):
-
     def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
+        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

     def process(self):
-        assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
+        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
         assert_file_grp_cardinality(self.output_file_grp, 1)

-        log = getLogger('processor.OcrdDinglehopperEvaluate')
+        log = getLogger("processor.OcrdDinglehopperEvaluate")

-        metrics = self.parameter['metrics']
-        textequiv_level = self.parameter['textequiv_level']
-        gt_grp, ocr_grp = self.input_file_grp.split(',')
+        metrics = self.parameter["metrics"]
+        textequiv_level = self.parameter["textequiv_level"]
+        gt_grp, ocr_grp = self.input_file_grp.split(",")

         input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
         for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
@@ -55,40 +54,47 @@ class OcrdDinglehopperEvaluate(Processor):
             except FileExistsError:
                 pass
             cli_process(
-                gt_file.local_filename,
-                ocr_file.local_filename,
-                report_prefix,
-                metrics=metrics,
-                textequiv_level=textequiv_level
+                gt_file.local_filename,
+                ocr_file.local_filename,
+                report_prefix,
+                metrics=metrics,
+                textequiv_level=textequiv_level,
             )

             # Add reports to the workspace
-            for report_suffix, mimetype in \
-                    [
-                        ['.html', 'text/html'],
-                        ['.json', 'application/json']
-                    ]:
+            for report_suffix, mimetype in [
+                [".html", "text/html"],
+                [".json", "application/json"],
+            ]:
                 self.workspace.add_file(
-                    ID=file_id + report_suffix,
-                    file_grp=self.output_file_grp,
-                    pageId=page_id,
-                    mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix)
+                    ID=file_id + report_suffix,
+                    file_grp=self.output_file_grp,
+                    pageId=page_id,
+                    mimetype=mimetype,
+                    local_filename=report_prefix + report_suffix,
+                )

             # Clear cache between files
             levenshtein_matrix_cache_clear()

     def _zip_input_files(self, input_file_grps):
-        log = getLogger('processor.OcrdDinglehopperEvaluate')
+        log = getLogger("processor.OcrdDinglehopperEvaluate")
         input_file_tuples = list()
-        for page_id in ([self.page_id] if self.page_id else
-                        self.workspace.mets.physical_pages):
+        for page_id in (
+            [self.page_id] if self.page_id else self.workspace.mets.physical_pages
+        ):
             ifiles = list()
             for input_file_grp in input_file_grps:
-                log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
-                files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
+                log.debug(
+                    "Adding input file group %s to page %s", input_file_grp, page_id
+                )
+                files = self.workspace.mets.find_all_files(
+                    pageId=page_id, fileGrp=input_file_grp
+                )
                 if not files:
-                    log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
+                    log.error(
+                        'Found no page "%s" in file group %s', page_id, input_file_grp
+                    )
                     ifiles.append(None)
                 else:
                     ifiles.append(files[0])
@@ -97,5 +103,5 @@ class OcrdDinglehopperEvaluate(Processor):
         return input_file_tuples


-if __name__ == '__main__':
+if __name__ == "__main__":
     ocrd_dinglehopper()
@@ -10,25 +10,30 @@ from .. import seq_align, ExtractedText


 def test_text():
-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'bazinga')
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "bazinga"),
+        ],
+        " ",
+        None,
+    )

-    assert test1.text == 'foo bar bazinga'
-    assert test1.segment_id_for_pos(0) == 's0'
+    assert test1.text == "foo bar bazinga"
+    assert test1.segment_id_for_pos(0) == "s0"
     assert test1.segment_id_for_pos(3) is None
-    assert test1.segment_id_for_pos(10) == 's2'
+    assert test1.segment_id_for_pos(10) == "s2"


 def test_normalization_check():
-    with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
-        ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
-    assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
+    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
+        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
+    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))


-AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")


 def test_align():
@@ -39,25 +44,36 @@ def test_align():
     not Python characters.
     """

-    test1 = ExtractedText(None, [
-        ExtractedText('s0', None, None, 'foo'),
-        ExtractedText('s1', None, None, 'bar'),
-        ExtractedText('s2', None, None, 'batzinga')
-    ], ' ', None)
-    test2 = ExtractedText(None, [
-        ExtractedText('x0', None, None, 'foo'),
-        ExtractedText('x1', None, None, 'bar'),
-        # extra .
-        ExtractedText('x2', None, None, '.'),
-        # deletion + different grapheme cluster, m̃ also is two Python characters
-        ExtractedText('x3', None, None, 'bazim̃ga'),
-    ], ' ', None)
+    test1 = ExtractedText(
+        None,
+        [
+            ExtractedText("s0", None, None, "foo"),
+            ExtractedText("s1", None, None, "bar"),
+            ExtractedText("s2", None, None, "batzinga"),
+        ],
+        " ",
+        None,
+    )
+    test2 = ExtractedText(
+        None,
+        [
+            ExtractedText("x0", None, None, "foo"),
+            ExtractedText("x1", None, None, "bar"),
+            # extra .
+            ExtractedText("x2", None, None, "."),
+            # deletion + different grapheme cluster, m̃ also is two Python characters
+            ExtractedText("x3", None, None, "bazim̃ga"),
+        ],
+        " ",
+        None,
+    )

     left_pos = 0
     right_pos = 0
     alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text),
-                                 grapheme_clusters(test2.text)):
+    for left, right in seq_align(
+        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
+    ):
         left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
         right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
         el = AlignmentElement(left, right, left_id, right_id)
@@ -67,46 +83,57 @@ def test_align():
         if right is not None:
             right_pos += len(right)

-    print('test1: {}'.format(test1.text))
-    print('test2: {}'.format(test2.text))
+    print("test1: {}".format(test1.text))
+    print("test2: {}".format(test2.text))

-    assert alignment[0] == ('f', 'f', 's0', 'x0')
-    assert alignment[8] == (None, '.', None, 'x2')
-    assert alignment[12] == ('t', None, 's2', None)
-    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
+    assert alignment[0] == ("f", "f", "s0", "x0")
+    assert alignment[8] == (None, ".", None, "x2")
+    assert alignment[12] == ("t", None, "s2", None)
+    assert alignment[15] == ("n", "m̃", "s2", "x3")


-@pytest.mark.parametrize("attributes,expected_index,expected_log", [
-    ([], None, None),
-    (['index="0"'], 0, None),
-    ([''], 0, None),
-    (['conf="0.5"'], 0, None),
-    (['index="1"', 'index="0"'], 1, None),
-    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
-    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['index="0"', ''], 0, "TextEquiv without index"),
-    (['', 'conf="0.4"'], 1,
-     "No index attributes, use 'conf' attribute to sort TextEquiv"),
-    (['', ''], 0, "No index attributes, use first TextEquiv"),
-])
+@pytest.mark.parametrize(
+    "attributes,expected_index,expected_log",
+    [
+        ([], None, None),
+        (['index="0"'], 0, None),
+        ([""], 0, None),
+        (['conf="0.5"'], 0, None),
+        (['index="1"', 'index="0"'], 1, None),
+        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+        (
+            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
+            2,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (['index="0"', ""], 0, "TextEquiv without index"),
+        (
+            ["", 'conf="0.4"'],
+            1,
+            "No index attributes, use 'conf' attribute to sort TextEquiv",
+        ),
+        (["", ""], 0, "No index attributes, use first TextEquiv"),
+    ],
+)
 def test_textequiv(attributes, expected_index, expected_log, caplog):
     """Test that extracting text from a PAGE TextEquiv is working without index attr."""
     caplog.set_level(logging.INFO)
-    xml = "<?xml version=\"1.0\"?>"
+    xml = '<?xml version="1.0"?>'
     ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
     text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

-    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
-             for i, attr in enumerate(attributes)]
+    equiv = [
+        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+        for i, attr in enumerate(attributes)
+    ]

-    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
-    textline = textline.format(xml, ns, ''.join(equiv))
+    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
+    textline = textline.format(xml, ns, "".join(equiv))

     root = ET.fromstring(textline)
-    result = ExtractedText.from_text_segment(root,
-                                             {'page': ns},
-                                             textequiv_level='line').text
+    result = ExtractedText.from_text_segment(
+        root, {"page": ns}, textequiv_level="line"
+    ).text
     if expected_index is None:
         assert not result
     else:
@@ -3,64 +3,85 @@ from .. import align, seq_align, distance


 def test_left_empty():
-    result = list(align('', 'foo'))
-    expected = [(None, 'f'), (None, 'o'), (None, 'o')]
+    result = list(align("", "foo"))
+    expected = [(None, "f"), (None, "o"), (None, "o")]
     assert result == expected


 def test_right_empty():
-    result = list(align('foo', ''))
-    expected = [('f', None), ('o', None), ('o', None)]
+    result = list(align("foo", ""))
+    expected = [("f", None), ("o", None), ("o", None)]
     assert result == expected


 def test_left_longer():
-    result = list(align('food', 'foo'))
-    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
+    result = list(align("food", "foo"))
+    expected = [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
    assert result == expected


 def test_right_longer():
-    result = list(align('foo', 'food'))
-    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
+    result = list(align("foo", "food"))
+    expected = [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
     assert result == expected


 def test_some_diff():
-    result = list(align('abcde', 'aaadef'))
+    result = list(align("abcde", "aaadef"))
     left, right = unzip(result)
-    assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
-    assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
+    assert list(left) == ["a", "b", "c", "d", "e", None]
+    assert list(right) == ["a", "a", "a", "d", "e", "f"]


 def test_longer():
-    s1 = 'Dies ist eine Tst!'
-    s2 = 'Dies ist ein Test.'
+    s1 = "Dies ist eine Tst!"
+    s2 = "Dies ist ein Test."

     result = list(align(s1, s2))  # ; diffprint(*unzip(result))
-    expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
-                ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
-                ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
-                ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
+    expected = [
+        ("D", "D"),
+        ("i", "i"),
+        ("e", "e"),
+        ("s", "s"),
+        (" ", " "),
+        ("i", "i"),
+        ("s", "s"),
+        ("t", "t"),
+        (" ", " "),
+        ("e", "e"),
+        ("i", "i"),
+        ("n", "n"),
+        ("e", None),
+        (" ", " "),
+        ("T", "T"),
+        (None, "e"),
+        ("s", "s"),
+        ("t", "t"),
+        ("!", "."),
+    ]
     assert result == expected


 def test_completely_different():
-    assert len(list(align('abcde', 'fghij'))) == 5
+    assert len(list(align("abcde", "fghij"))) == 5


 def test_with_some_fake_ocr_errors():
-    result = list(align('Über die vielen Sorgen wegen desselben vergaß',
-                        'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
+    result = list(
+        align(
+            "Über die vielen Sorgen wegen desselben vergaß",
+            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+        )
+    )
     left, right = unzip(result)

     # Beginning
-    assert list(left[:18]) == [None]*18
-    assert list(right[:18]) == list('SomeJunk MoreJunk ')
+    assert list(left[:18]) == [None] * 18
+    assert list(right[:18]) == list("SomeJunk MoreJunk ")

     # End
-    assert list(left[-1:]) == ['ß']
-    assert list(right[-1:]) == ['b']
+    assert list(left[-1:]) == ["ß"]
+    assert list(right[-1:]) == ["b"]


 def test_lines():
@@ -68,13 +89,30 @@ def test_lines():

     This mainly serves as documentation for comparing lists of lines.
     """
-    result = list(seq_align(
-        ['This is a line.', 'This is another', 'And the last line'],
-        ['This is a line.', 'This is another', 'J u n k', 'And the last line']
-    ))
+    result = list(
+        seq_align(
+            ["This is a line.", "This is another", "And the last line"],
+            [
+                "This is a line.",
+                "This is another",
+                "J u n k",
+                "And the last line",
+            ],
+        )
+    )
     left, right = unzip(result)
-    assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
-    assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
+    assert list(left) == [
+        "This is a line.",
+        "This is another",
+        None,
+        "And the last line",
+    ]
+    assert list(right) == [
+        "This is a line.",
+        "This is another",
+        "J u n k",
+        "And the last line",
+    ]


 def test_lines_similar():
@@ -92,7 +130,7 @@ def test_lines_similar():
             # Just an example!
             min_len = min(len(self._string), len(other._string))
             if min_len > 0:
-                normalized_distance = distance(self._string, other._string)/min_len
+                normalized_distance = distance(self._string, other._string) / min_len
                 similar = normalized_distance < 0.1
             else:
                 similar = False
@@ -102,18 +140,39 @@ def test_lines_similar():
             return not self.__eq__(other)

         def __repr__(self):
-            return 'SimilarString(\'%s\')' % self._string
+            return "SimilarString('%s')" % self._string

         def __hash__(self):
             return hash(self._string)

-    result = list(seq_align(
-        [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
-        [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
-    ))
+    result = list(
+        seq_align(
+            [
+                SimilarString("This is a line."),
+                SimilarString("This is another"),
+                SimilarString("And the last line"),
+            ],
+            [
+                SimilarString("This is a ljne."),
+                SimilarString("This is another"),
+                SimilarString("J u n k"),
+                SimilarString("And the last line"),
+            ],
+        )
+    )
     left, right = unzip(result)
-    assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
-    assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+    assert list(left) == [
+        SimilarString("This is a line."),
+        SimilarString("This is another"),
+        None,
+        SimilarString("And the last line"),
+    ]
+    assert list(right) == [
+        SimilarString("This is a ljne."),
+        SimilarString("This is another"),
+        SimilarString("J u n k"),
+        SimilarString("And the last line"),
+    ]

     # Test __eq__ (i.e. is it a substitution or a similar string?)
     assert list(left)[0] == list(right)[0]
@@ -7,31 +7,35 @@ from .. import character_error_rate


 def test_character_error_rate():
-    assert character_error_rate('a', 'a') == 0
-    assert character_error_rate('a', 'b') == 1/1
-    assert character_error_rate('Foo', 'Bar') == 3/3
+    assert character_error_rate("a", "a") == 0
+    assert character_error_rate("a", "b") == 1 / 1
+    assert character_error_rate("Foo", "Bar") == 3 / 3

-    assert character_error_rate('Foo', '') == 3/3
+    assert character_error_rate("Foo", "") == 3 / 3

-    assert character_error_rate('', '') == 0
-    assert math.isinf(character_error_rate('', 'Foo'))
+    assert character_error_rate("", "") == 0
+    assert math.isinf(character_error_rate("", "Foo"))

-    assert character_error_rate('Foo', 'Food') == 1/3
-    assert character_error_rate('Fnord', 'Food') == 2/5
-    assert character_error_rate('Müll', 'Mull') == 1/4
-    assert character_error_rate('Abstand', 'Sand') == 4/7
+    assert character_error_rate("Foo", "Food") == 1 / 3
+    assert character_error_rate("Fnord", "Food") == 2 / 5
+    assert character_error_rate("Müll", "Mull") == 1 / 4
+    assert character_error_rate("Abstand", "Sand") == 4 / 7


 def test_character_error_rate_hard():
-    s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
-    s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!')  # Different, decomposed!
-    assert character_error_rate(s1, s2) == 1/19
+    s1 = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
+    s2 = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!")  # Different, decomposed!
+    assert character_error_rate(s1, s2) == 1 / 19

-    s1 = 'Schlyñ'
-    assert len(s1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    s2 = 'Schlym̃'
-    assert len(s2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    s1 = "Schlyñ"
+    assert (
+        len(s1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    s2 = "Schlym̃"
+    assert (
+        len(s2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points

     # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
-    assert character_error_rate(s2, s1) == 1/6
-    assert character_error_rate(s1, s2) == 1/6
+    assert character_error_rate(s2, s1) == 1 / 6
+    assert character_error_rate(s1, s2) == 1 / 6
@@ -6,35 +6,39 @@ from .. import levenshtein, distance


 def test_levenshtein():
-    assert levenshtein('a', 'a') == 0
-    assert levenshtein('a', 'b') == 1
-    assert levenshtein('Foo', 'Bar') == 3
+    assert levenshtein("a", "a") == 0
+    assert levenshtein("a", "b") == 1
+    assert levenshtein("Foo", "Bar") == 3

-    assert levenshtein('', '') == 0
-    assert levenshtein('Foo', '') == 3
-    assert levenshtein('', 'Foo') == 3
+    assert levenshtein("", "") == 0
+    assert levenshtein("Foo", "") == 3
+    assert levenshtein("", "Foo") == 3

-    assert levenshtein('Foo', 'Food') == 1
-    assert levenshtein('Fnord', 'Food') == 2
-    assert levenshtein('Müll', 'Mull') == 1
-    assert levenshtein('Abstand', 'Sand') == 4
+    assert levenshtein("Foo", "Food") == 1
+    assert levenshtein("Fnord", "Food") == 2
+    assert levenshtein("Müll", "Mull") == 1
+    assert levenshtein("Abstand", "Sand") == 4


 def test_levenshtein_other_sequences():
-    assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
-    assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
+    assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
+    assert levenshtein(["a", "ab"], ["a", "c"]) == 1


 def test_distance():
-    assert distance('Fnord', 'Food') == 2
-    assert distance('Müll', 'Mull') == 1
+    assert distance("Fnord", "Food") == 2
+    assert distance("Müll", "Mull") == 1

-    word1 = unicodedata.normalize('NFC', 'Schlyñ')
-    word2 = unicodedata.normalize('NFD', 'Schlyñ')  # Different, decomposed!
+    word1 = unicodedata.normalize("NFC", "Schlyñ")
+    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
     assert distance(word1, word2) == 0

-    word1 = 'Schlyñ'
-    assert len(word1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = 'Schlym̃'
-    assert len(word2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+    word1 = "Schlyñ"
+    assert (
+        len(word1) == 6
+    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+    word2 = "Schlym̃"
+    assert (
+        len(word2) == 7
+    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
     assert distance(word1, word2) == 1
@@ -4,45 +4,60 @@ from .. import seq_editops, editops


 def test_trivial():
-    assert seq_editops('abc', 'abc') == []
-    assert seq_editops('', '') == []
+    assert seq_editops("abc", "abc") == []
+    assert seq_editops("", "") == []


 def test_insert():
-    assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
-    assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
-    assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
-    assert seq_editops('', 'a') == [('insert', 0, 0)]
+    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
+    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
+    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
+    assert seq_editops("", "a") == [("insert", 0, 0)]


 def test_multiple():
-    assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
+    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]


 def test_delete():
-    assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
-    assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
-    assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
-    assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
-    assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
-    assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
+    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+    assert seq_editops("Foo", "") == [
+        ("delete", 0, 0),
+        ("delete", 1, 0),
+        ("delete", 2, 0),
+    ]
+    assert seq_editops("Foolish", "Foo") == [
+        ("delete", 3, 3),
+        ("delete", 4, 3),
+        ("delete", 5, 3),
+        ("delete", 6, 3),
+    ]


 def test_ambiguous():
-    assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
+    assert seq_editops("bcd", "abcef") == [
+        ("insert", 0, 0),
+        ("replace", 2, 3),
+        ("insert", 3, 4),
+    ]


 def test_editops():
     """Test editops() in cases where dealing with grapheme clusters matters"""

     # In these cases, one of the words has a composed form, the other one does not.
-    assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
-    assert editops('oͤde', 'öde') == [('replace', 0, 0)]
+    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
+    assert editops("oͤde", "öde") == [("replace", 0, 0)]


 def test_editops_canonically_equivalent():
-    left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
-    right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
+    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
+        "COMBINING TILDE"
+    )
+    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
     assert left != right
-    assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
+    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
     assert editops(left, right) == []
@@ -7,7 +7,7 @@ from lxml import etree as ET

 from .. import align, page_text

-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


 @pytest.mark.integration
@@ -17,8 +17,8 @@ def test_align_page_files():
     # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.

-    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
-    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

     result = list(align(gt, ocr))
     for left, right in result:
@@ -8,26 +8,34 @@ from uniseg.graphemecluster import grapheme_clusters

 from .. import character_error_rate, page_text, alto_text

-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
     # The fi ligature does not count.
-    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
-    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
+    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
+    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

     gt_len = len(list(grapheme_clusters(gt)))
-    expected_cer = 2/gt_len
+    expected_cer = 2 / gt_len

     assert character_error_rate(gt, ocr) == expected_cer


 @pytest.mark.integration
 def test_character_error_rate_between_page_alto():
-    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
-    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
+    gt = page_text(
+        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
+    )
+    ocr = alto_text(
+        ET.parse(
+            os.path.join(
+                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
+            )
+        )
+    )

     assert gt == ocr
     assert character_error_rate(gt, ocr) == 0
@@ -35,7 +43,17 @@ def test_character_error_rate_between_page_alto():

@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert character_error_rate(gt, ocr) == 8/591 # Manually verified
    assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
@@ -10,31 +10,31 @@ def test_cli_json(tmp_path):
    """Test that the cli/process() yields a loadable JSON report"""

    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('AAAAA')
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('AAAAB')
        with open("gt.txt", "w") as gtf:
            gtf.write("AAAAA")
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("AAAAB")

        with open('gt.txt', 'r') as gtf:
        with open("gt.txt", "r") as gtf:
            print(gtf.read())
        process('gt.txt', 'ocr.txt', 'report')
        with open('report.json', 'r') as jsonf:
        process("gt.txt", "ocr.txt", "report")
        with open("report.json", "r") as jsonf:
            print(jsonf.read())
        with open('report.json', 'r') as jsonf:
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j['cer'] == pytest.approx(0.2)
            assert j["cer"] == pytest.approx(0.2)


def test_cli_json_cer_is_infinity(tmp_path):
    """Test that the cli/process() yields a loadable JSON report when CER == inf"""

    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('') # Empty to yield CER == inf
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('Not important')
        with open("gt.txt", "w") as gtf:
            gtf.write("") # Empty to yield CER == inf
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("Not important")

        process('gt.txt', 'ocr.txt', 'report')
        with open('report.json', 'r') as jsonf:
        process("gt.txt", "ocr.txt", "report")
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j['cer'] == pytest.approx(float('inf'))
            assert j["cer"] == pytest.approx(float("inf"))
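The 0.2 checked above follows from the definition these tests exercise: the character error rate is the edit distance divided by the number of grapheme clusters in the ground truth. A rough, self-contained sketch (naive Levenshtein, not dinglehopper's implementation):

import unicodedata
from uniseg.graphemecluster import grapheme_clusters

def levenshtein(a, b):
    # Plain dynamic-programming edit distance between two sequences.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]

def cer_sketch(reference, compared):
    # CER = edits / grapheme clusters in the reference; inf for a non-matching empty reference.
    ref = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
    cmp_ = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
    d = levenshtein(ref, cmp_)
    if d == 0:
        return 0.0
    return float("inf") if not ref else d / len(ref)

assert cer_sketch("AAAAA", "AAAAB") == 0.2              # the report value checked above
assert cer_sketch("", "Not important") == float("inf")  # the CER == inf case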
@@ -7,7 +7,7 @@ from lxml import etree as ET

from .. import distance, page_text, alto_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
@@ -15,15 +15,23 @@ def test_distance_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # Due to normalization, we don't count the ligature.
    # → 2 differences
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
    assert distance(gt, ocr) == 2


@pytest.mark.integration
def test_distance_between_page_alto():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
            )
        )
    )

    assert gt == ocr
    assert distance(gt, ocr) == 0
@@ -31,7 +39,17 @@ def test_distance_between_page_alto():

@pytest.mark.integration
def test_distance_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert distance(gt, ocr) == 8 # Manually verified
@@ -10,27 +10,32 @@ from .util import working_directory

from ..ocrd_cli import ocrd_dinglehopper

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""

    # Copy test workspace
    test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
    test_workspace_dir = tmp_path / 'test_ocrd_cli'
    test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
    test_workspace_dir = tmp_path / "test_ocrd_cli"
    shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))

    # Run through the OCR-D interface
    with working_directory(str(test_workspace_dir)):
        runner = CliRunner()
        args = [
            '-m', 'mets.xml',
            '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI',
            '-O', 'OCR-D-OCR-CALAMARI-EVAL'
            "-m",
            "mets.xml",
            "-I",
            "OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
            "-O",
            "OCR-D-OCR-CALAMARI-EVAL",
        ]
        sys.argv[1:] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
        sys.argv[
            1:
        ] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
        result = runner.invoke(ocrd_dinglehopper, args)
        assert result.exit_code == 0
        result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json'))
        assert json.load(open(str(result_json[0])))['cer'] < 0.03
        result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
        assert json.load(open(str(result_json[0])))["cer"] < 0.03
@@ -7,26 +7,36 @@ from lxml import etree as ET

from .. import word_error_rate, words, page_text, alto_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_word_error_rate_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
    # the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))

    gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
    gt_word_count = (
        7 + 6 + 5 + 8 + 7 + 6 + 7 + 8 + 6 + 7 + 7 + 5 + 6 + 8 + 8 + 7 + 7 + 6 + 5 + 4
    ) # Manually verified word count per line
    assert len(list(words(gt))) == gt_word_count

    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
    assert word_error_rate(gt, ocr) == 2/gt_word_count
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
    assert word_error_rate(gt, ocr) == 2 / gt_word_count


@pytest.mark.integration
def test_word_error_rate_between_page_alto():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
    gt = page_text(
        ET.parse(os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan.gt.page.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan.ocr.tesseract.alto.xml"
            )
        )
    )

    assert gt == ocr
    assert word_error_rate(gt, ocr) == 0
@@ -34,11 +44,25 @@ def test_word_error_rate_between_page_alto():

@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
    gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
    gt = page_text(
        ET.parse(
            os.path.join(data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.gt.page.xml")
        )
    )

    gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
    gt_word_count = (
        14 + 18 + 17 + 14 + 17 + 17 + 3
    ) # Manually verified word count per line
    assert len(list(words(gt))) == gt_word_count

    ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "lorem-ipsum", "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"
            )
        )
    )

    assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
    assert (
        word_error_rate(gt, ocr) == 7 / gt_word_count
    ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
@@ -9,46 +9,54 @@ import pytest
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_alto_namespace():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    assert alto_namespace(tree) == "http://www.loc.gov/standards/alto/ns-v3#"


def test_alto_text():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    result = alto_text(tree)
    expected = textwrap.dedent("""\
    expected = textwrap.dedent(
        """\
        über die vielen Sorgen wegen deſſelben vergaß
        Hartkopf, der Frau Amtmännin das ver-
        ſprochene zu überliefern.""")
        ſprochene zu überliefern."""
    )
    assert result == expected


def test_alto_text_ALTO1():
    tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto1.xml"))
    assert "being erected at the Broadway stock" in alto_text(tree)


def test_alto_text_ALTO2():
    tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
    assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
    tree = ET.parse(os.path.join(data_dir, "test.alto2.xml"))
    assert (
        "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
        in alto_text(tree)
    )


def test_alto_text_ALTO3():
    tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.alto3.xml"))
    assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)


def test_page_namespace():
    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
    assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
    assert (
        page_namespace(tree)
        == "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    )


def test_page_test():
    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
    tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
    result = page_text(tree)

    # We are currently normalizing on extraction, so the text is normalized.
@@ -74,7 +82,8 @@ def test_page_test():
    # Jndeß mangelten do einige Generalia, die
    # alſo wegfielen. — Hartkopf gieng ſelb
    # mit und berbrate es. —""")
    expected = textwrap.dedent("""\
    expected = textwrap.dedent(
        """\
        über die vielen Sorgen wegen deſſelben vergaß
        Hartkopf, der Frau Amtmännin das ver-
        ſprochene zu überliefern. – Ein Erpreſſer
@@ -94,7 +103,8 @@ def test_page_test():
        ſie das, was da wäre, herbeyſchaffen möchte.
        Jndeß mangelten doch einige Generalia, die
        alſo wegfielen. – Hartkopf gieng ſelbſt
        mit und überbrachte es. –""")
        mit und überbrachte es. –"""
    )
    assert result == expected
@@ -107,56 +117,69 @@ def test_page_with_empty_region():
    # <Unicode></Unicode>
    # </TextEquiv>
    # </TextRegion>
    tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
    tree = ET.parse(
        os.path.join(data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml")
    )
    result = page_text(tree)
    assert result


def test_page_order():
    # This file contains TextRegions where file order is not the same as reading order.
    tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "order.page.xml"))
    result = page_text(tree)

    print(result)
    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
    assert re.search(
        r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
        result,
        re.DOTALL,
    )


def test_page_mixed_regions():
    # This file contains ImageRegions and TextRegions in the ReadingOrder
    tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "mixed-regions.page.xml"))
    result = page_text(tree)

    assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result
    assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in result


def test_page_level():
    # This file contains inconsistent TextRegion and TextLine texts

    # TextRegion
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree)
    assert result == 'Inconsistent dummy region text'
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    result = page_text(tree, textequiv_level='region')
    assert result == 'Inconsistent dummy region text'
    assert result == "Inconsistent dummy region text"
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree, textequiv_level="region")
    assert result == "Inconsistent dummy region text"

    # TextLine
    tree = ET.parse(os.path.join(data_dir, 'levels-are-different.page.xml'))
    result = page_text(tree, textequiv_level='line')
    assert result == 'Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-'
    tree = ET.parse(os.path.join(data_dir, "levels-are-different.page.xml"))
    result = page_text(tree, textequiv_level="line")
    assert (
        result
        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
    )


def test_text():
    assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
    assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
    assert "being erected at the Broadway stock" in text(
        os.path.join(data_dir, "test.alto1.xml")
    )
    assert "wieder ein. – Er langte den Zettel aus dem" in text(
        os.path.join(data_dir, "test.page2018.xml")
    )
    assert "Lorem ipsum" in text(os.path.join(data_dir, "test.txt"))


def test_plain(tmp_path):
    with working_directory(str(tmp_path)):
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('AAAAB')
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("AAAAB")

        result = plain_text('ocr.txt')
        expected = 'AAAAB'
        result = plain_text("ocr.txt")
        expected = "AAAAB"
        assert result == expected
@@ -6,32 +6,81 @@ from .. import word_error_rate, words


def test_words():
    result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))
    expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder']
    result = list(
        words("Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?")
    )
    expected = [
        "Der",
        "schnelle",
        "braune",
        "Fuchs",
        "kann",
        "keine",
        "3,14",
        "Meter",
        "springen",
        "oder",
    ]
    assert result == expected


def test_words_private_use_area():
    result = list(words(
        'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
        'ſproene zu berliefern.'))
    result = list(
        words(
            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
            "ſproene zu berliefern."
        )
    )
    expected = [
        'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
        'der', 'Frau', 'Amtmnnin', 'das', 'ver',
        'ſproene', 'zu', 'berliefern']
        "ber",
        "die",
        "vielen",
        "Sorgen",
        "wegen",
        "deelben",
        "vergaß",
        "Hartkopf",
        "der",
        "Frau",
        "Amtmnnin",
        "das",
        "ver",
        "ſproene",
        "zu",
        "berliefern",
    ]
    assert result == expected


def test_word_error_rate():
    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
    assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
    assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
    )
    assert (
        word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
        == 0
    )
    assert (
        word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
        == 0
    )

    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4
    assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
        == 1 / 4
    )
    assert (
        word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
        == 2 / 4
    )

    assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4
    assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!'))
    assert word_error_rate('', '') == 0
    assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
    assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
    assert word_error_rate("", "") == 0

    assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6
    assert (
        word_error_rate(
            "Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
        )
        == 1 / 6
    )
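A rough sketch of the word error rate exercised above: word-level edit distance divided by the number of words in the reference. The whitespace tokenizer is only a stand-in for the project's words() helper, so punctuation is left out of the sample sentences:

def wer_sketch(reference: str, compared: str) -> float:
    ref, hyp = reference.split(), compared.split()
    # Word-level Levenshtein distance (insertions, deletions, substitutions).
    prev = list(range(len(hyp) + 1))
    for i, rw in enumerate(ref, 1):
        cur = [i]
        for j, hw in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (rw != hw)))
        prev = cur
    d = prev[-1]
    if d == 0:
        return 0.0
    return float("inf") if not ref else d / len(ref)

assert wer_sketch("Dies ist ein Beispielsatz", "Dies ist ein Beispielsarz") == 1 / 4  # one substitution
assert wer_sketch("Dies ist ein Beispielsatz", "Dies ein ist Beispielsatz") == 2 / 4  # swapped words cost two edits
assert wer_sketch("Dies ist ein Beispielsatz", "") == 4 / 4                           # everything deleted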
@@ -27,6 +27,7 @@ def unzip(an_iterable_of_tuples):

class working_directory:
    """Context manager to temporarily change the working directory"""

    def __init__(self, wd):
        self.wd = wd
@@ -20,9 +20,10 @@ def words(s: str):

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
            return 'ALetter'
            return "ALetter"
        else:
            return old_word_break(c, index)

    uniseg.wordbreak.word_break = new_word_break

    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
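A hedged, standalone sketch of the monkey-patch above. It assumes, as the code here relies on, that uniseg's word segmentation consults the module-level word_break function:

import uniseg.wordbreak

old_word_break = uniseg.wordbreak.word_break

def new_word_break(c, index=0):
    if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
        return "ALetter"
    return old_word_break(c, index)

uniseg.wordbreak.word_break = new_word_break

# A PUA glyph such as U+E000 now stays inside the surrounding token instead of
# splitting it; the expected output is the single word "ver\ue000liefern".
print(list(uniseg.wordbreak.words("ver\ue000liefern")))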
@@ -30,8 +31,8 @@ def words(s: str):

        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
        unwanted_subcategories = 'Cc', 'Cf'
        unwanted_categories = "O", "M", "P", "Z", "S"
        unwanted_subcategories = "Cc", "Cf"

        subcat = unicodedata.category(c)
        cat = subcat[0]
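The category filter above can be tried directly with the standard library; a small illustration mirroring (not copied from) the check these lines belong to:

import unicodedata

unwanted_categories = "O", "M", "P", "Z", "S"
unwanted_subcategories = "Cc", "Cf"

def is_unwanted(c):
    subcat = unicodedata.category(c)  # e.g. "Po", "Zs", "Ll", "Nd"
    return subcat in unwanted_subcategories or subcat[0] in unwanted_categories

assert is_unwanted(",") and is_unwanted(" ")          # punctuation and whitespace are unwanted
assert not is_unwanted("a") and not is_unwanted("3")  # digits are not filtered, consistent with "3,14" surviving in test_words above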
@@ -53,7 +54,7 @@ def words(s: ExtractedText):

@multimethod
def words_normalized(s: str):
    return words(unicodedata.normalize('NFC', s))
    return words(unicodedata.normalize("NFC", s))


@multimethod
@@ -69,7 +70,9 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:


@multimethod
def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
def word_error_rate_n(
    reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
    return word_error_rate_n(reference.text, compared.text)
@@ -84,7 +87,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
    if d == 0:
        return 0, n
    if n == 0:
        return float('inf'), n
        return float("inf"), n
    return d / n, n