Merge branch 'feat/display-segment-id' of github.com:qurator-spk/dinglehopper into feat/display-segment-id

2026-06-18 19:09:21 +02:00 · 2020-06-23 17:02:56 +02:00 · 2020-06-23 17:02:56 +02:00 · 5a3a74b246
commit 5a3a74b246
parent eca8cbc81e 5aa74e8383
21 changed files with 413 additions and 110 deletions
--- a/.vimrc
+++ b/.vimrc
@ -0,0 +1,2 @@
+" project-specific .vimrc (needs set exrc + set secure)
+set textwidth=90
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -28,16 +28,16 @@ def seq_align(s1, s2):

        if o:
            if o[0] == 'insert':
-                yield (None, s2[j])
+                yield None, s2[j]
                j += 1
            elif o[0] == 'delete':
-                yield (s1[i], None)
+                yield s1[i], None
                i += 1
            elif o[0] == 'replace':
-                yield (s1[i], s2[j])
+                yield s1[i], s2[j]
                i += 1
                j += 1
        else:
-            yield (s1[i], s2[j])
+            yield s1[i], s2[j]
            i += 1
            j += 1
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    :return: character error rate and length of the reference
    """
    d = distance(reference, compared)
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))

    if d == 0:
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -8,11 +8,11 @@ from markupsafe import escape
 from qurator.dinglehopper import *


-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    gtx = ''
    ocrx = ''

-    def format_thing(t, css_classes=None):
+    def format_thing(t, css_classes=None, id_=None):
        if t is None:
            html_t = none
            css_classes += ' ellipsis'
@ -21,19 +21,51 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
        else:
            html_t = escape(t)

+        html_custom_attrs = ""
+
+        # Set Bootstrap tooltip to the segment id
+        if id_:
+            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+
        if css_classes:
-            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
        else:
            return '{html_t}'.format(html_t=html_t)

-    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
-        if g == o:
-            css_classes = None
+    if isinstance(gt_in, ExtractedText):
+        if not isinstance(ocr_in, ExtractedText):
+            raise TypeError()
+        # XXX splitting should be done in ExtractedText
+        gt_things = list(grapheme_clusters(gt_in.text))
+        ocr_things = list(grapheme_clusters(ocr_in.text))
    else:
-            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+        gt_things = gt_in
+        ocr_things = ocr_in
+
+
+
+    g_pos = 0
+    o_pos = 0
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+        css_classes = None
+        gt_id = None
+        ocr_id = None
+        if g != o:
+            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            if isinstance(gt_in, ExtractedText):
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+                # Deletions and inserts only produce one id + None, UI must
+                # support this, i.e. display for the one id produced
+
+        gtx += joiner + format_thing(g, css_classes, gt_id)
+        ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+        if g is not None:
+            g_pos += len(g)
+        if o is not None:
+            o_pos += len(o)

-        gtx += joiner + format_thing(g, css_classes)
-        ocrx += joiner + format_thing(o, css_classes)

    return \
        '''
@ -51,20 +83,17 @@ def process(gt, ocr, report_prefix, *, metrics=True):
    Click on a wrapper.
    """

-    gt_text = text(gt)
-    ocr_text = text(ocr)
-
-    gt_text = substitute_equivalences(gt_text)
-    ocr_text = substitute_equivalences(ocr_text)
+    gt_text = extract(gt)
+    ocr_text = extract(ocr)

    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
    wer, n_words = word_error_rate_n(gt_text, ocr_text)

-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')

    gt_words = words_normalized(gt_text)
    ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')

    def json_float(value):
        """Convert a float value to an JSON float.
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -75,6 +75,12 @@ def distance(s1, s2):
    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
    clusters. This should be the correct way to compare two Unicode strings.
    """
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s1, ExtractedText):
+        s1 = s1.text
+    if isinstance(s2, ExtractedText):
+        s2 = s2.text
    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
    return levenshtein(s1, s2)
@ -116,7 +122,11 @@ def seq_editops(seq1, seq2):


 def editops(word1, word2):
-    # XXX Note that this returns indices to the _grapheme clusters_, not characters!
+    """
+    Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
+    """
    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
    return seq_editops(word1, word2)
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@ -1,11 +1,97 @@
 from __future__ import division, print_function

+from typing import Optional
 from warnings import warn

 from lxml import etree as ET
-import sys
-
 from lxml.etree import XMLSyntaxError
+from contextlib import suppress
+from itertools import repeat
+from .substitute_equivalences import substitute_equivalences
+import sys
+import attr
+import enum
+import unicodedata
+import re
+
+
+@attr.s(frozen=True)
+class ExtractedText:
+    """
+    Text extracted from a document, kept as a list of segments.
+
+    The full text is the segments' texts joined by `joiner`. The segments
+    are kept so that a position in the full text can be mapped back to the
+    id of the segment it came from.
+    """
+    segments = attr.ib(converter=list)
+    joiner = attr.ib(type=str)
+    # TODO Types are not validated (attr does not do this yet)
+
+    @property
+    def text(self):
+        """Return the texts of all segments, joined by the joiner."""
+        return self.joiner.join(s.text for s in self.segments)
+
+    # Lazily built lookup table: position in .text -> segment id
+    _segment_id_for_pos = None
+
+    def segment_id_for_pos(self, pos):
+        """
+        Return the id of the segment that produced position `pos` of .text.
+
+        Positions index Python characters of .text, not grapheme clusters.
+        Positions that fall on a joiner yield None. A position beyond the
+        end of the lookup table raises IndexError.
+        """
+        # Calculate segment ids once, on the first call. Test for None (not
+        # truthiness) so that an empty table is also only built once and
+        # simply raises IndexError on lookup.
+        if self._segment_id_for_pos is None:
+            segment_id_for_pos = []
+            for s in self.segments:
+                segment_id_for_pos.extend(repeat(s.segment_id, len(s.text)))
+                # Joiner positions belong to no segment. Note that a run is
+                # also appended after the last segment.
+                segment_id_for_pos.extend(repeat(None, len(self.joiner)))
+            # This is frozen, so we have to jump through the hoop:
+            object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos)
+
+        return self._segment_id_for_pos[pos]
+
+
+class Normalization(enum.Enum):
+    """Text normalization schemes that normalize() can be asked to apply."""
+    # Plain Unicode NFC normalization
+    NFC = 1
+    # Not implemented yet, normalize() raises NotImplementedError for it
+    NFC_MUFI = 2  # TODO
+    # Normalization done by substitute_equivalences()
+    NFC_SBB = 3
+
+
+def normalize(text, normalization):
+    """
+    Return text normalized according to the given Normalization member.
+
+    Raises NotImplementedError for Normalization.NFC_MUFI and ValueError
+    for anything that is not a known Normalization member.
+    """
+    if normalization == Normalization.NFC_SBB:
+        return substitute_equivalences(text)
+    elif normalization == Normalization.NFC:
+        return unicodedata.normalize('NFC', text)
+    elif normalization == Normalization.NFC_MUFI:
+        raise NotImplementedError()
+    raise ValueError()
+
+
+# XXX hack
+def normalize_sbb(t):
+    """Return t normalized using the Normalization.NFC_SBB scheme."""
+    return normalize(t, Normalization.NFC_SBB)
+
+
+@attr.s(frozen=True)
+class ExtractedTextSegment:
+    """
+    A single segment of extracted text, with the id of its source element.
+
+    The text must already be normalized according to `normalization`,
+    otherwise a ValueError is raised on construction.
+    """
+    segment_id = attr.ib(type=Optional[str])
+
+    @segment_id.validator
+    def _check_segment_id(self, _, value):
+        """Reject segment ids that do not start with an id character."""
+        if value is None:
+            return
+        # NOTE(review): re.match() only anchors at the start, so ids that
+        # contain other characters further in (e.g. 'line 0') pass this
+        # check - confirm whether a full match is intended.
+        if not re.match(r'[\w\d_-]+', value):
+            raise ValueError('Malformed segment id "{}"'.format(value))
+
+    text = attr.ib(type=str)
+
+    @text.validator
+    def _check_text(self, _, value):
+        """Reject text that is not normalized (None is allowed)."""
+        if value is not None and normalize(value, self.normalization) != value:
+            raise ValueError('String "{}" is not normalized.'.format(value))
+
+    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
+
+    @classmethod
+    def from_text_segment(cls, text_segment, nsmap):
+        """Build an ExtractedTextSegment from a PAGE content text element"""
+
+        segment_id = text_segment.attrib['id']
+        unicode_el = text_segment.find(
+            './page:TextEquiv/page:Unicode', namespaces=nsmap)
+        if unicode_el is None:
+            # No TextEquiv/Unicode child: the segment carries no text
+            segment_text = None
+        else:
+            # An empty Unicode element has .text None, treat it as ''
+            segment_text = normalize_sbb(unicode_el.text or '')
+        return cls(segment_id, segment_text)


 def alto_namespace(tree):
@ -21,7 +107,7 @@ def alto_namespace(tree):
        raise ValueError('Not an ALTO tree')


-def alto_text(tree):
+def alto_extract(tree):
    """Extract text from the given ALTO ElementTree."""

    nsmap = {'alto': alto_namespace(tree)}
@ -29,9 +115,18 @@ def alto_text(tree):
    lines = (
        ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
        for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
-    text_ = '\n'.join(lines)

-    return text_
+    return ExtractedText(
+            (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines),
+            '\n'
+    )
+    # TODO This currently does not extract any segment id, because we are
+    #      clueless about the ALTO format.
+    # FIXME needs to handle normalization
+
+
+def alto_text(tree):
+    """Extract the text of the given ALTO ElementTree as a single string."""
+    extracted = alto_extract(tree)
+    return extracted.text


 def page_namespace(tree):
@ -47,18 +142,12 @@ def page_namespace(tree):
        raise ValueError('Not a PAGE tree')


-def page_text(tree):
+def page_extract(tree):
    """Extract text from the given PAGE content ElementTree."""

    nsmap = {'page': page_namespace(tree)}

-    def region_text(region):
-        try:
-            return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
-        except AttributeError:
-            return None
-
-    region_texts = []
+    regions = []
    reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
    if reading_order is not None:
        for group in reading_order.iterfind('./*', namespaces=nsmap):
@ -68,39 +157,55 @@ def page_text(tree):
                    region_id = region_ref_indexed.attrib['regionRef']
                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
                    if region is not None:
-                        region_texts.append(region_text(region))
+                        regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))
                    else:
                        warn('Not a TextRegion: "%s"' % region_id)
            else:
                raise NotImplementedError
    else:
        for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
-            region_texts.append(region_text(region))
+            regions.append(ExtractedTextSegment.from_text_segment(region, nsmap))

-    # XXX Does a file have to have regions etc.? region vs lines etc.
    # Filter empty region texts
-    region_texts = (t for t in region_texts if t)
+    regions = (r for r in regions if r.text is not None)

-    text_ = '\n'.join(region_texts)
-
-    return text_
+    return ExtractedText(regions, '\n')
+    # FIXME needs to handle normalization


-def text(filename):
-    """Read the text from the given file.
+def page_text(tree):
+    """Extract the text of the given PAGE ElementTree as a single string."""
+    extracted = page_extract(tree)
+    return extracted.text
+
+
+def plain_extract(filename):
+    """
+    Extract the text of a plain text file, one segment per line.
+
+    Segment ids have the form 'line N', N being the zero-based line number.
+    """
+    with open(filename, 'r') as f:
+        # Split on '\n' instead of using readlines(): readlines() keeps each
+        # line's terminator, which the '\n' joiner would then double.
+        # Joining the split lines again reproduces the content as read.
+        lines = f.read().split('\n')
+    # Normalize like the ALTO and PAGE extraction do, ExtractedTextSegment
+    # rejects text that is not normalized.
+    return ExtractedText(
+            (ExtractedTextSegment('line %d' % no, normalize_sbb(line))
+             for no, line in enumerate(lines)),
+            '\n'
+    )
+
+
+def plain_text(filename):
+    """Return the text of the given plain text file as a single string."""
+    extracted = plain_extract(filename)
+    return extracted.text
+
+
+def extract(filename):
+    """Extract the text from the given file.

    Supports PAGE, ALTO and falls back to plain text.
    """
-
    try:
        tree = ET.parse(filename)
    except XMLSyntaxError:
-        with open(filename, 'r') as f:
-            return f.read()
+        return plain_extract(filename)
    try:
-        return page_text(tree)
+        return page_extract(tree)
    except ValueError:
-        return alto_text(tree)
+        return alto_extract(tree)
+
+
+def text(filename):
+    """Read the text from the given file, using extract()."""
+    extracted = extract(filename)
+    return extracted.text


 if __name__ == '__main__':
--- a/qurator/dinglehopper/substitute_equivalences.py
+++ b/qurator/dinglehopper/substitute_equivalences.py
@ -1,21 +1,15 @@
 import unicodedata


-def substitute_equivalences(s):
+def unjoin_ligatures(s):
+    """Unjoin ligatures, i.e. ﬀ becomes ff."""

-    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
-    # It might make sense to use different rules for GT and for the different OCR
    equivalences = {
-        '': 'ü',
        '': 'ſſ',
        "\ueba7": 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
-        '': 'ä',
        '': 'ch',
-        '==': '–',  # → en-dash
-        '—': '–',   # em-dash → en-dash
        '': 'ck',
        '': 'll',
-        '': 'ö',
        '': 'ſi',
        '': 'ſt',
        'ﬁ': 'fi',
@ -23,12 +17,7 @@ def substitute_equivalences(s):
        'ﬂ': 'fl',
        'ﬃ': 'ffi',
        '': 'ct',
-        '’': '\'',
-        '⸗': '-',
        '': 'tz',       # MUFI: LATIN SMALL LIGATURE TZ
-        'aͤ': 'ä',        # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',        # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',        # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf532': 'as',  # eMOP: Latin small ligature as
        '\uf533': 'is',  # eMOP: Latin small ligature is
        '\uf534': 'us',  # eMOP: Latin small ligature us
@ -37,10 +26,32 @@ def substitute_equivalences(s):
        '\uE8BF': 'q&',  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET  XXX How to replace this correctly?
        '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
        'ﬆ': 'st',      # U+FB06 LATIN SMALL LIGATURE ST
-        '\uF50E': 'q́'    # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }
-
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s
+
+
+def substitute_equivalences(s):
+    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
+    # It might make sense to use different rules for GT and for the different OCR
+    equivalences = {
+        '': 'ü',
+        '': 'ä',
+        '==': '–',  # → en-dash
+        '—': '–',   # em-dash → en-dash
+        '': 'ö',
+        '’': '\'',
+        '⸗': '-',
+        'aͤ': 'ä',        # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        'oͤ': 'ö',        # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        'uͤ': 'ü',        # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        '\uF50E': 'q́'    # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+    }
+
+    s = unicodedata.normalize('NFC', s)
+    s = unjoin_ligatures(s)
+    for fr, to in equivalences.items():
+        s = s.replace(fr, to)
+    return s
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@ -1,14 +1,15 @@
 function find_diff_class(classes) {
-    return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
+    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
 }

 $(document).ready(function() {
+    /* Enable Bootstrap tooltips */
+    $('[data-toggle="tooltip"]').tooltip();
+
    $('.diff').mouseover(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).addClass('diff-highlight')
+        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    $('.diff').mouseout(function() {
-        let c = find_diff_class($(this).attr('class'))
-        $('.' + c).removeClass('diff-highlight')
+        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
 });
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@ -0,0 +1,68 @@
+import unicodedata
+import pytest
+from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
+from uniseg.graphemecluster import grapheme_clusters
+from qurator.dinglehopper import seq_align
+from collections import namedtuple
+
+
+def test_text():
+    """Test joining segments and mapping text positions to segment ids."""
+    test1 = ExtractedText([
+        ExtractedTextSegment('s0', 'foo'),
+        ExtractedTextSegment('s1', 'bar'),
+        ExtractedTextSegment('s2', 'bazinga')
+    ], ' ')
+
+    assert test1.text == 'foo bar bazinga'
+    assert test1.segment_id_for_pos(0) == 's0'
+    # Position 3 is the joiner between 'foo' and 'bar'
+    assert test1.segment_id_for_pos(3) is None
+    assert test1.segment_id_for_pos(10) == 's2'
+
+def test_normalization_check():
+    """Test that a segment only accepts text that is already normalized."""
+    with pytest.raises(ValueError, match=r'.*is not normalized.*'):
+        ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ'))
+    assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ'))
+
+
+AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id')
+
+
+def test_align():
+    """
+    Check character alignment that keeps track of the segment ids
+
+    Alignment has to operate on grapheme clusters, which may consist of
+    more than one Python character, so text positions are advanced by the
+    length of each cluster.
+    """
+
+    test1 = ExtractedText([
+        ExtractedTextSegment('s0', 'foo'),
+        ExtractedTextSegment('s1', 'bar'),
+        ExtractedTextSegment('s2', 'batzinga')
+    ], ' ')
+    test2 = ExtractedText([
+        ExtractedTextSegment('x0', 'foo'),
+        ExtractedTextSegment('x1', 'bar'),
+        ExtractedTextSegment('x2', '.'),  # extra .
+        ExtractedTextSegment('x3', 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
+    ], ' ')
+
+    pos1 = 0
+    pos2 = 0
+    alignment = []
+    aligned = seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text))
+    for left, right in aligned:
+        # A missing side (insert/delete) has no position and thus no id
+        left_id = None if left is None else test1.segment_id_for_pos(pos1)
+        right_id = None if right is None else test2.segment_id_for_pos(pos2)
+        alignment.append(AlignmentElement(left, right, left_id, right_id))
+        pos1 += 0 if left is None else len(left)
+        pos2 += 0 if right is None else len(right)
+
+    print('test1: {}'.format(test1.text))
+    print('test2: {}'.format(test2.text))
+
+    assert alignment[0]  == ('f',  'f',  's0', 'x0')
+    assert alignment[8]  == (None, '.',  None, 'x2')
+    assert alignment[12] == ('t',  None, 's2', None)
+    assert alignment[15] == ('n',  'm̃',  's2', 'x3')
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@ -78,7 +78,8 @@ def test_lines():


 def test_lines_similar():
-    """Test comparing list of lines while using a "weaker equivalence".
+    """
+    Test comparing list of lines while using a "weaker equivalence".

    This mainly serves as documentation.
    """
@ -88,7 +89,14 @@ def test_lines_similar():
            self._string = string

        def __eq__(self, other):
-            return distance(self._string, other._string) < 2    # XXX NOT the final version
+            # Just an example!
+            min_len = min(len(self._string), len(other._string))
+            if min_len > 0:
+                normalized_distance = distance(self._string, other._string)/min_len
+                similar = normalized_distance < 0.1
+            else:
+                similar = False
+            return similar

        def __ne__(self, other):
            return not self.__eq__(other)
@ -106,3 +114,6 @@ def test_lines_similar():
    left, right = unzip(result)
    assert list(left)  == [SimilarString('This is a line.'), SimilarString('This is another'), None,                             SimilarString('And the last line')]
    assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J  u   n      k'), SimilarString('And the last line')]
+
+    # Test __eq__ (i.e. is it a substitution or a similar string?)
+    assert list(left)[0] == list(right)[0]
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
 def test_align_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
-    # → 4 elements in the alignment should be different.
+    # → 2 elements in the alignment should be different, the ligature is
+    # (currently) not counted due to normalization.
    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.

    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))

    result = list(align(gt, ocr))
-    assert sum(left != right for left, right in result) == 4
+    for left, right in result:
+        if left != right:
+            print(left, right)
+    assert sum(left != right for left, right in result) == 2
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@ -4,6 +4,7 @@ import os

 import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters

 from .. import character_error_rate, page_text, alto_text

@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
 def test_character_error_rate_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # The fi ligature does not count.
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311)  # 2 TextRegions, 1 \n
+
+    gt_len = len(list(grapheme_clusters(gt)))
+    expected_cer = 2/gt_len
+
+    assert character_error_rate(gt, ocr) == expected_cer


@pytest.mark.integration
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@ -1,4 +1,3 @@
-import os
 import json

 import pytest
@ -10,14 +9,17 @@ from ..cli import process
 def test_cli_json(tmp_path):
    """Test that the cli/process() yields a loadable JSON report"""

-    # XXX Path.__str__() is necessary for Python 3.5
    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('AAAAA')
        with open('ocr.txt', 'w') as ocrf:
            ocrf.write('AAAAB')

+        with open('gt.txt', 'r') as gtf:
+            print(gtf.read())
        process('gt.txt', 'ocr.txt', 'report')
+        with open('report.json', 'r') as jsonf:
+            print(jsonf.read())
        with open('report.json', 'r') as jsonf:
            j = json.load(jsonf)
            assert j['cer'] == pytest.approx(0.2)
@ -26,7 +28,6 @@ def test_cli_json(tmp_path):
 def test_cli_json_cer_is_infinity(tmp_path):
    """Test that the cli/process() yields a loadable JSON report when CER == inf"""

-    # XXX Path.__str__() is necessary for Python 3.5
    with working_directory(str(tmp_path)):
        with open('gt.txt', 'w') as gtf:
            gtf.write('')  # Empty to yield CER == inf
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
@pytest.mark.integration
 def test_distance_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # Due to normalization, we don't count the ligature.
+    # → 2 differences
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert distance(gt, ocr) == 4
+    assert distance(gt, ocr) == 2


@pytest.mark.integration
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@ -1,11 +1,9 @@
 import os
-import re
 import shutil
 import json
 from pathlib import Path

 from click.testing import CliRunner
-import pytest
 from .util import working_directory


@ -17,8 +15,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""

-    # XXX Path.str() is necessary for Python 3.5
-
    # Copy test workspace
    test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
    test_workspace_dir = tmp_path / 'test_ocrd_cli'
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

@pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
+    # the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))

    gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4  # Manually verified word count per line
    assert len(list(words(gt))) == gt_word_count

    ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert word_error_rate(gt, ocr) == 3/gt_word_count
+    assert word_error_rate(gt, ocr) == 2/gt_word_count


@pytest.mark.integration
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@ -6,7 +6,8 @@ import textwrap

 import pytest

-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

@ -49,27 +50,51 @@ def test_page_namespace():
 def test_page_test():
    tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
    result = page_text(tree)
+
+    # We are currently normalizing on extraction, so the text is normalized.
+    #
+    #  expected = textwrap.dedent("""\
+    #      ber die vielen Sorgen wegen deelben vergaß
+    #      Hartkopf, der Frau Amtmnnin das ver⸗
+    #      ſproene zu berliefern. — Ein Erpreer
+    #      wurde an ihn abgeſit, um ihn ums Him⸗
+    #      melswien zu ſagen, daß er das Verſproene
+    #      glei den Augenbli berbringen mte, die
+    #      Frau Amtmnnin htte  auf ihn verlaen,
+    #      und nun wßte e nit, was e anfangen
+    #      ſote. Den Augenbli ſote er kommen,
+    #      ſon vergieng e in ihrer Ang. — Die
+    #      Ge wren ſon angekommen, und es fehlte
+    #      ihr do no an aem. —
+    #      Hartkopf mußte  er bennen, und
+    #      endli na langem Nadenken ﬁel es ihm er
+    #      wieder ein. — Er langte den Zettel aus dem
+    #      Accisbue heraus, und ſagte ſeiner Frau, daß
+    #      e das, was da wre, herbeyſaﬀen mte.
+    #      Jndeß mangelten do einige Generalia, die
+    #      alſo wegﬁelen. — Hartkopf gieng ſelb
+    #      mit und berbrate es. —""")
    expected = textwrap.dedent("""\
-        ber die vielen Sorgen wegen deelben vergaß
-        Hartkopf, der Frau Amtmnnin das ver⸗
-        ſproene zu berliefern. — Ein Erpreer
-        wurde an ihn abgeſit, um ihn ums Him⸗
-        melswien zu ſagen, daß er das Verſproene
-        glei den Augenbli berbringen mte, die
-        Frau Amtmnnin htte  auf ihn verlaen,
-        und nun wßte e nit, was e anfangen
-        ſote. Den Augenbli ſote er kommen,
-        ſon vergieng e in ihrer Ang. — Die
-        Ge wren ſon angekommen, und es fehlte
-        ihr do no an aem. —
-        Hartkopf mußte  er bennen, und
-        endli na langem Nadenken ﬁel es ihm er
-        wieder ein. — Er langte den Zettel aus dem
-        Accisbue heraus, und ſagte ſeiner Frau, daß
-        e das, was da wre, herbeyſaﬀen mte.
-        Jndeß mangelten do einige Generalia, die
-        alſo wegﬁelen. — Hartkopf gieng ſelb
-        mit und berbrate es. —""")
+        über die vielen Sorgen wegen deſſelben vergaß
+        Hartkopf, der Frau Amtmännin das ver-
+        ſprochene zu überliefern. – Ein Erpreſſer
+        wurde an ihn abgeſchickt, um ihn ums Him-
+        melswillen zu ſagen, daß er das Verſprochene
+        gleich den Augenblick überbringen möchte, die
+        Frau Amtmännin hätte ſich auf ihn verlaſſen,
+        und nun wüßte ſie nicht, was ſie anfangen
+        ſollte. Den Augenblick ſollte er kommen,
+        ſonſt vergieng ſie in ihrer Angſt. – Die
+        Gäſte wären ſchon angekommen, und es fehlte
+        ihr doch noch an allem. –
+        Hartkopf mußte ſich erſt beſinnen, und
+        endlich nach langem Nachdenken fiel es ihm erſt
+        wieder ein. – Er langte den Zettel aus dem
+        Accisbuche heraus, und ſagte ſeiner Frau, daß
+        ſie das, was da wäre, herbeyſchaffen möchte.
+        Jndeß mangelten doch einige Generalia, die
+        alſo wegfielen. – Hartkopf gieng ſelbſt
+        mit und überbrachte es. –""")
    assert result == expected


@ -92,7 +117,8 @@ def test_page_order():
    tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
    result = page_text(tree)

-    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+    print(result)
+    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)


 def test_page_mixed_regions():
@ -106,5 +132,15 @@ def test_page_mixed_regions():

 def test_text():
    assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
-    assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
    assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+    """Test that plain_text() returns the content of a one-line text file."""
+    with working_directory(str(tmp_path)):
+        with open('ocr.txt', 'w') as ocrf:
+            ocrf.write('AAAAB')
+
+        result = plain_text('ocr.txt')
+        expected = 'AAAAB'
+        assert result == expected
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@ -21,8 +21,8 @@ def diffprint(x, y):
        _diffprint(x, y)


-def unzip(l):
-    return zip(*l)
+def unzip(an_iterable_of_tuples):
+    return zip(*an_iterable_of_tuples)


 class working_directory:
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@ -32,6 +32,11 @@ def words(s):
        cat = subcat[0]
        return cat in unwanted_categories or subcat in unwanted_subcategories

+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
+
    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
    for word in uniseg.wordbreak.words(s):
@ -42,10 +47,20 @@ def words(s):


 def words_normalized(s):
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
    return words(unicodedata.normalize('NFC', s))


 def word_error_rate_n(reference, compared) -> Tuple[float, int]:
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
+    if isinstance(compared, ExtractedText):
+        compared = compared.text
    if isinstance(reference, str):
        reference_seq = list(words_normalized(reference))
        compared_seq = list(words_normalized(compared))
--- a/requirements.txt
+++ b/requirements.txt
@ -4,5 +4,6 @@ lxml
 uniseg
 numpy
 colorama
+MarkupSafe
 ocrd >= 1.0.0b15
 attrs
--- a/setup.cfg
+++ b/setup.cfg
@ -1,2 +1,2 @@
 [flake8]
-max-line-length = 120
+max-line-length = 90