diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000..3b935a0
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,2 @@
+" project-specific .vimrc (needs set exrc + set secure)
+set textwidth=90
diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index ab44760..87febb7 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -28,16 +28,16 @@ def seq_align(s1, s2):
 
         if o:
             if o[0] == 'insert':
-                yield (None, s2[j])
+                yield None, s2[j]
                 j += 1
             elif o[0] == 'delete':
-                yield (s1[i], None)
+                yield s1[i], None
                 i += 1
             elif o[0] == 'replace':
-                yield (s1[i], s2[j])
+                yield s1[i], s2[j]
                 i += 1
                 j += 1
         else:
-            yield (s1[i], s2[j])
+            yield s1[i], s2[j]
             i += 1
             j += 1
diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 05cc931..e99f391 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
     :return: character error rate and length of the reference
     """
     d = distance(reference, compared)
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
 
     if d == 0:
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 63bfd92..9c963c1 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,11 +8,11 @@ from markupsafe import escape
 from qurator.dinglehopper import *
 
 
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     gtx = ''
     ocrx = ''
 
-    def format_thing(t, css_classes=None):
+    def format_thing(t, css_classes=None, id_=None):
         if t is None:
             html_t = none
             css_classes += ' ellipsis'
@@ -21,19 +21,51 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
         else:
             html_t = escape(t)
 
+        html_custom_attrs = ""
+
+        # Set Bootstrap tooltip to the segment id
+        if id_:
+            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+
         if css_classes:
-            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
+            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs)
         else:
             return '{html_t}'.format(html_t=html_t)
 
-    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
-        if g == o:
-            css_classes = None
-        else:
+    if isinstance(gt_in, ExtractedText):
+        if not isinstance(ocr_in, ExtractedText):
+            raise TypeError()
+        # XXX splitting should be done in ExtractedText
+        gt_things = list(grapheme_clusters(gt_in.text))
+        ocr_things = list(grapheme_clusters(ocr_in.text))
+    else:
+        gt_things = gt_in
+        ocr_things = ocr_in
+
+
+
+    g_pos = 0
+    o_pos = 0
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+        css_classes = None
+        gt_id = None
+        ocr_id = None
+        if g != o:
             css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
+            if isinstance(gt_in, ExtractedText):
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
+                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
+                # Deletions and inserts only produce one id + None, UI must
+                # support this, i.e. display for the one id produced
+
+        gtx += joiner + format_thing(g, css_classes, gt_id)
+        ocrx += joiner + format_thing(o, css_classes, ocr_id)
+
+        if g is not None:
+            g_pos += len(g)
+        if o is not None:
+            o_pos += len(o)
 
-        gtx += joiner + format_thing(g, css_classes)
-        ocrx += joiner + format_thing(o, css_classes)
 
     return \
         '''
@@ -51,20 +83,17 @@ def process(gt, ocr, report_prefix, *, metrics=True):
     Click on a wrapper.
    """
 
-    gt_text = text(gt)
-    ocr_text = text(ocr)
-
-    gt_text = substitute_equivalences(gt_text)
-    ocr_text = substitute_equivalences(ocr_text)
+    gt_text = extract(gt)
+    ocr_text = extract(ocr)
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
 
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
 
     def json_float(value):
         """Convert a float value to an JSON float.
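For illustration, this is the kind of markup the reworked format_thing() emits for a differing grapheme cluster once a segment id is passed in. The snippet below is a standalone sketch, not part of the diff; the class string 'cdiff0 diff' and the id 'region0001' are made-up example values.

    # Sketch: reproduce gen_diff_report's span markup for one differing grapheme.
    css_classes = 'cdiff0 diff'      # example value
    id_ = 'region0001'               # hypothetical segment id
    html_custom_attrs = 'data-toggle="tooltip" title="{}"'.format(id_)
    html = '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
        css_classes=css_classes, html_custom_attrs=html_custom_attrs, html_t='a')
    print(html)
    # <span class="cdiff0 diff" data-toggle="tooltip" title="region0001">a</span>
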
+ """ word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) return seq_editops(word1, word2) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index b57a047..a048b1e 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,11 +1,97 @@ from __future__ import division, print_function +from typing import Optional from warnings import warn from lxml import etree as ET +from lxml.etree import XMLSyntaxError +from contextlib import suppress +from itertools import repeat +from .substitute_equivalences import substitute_equivalences import sys +import attr +import enum +import unicodedata +import re + + +@attr.s(frozen=True) +class ExtractedText: + segments = attr.ib(converter=list) + joiner = attr.ib(type=str) + # TODO Types are not validated (attr does not do this yet) + + @property + def text(self): + return self.joiner.join(s.text for s in self.segments) + + _segment_id_for_pos = None + + def segment_id_for_pos(self, pos): + # Calculate segment ids once, on the first call + if not self._segment_id_for_pos: + segment_id_for_pos = [] + for s in self.segments: + segment_id_for_pos.extend(repeat(s.segment_id, len(s.text))) + segment_id_for_pos.extend(repeat(None, len(self.joiner))) + # This is frozen, so we have to jump through the hoop: + object.__setattr__(self, '_segment_id_for_pos', segment_id_for_pos) + assert self._segment_id_for_pos + + return self._segment_id_for_pos[pos] + + +class Normalization(enum.Enum): + NFC = 1 + NFC_MUFI = 2 # TODO + NFC_SBB = 3 + + +def normalize(text, normalization): + if normalization == Normalization.NFC: + return unicodedata.normalize('NFC', text) + if normalization == Normalization.NFC_MUFI: + raise NotImplementedError() + if normalization == Normalization.NFC_SBB: + return substitute_equivalences(text) + else: + raise ValueError() -from lxml.etree import XMLSyntaxError + +# XXX hack +def normalize_sbb(t): + return normalize(t, Normalization.NFC_SBB) + + +@attr.s(frozen=True) +class ExtractedTextSegment: + segment_id = attr.ib(type=Optional[str]) + + @segment_id.validator + def check(self, _, value): + if value is None: + return + if not re.match(r'[\w\d_-]+', value): + raise ValueError('Malformed segment id "{}"'.format(value)) + text = attr.ib(type=str) + + @text.validator + def check(self, _, value): + if value is not None and normalize(value, self.normalization) != value: + raise ValueError('String "{}" is not normalized.'.format(value)) + normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) + + @classmethod + def from_text_segment(cls, text_segment, nsmap): + """Build an ExtractedTextSegment from a PAGE content text element""" + + segment_id = text_segment.attrib['id'] + segment_text = None + with suppress(AttributeError): + segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + segment_text = segment_text or '' + segment_text = normalize_sbb(segment_text) + return cls(segment_id, segment_text) def alto_namespace(tree): @@ -21,7 +107,7 @@ def alto_namespace(tree): raise ValueError('Not an ALTO tree') -def alto_text(tree): +def alto_extract(tree): """Extract text from the given ALTO ElementTree.""" nsmap = {'alto': alto_namespace(tree)} @@ -29,9 +115,18 @@ def alto_text(tree): lines = ( ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) for line in tree.iterfind('.//alto:TextLine', 
namespaces=nsmap)) - text_ = '\n'.join(lines) - return text_ + return ExtractedText( + (ExtractedTextSegment(None, normalize_sbb(line_text)) for line_text in lines), + '\n' + ) + # TODO This currently does not extract any segment id, because we are + # clueless about the ALTO format. + # FIXME needs to handle normalization + + +def alto_text(tree): + return alto_extract(tree).text def page_namespace(tree): @@ -47,18 +142,12 @@ def page_namespace(tree): raise ValueError('Not a PAGE tree') -def page_text(tree): +def page_extract(tree): """Extract text from the given PAGE content ElementTree.""" nsmap = {'page': page_namespace(tree)} - def region_text(region): - try: - return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text - except AttributeError: - return None - - region_texts = [] + regions = [] reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) if reading_order is not None: for group in reading_order.iterfind('./*', namespaces=nsmap): @@ -68,39 +157,55 @@ def page_text(tree): region_id = region_ref_indexed.attrib['regionRef'] region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) if region is not None: - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) else: warn('Not a TextRegion: "%s"' % region_id) else: raise NotImplementedError else: for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): - region_texts.append(region_text(region)) + regions.append(ExtractedTextSegment.from_text_segment(region, nsmap)) - # XXX Does a file have to have regions etc.? region vs lines etc. # Filter empty region texts - region_texts = (t for t in region_texts if t) + regions = (r for r in regions if r.text is not None) - text_ = '\n'.join(region_texts) + return ExtractedText(regions, '\n') + # FIXME needs to handle normalization - return text_ +def page_text(tree): + return page_extract(tree).text -def text(filename): - """Read the text from the given file. + +def plain_extract(filename): + with open(filename, 'r') as f: + return ExtractedText( + (ExtractedTextSegment('line %d' % no, line) for no, line in enumerate(f.readlines())), + '\n' + ) + + +def plain_text(filename): + return plain_extract(filename).text + + +def extract(filename): + """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. """ - try: tree = ET.parse(filename) except XMLSyntaxError: - with open(filename, 'r') as f: - return f.read() + return plain_extract(filename) try: - return page_text(tree) + return page_extract(tree) except ValueError: - return alto_text(tree) + return alto_extract(tree) + + +def text(filename): + return extract(filename).text if __name__ == '__main__': diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 1b7e0cf..39be276 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -1,21 +1,15 @@ import unicodedata -def substitute_equivalences(s): +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. 
ff becomes ff.""" - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', '': 'ſſ', "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ä', '': 'ch', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash '': 'ck', '': 'll', - '': 'ö', '': 'ſi', '': 'ſt', 'fi': 'fi', @@ -23,12 +17,7 @@ def substitute_equivalences(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '’': '\'', - '⸗': '-', '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small ligature is '\uf534': 'us', # eMOP: Latin small ligature us @@ -37,10 +26,32 @@ def substitute_equivalences(s): '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js index ac43676..4c2ba28 100644 --- a/qurator/dinglehopper/templates/report.html.js +++ b/qurator/dinglehopper/templates/report.html.js @@ -1,14 +1,15 @@ function find_diff_class(classes) { - return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); + return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/))); } $(document).ready(function() { + /* Enable Bootstrap tooltips */ + $('[data-toggle="tooltip"]').tooltip(); + $('.diff').mouseover(function() { - let c = find_diff_class($(this).attr('class')) - $('.' + c).addClass('diff-highlight') + find_diff_class($(this).attr('class')).addClass('diff-highlight'); }); $('.diff').mouseout(function() { - let c = find_diff_class($(this).attr('class')) - $('.' 
+ c).removeClass('diff-highlight') + find_diff_class($(this).attr('class')).removeClass('diff-highlight'); }); }); diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py new file mode 100644 index 0000000..2e6a9e6 --- /dev/null +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -0,0 +1,68 @@ +import unicodedata +import pytest +from qurator.dinglehopper import ExtractedText, ExtractedTextSegment +from uniseg.graphemecluster import grapheme_clusters +from qurator.dinglehopper import seq_align +from collections import namedtuple + + +def test_text(): + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'bazinga') + ], ' ') + + assert test1.text == 'foo bar bazinga' + assert test1.segment_id_for_pos(0) == 's0' + assert test1.segment_id_for_pos(3) is None + assert test1.segment_id_for_pos(10) == 's2' + + +def test_normalization_check(): + with pytest.raises(ValueError, match=r'.*is not normalized.*'): + ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) + assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) + + +AlignmentElement = namedtuple('AlignmentElement', 'left right left_id right_id') + + +def test_align(): + """ + Test aligning by character while retaining segment id info + + The difficulty here is that aligning should work on grapheme clusters, + not Python characters. + """ + + test1 = ExtractedText([ + ExtractedTextSegment('s0', 'foo'), + ExtractedTextSegment('s1', 'bar'), + ExtractedTextSegment('s2', 'batzinga') + ], ' ') + test2 = ExtractedText([ + ExtractedTextSegment('x0', 'foo'), + ExtractedTextSegment('x1', 'bar'), + ExtractedTextSegment('x2', '.'), # extra . + ExtractedTextSegment('x3', 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters + ], ' ') + + left_pos = 0; right_pos = 0; alignment = [] + for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): + left_id = test1.segment_id_for_pos(left_pos) if left is not None else None + right_id = test2.segment_id_for_pos(right_pos) if right is not None else None + el = AlignmentElement(left, right, left_id, right_id) + alignment.append(el) + if left is not None: + left_pos += len(left) + if right is not None: + right_pos += len(right) + + print('test1: {}'.format(test1.text)) + print('test2: {}'.format(test2.text)) + + assert alignment[0] == ('f', 'f', 's0', 'x0') + assert alignment[8] == (None, '.', None, 'x2') + assert alignment[12] == ('t', None, 's2', None) + assert alignment[15] == ('n', 'm̃', 's2', 'x3') diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index cc5cb43..23483f8 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -78,7 +78,8 @@ def test_lines(): def test_lines_similar(): - """Test comparing list of lines while using a "weaker equivalence". + """ + Test comparing list of lines while using a "weaker equivalence". This mainly serves as documentation. """ @@ -88,7 +89,14 @@ def test_lines_similar(): self._string = string def __eq__(self, other): - return distance(self._string, other._string) < 2 # XXX NOT the final version + # Just an example! 
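An aside on the grapheme-cluster point this new test makes: uniseg keeps a combining sequence such as m̃ (m + U+0303) together as one cluster, while Python string indexing sees two code points. A quick, self-contained illustration (not part of the diff):

    from uniseg.graphemecluster import grapheme_clusters

    s = 'bazim̃ga'                           # 'm̃' is 'm' + COMBINING TILDE
    print(len(s))                            # 8 code points
    print(len(list(grapheme_clusters(s))))   # 7 grapheme clusters
    print(list(grapheme_clusters(s))[4])     # 'm̃' stays in one piece
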
diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py
index cc5cb43..23483f8 100644
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@@ -78,7 +78,8 @@ def test_lines():
 
 
 def test_lines_similar():
-    """Test comparing list of lines while using a "weaker equivalence".
+    """
+    Test comparing list of lines while using a "weaker equivalence".
 
     This mainly serves as documentation.
     """
@@ -88,7 +89,14 @@ def test_lines_similar():
             self._string = string
 
         def __eq__(self, other):
-            return distance(self._string, other._string) < 2  # XXX NOT the final version
+            # Just an example!
+            min_len = min(len(self._string), len(other._string))
+            if min_len > 0:
+                normalized_distance = distance(self._string, other._string)/min_len
+                similar = normalized_distance < 0.1
+            else:
+                similar = False
+            return similar
 
         def __ne__(self, other):
             return not self.__eq__(other)
@@ -106,3 +114,6 @@ def test_lines_similar():
     left, right = unzip(result)
     assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
     assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
+
+    # Test __eq__ (i.e. is it a substitution or a similar string?)
+    assert list(left)[0] == list(right)[0]
diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py
index df1e230..b35974b 100644
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@@ -13,11 +13,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_align_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
-    # → 4 elements in the alignment should be different.
+    # → 2 elements in the alignment should be different, the ligature is
+    # (currently) not counted due to normalization.
     # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
 
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
 
     result = list(align(gt, ocr))
-    assert sum(left != right for left, right in result) == 4
+    for left, right in result:
+        if left != right:
+            print(left, right)
+    assert sum(left != right for left, right in result) == 2
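Why the expected count drops from 4 to 2: text is now normalized on extraction (normalize_sbb() runs substitute_equivalences(), which in turn runs the new unjoin_ligatures()), so a ligature in the ground truth compares equal to its two-letter expansion in the fake OCR. A minimal check of that assumption (import path assumed; not part of the diff):

    from qurator.dinglehopper.substitute_equivalences import substitute_equivalences

    # U+FB06 LATIN SMALL LIGATURE ST is unjoined before any comparison
    assert substitute_equivalences('\ufb06') == 'st'
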
diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
index c27cd31..1c3bf52 100644
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@@ -4,6 +4,7 @@ import os
 
 import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
 
 from .. import character_error_rate, page_text, alto_text
 
@@ -13,9 +14,14 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_character_error_rate_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # The ﬁ ligature does not count.
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311)  # 2 TextRegions, 1 \n
+
+    gt_len = len(list(grapheme_clusters(gt)))
+    expected_cer = 2/gt_len
+
+    assert character_error_rate(gt, ocr) == expected_cer
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
index 5699700..d71bc14 100644
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@@ -1,4 +1,3 @@
-import os
 import json
 
 import pytest
@@ -10,14 +9,17 @@ from ..cli import process
 
 def test_cli_json(tmp_path):
     """Test that the cli/process() yields a loadable JSON report"""
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('AAAAA')
         with open('ocr.txt', 'w') as ocrf:
             ocrf.write('AAAAB')
 
+        with open('gt.txt', 'r') as gtf:
+            print(gtf.read())
         process('gt.txt', 'ocr.txt', 'report')
+        with open('report.json', 'r') as jsonf:
+            print(jsonf.read())
 
         with open('report.json', 'r') as jsonf:
             j = json.load(jsonf)
             assert j['cer'] == pytest.approx(0.2)
@@ -26,7 +28,6 @@ def test_cli_json(tmp_path):
 
 def test_cli_json_cer_is_infinity(tmp_path):
     """Test that the cli/process() yields a loadable JSON report when CER == inf"""
-    # XXX Path.__str__() is necessary for Python 3.5
     with working_directory(str(tmp_path)):
         with open('gt.txt', 'w') as gtf:
             gtf.write('')  # Empty to yield CER == inf
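As a cross-check on the 0.2 asserted above: 'AAAAA' vs. 'AAAAB' is one substitution over five grapheme clusters, so CER = 1/5 (sketch, not part of the diff):

    from qurator.dinglehopper import character_error_rate

    assert character_error_rate('AAAAA', 'AAAAB') == 1 / 5  # d = 1, n = 5
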
diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
index 2857d56..cbe12f8 100644
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@@ -13,9 +13,11 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 @pytest.mark.integration
 def test_distance_between_page_files():
     # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # Due to normalization, we don't count the ligature.
+    # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert distance(gt, ocr) == 4
+    assert distance(gt, ocr) == 2
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
index 41da748..75bb816 100644
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -1,11 +1,9 @@
 import os
-import re
 import shutil
 import json
 from pathlib import Path
 
 from click.testing import CliRunner
-import pytest
 
 from .util import working_directory
 
@@ -17,8 +15,6 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 def test_ocrd_cli(tmp_path):
     """Test OCR-D interface"""
 
-    # XXX Path.str() is necessary for Python 3.5
-
     # Copy test workspace
     test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
     test_workspace_dir = tmp_path / 'test_ocrd_cli'
diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
index 1d2dead..f5c922b 100644
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@@ -12,14 +12,15 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 
 @pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. → 3 changed words
+    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
+    # the ligature does not count → 2 errors
 
     gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
 
     gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4  # Manually verified word count per line
     assert len(list(words(gt))) == gt_word_count
 
     ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
-    assert word_error_rate(gt, ocr) == 3/gt_word_count
+    assert word_error_rate(gt, ocr) == 2/gt_word_count
 
 
 @pytest.mark.integration
diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py
index dd9377a..3291152 100644
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@@ -6,7 +6,8 @@
 import textwrap
 
 import pytest
 
-from .. import alto_namespace, alto_text, page_namespace, page_text, text
+from .util import working_directory
+from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
 
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
 
@@ -49,27 +50,51 @@ def test_page_namespace():
 def test_page_test():
     tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
     result = page_text(tree)
+
+    # We are currently normalizing on extraction, so the text is normalized.
+    #
+    # expected = textwrap.dedent("""\
+    #     ber die vielen Sorgen wegen deelben vergaß
+    #     Hartkopf, der Frau Amtmnnin das ver⸗
+    #     ſproene zu berliefern. — Ein Erpreer
+    #     wurde an ihn abgeſit, um ihn ums Him⸗
+    #     melswien zu ſagen, daß er das Verſproene
+    #     glei den Augenbli berbringen mte, die
+    #     Frau Amtmnnin htte  auf ihn verlaen,
+    #     und nun wßte e nit, was e anfangen
+    #     ſote. Den Augenbli ſote er kommen,
+    #     ſon vergieng e in ihrer Ang. — Die
+    #     Ge wren ſon angekommen, und es fehlte
+    #     ihr do no an aem. —
+    #     Hartkopf mußte  er bennen, und
+    #     endli na langem Nadenken fiel es ihm er
+    #     wieder ein. — Er langte den Zettel aus dem
+    #     Accisbue heraus, und ſagte ſeiner Frau, daß
+    #     e das, was da wre, herbeyſaffen mte.
+    #     Jndeß mangelten do einige Generalia, die
+    #     alſo wegfielen. — Hartkopf gieng ſelb
+    #     mit und berbrate es. —""")
     expected = textwrap.dedent("""\
-        ber die vielen Sorgen wegen deelben vergaß
-        Hartkopf, der Frau Amtmnnin das ver⸗
-        ſproene zu berliefern. — Ein Erpreer
-        wurde an ihn abgeſit, um ihn ums Him⸗
-        melswien zu ſagen, daß er das Verſproene
-        glei den Augenbli berbringen mte, die
-        Frau Amtmnnin htte  auf ihn verlaen,
-        und nun wßte e nit, was e anfangen
-        ſote. Den Augenbli ſote er kommen,
-        ſon vergieng e in ihrer Ang. — Die
-        Ge wren ſon angekommen, und es fehlte
-        ihr do no an aem. —
-        Hartkopf mußte  er bennen, und
-        endli na langem Nadenken fiel es ihm er
-        wieder ein. — Er langte den Zettel aus dem
-        Accisbue heraus, und ſagte ſeiner Frau, daß
-        e das, was da wre, herbeyſaffen mte.
-        Jndeß mangelten do einige Generalia, die
-        alſo wegfielen. — Hartkopf gieng ſelb
-        mit und berbrate es. —""")
+        über die vielen Sorgen wegen deſſelben vergaß
+        Hartkopf, der Frau Amtmännin das ver-
+        ſprochene zu überliefern. – Ein Erpreſſer
+        wurde an ihn abgeſchickt, um ihn ums Him-
+        melswillen zu ſagen, daß er das Verſprochene
+        gleich den Augenblick überbringen möchte, die
+        Frau Amtmännin hätte ſich auf ihn verlaſſen,
+        und nun wüßte ſie nicht, was ſie anfangen
+        ſollte. Den Augenblick ſollte er kommen,
+        ſonſt vergieng ſie in ihrer Angſt. – Die
+        Gäſte wären ſchon angekommen, und es fehlte
+        ihr doch noch an allem. –
+        Hartkopf mußte ſich erſt beſinnen, und
+        endlich nach langem Nachdenken fiel es ihm erſt
+        wieder ein. – Er langte den Zettel aus dem
+        Accisbuche heraus, und ſagte ſeiner Frau, daß
+        ſie das, was da wäre, herbeyſchaffen möchte.
+        Jndeß mangelten doch einige Generalia, die
+        alſo wegfielen. – Hartkopf gieng ſelbſt
+        mit und überbrachte es. –""")
 
     assert result == expected
 
@@ -92,7 +117,8 @@ def test_page_order():
     tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
     result = page_text(tree)
 
-    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
+    print(result)
+    assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die', result, re.DOTALL)
 
 
 def test_page_mixed_regions():
@@ -106,5 +132,15 @@ def test_page_mixed_regions():
 
 def test_text():
     assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
-    assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
+    assert "wieder ein. – Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
     assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
+
+
+def test_plain(tmp_path):
+    with working_directory(str(tmp_path)):
+        with open('ocr.txt', 'w') as ocrf:
+            ocrf.write('AAAAB')
+
+        result = plain_text('ocr.txt')
+        expected = 'AAAAB'
+        assert result == expected
diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py
index 52b7506..1f224e5 100644
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@@ -21,8 +21,8 @@ def diffprint(x, y):
         _diffprint(x, y)
 
 
-def unzip(l):
-    return zip(*l)
+def unzip(an_iterable_of_tuples):
+    return zip(*an_iterable_of_tuples)
 
 
 class working_directory:
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 7ed56e4..64eba0a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -32,6 +32,11 @@ def words(s):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
+
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
     for word in uniseg.wordbreak.words(s):
@@ -42,10 +47,20 @@ def words(s):
 
 
 def words_normalized(s):
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(s, ExtractedText):
+        s = s.text
     return words(unicodedata.normalize('NFC', s))
 
 
 def word_error_rate_n(reference, compared) -> Tuple[float, int]:
+    # XXX
+    from .cli import ExtractedText
+    if isinstance(reference, ExtractedText):
+        reference = reference.text
+    if isinstance(compared, ExtractedText):
+        compared = compared.text
     if isinstance(reference, str):
         reference_seq = list(words_normalized(reference))
         compared_seq = list(words_normalized(compared))
diff --git a/requirements.txt b/requirements.txt
index de6547b..846990b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,6 @@ lxml
 uniseg
 numpy
 colorama
+MarkupSafe
 ocrd >= 1.0.0b15
 attrs
diff --git a/setup.cfg b/setup.cfg
index 6deafc2..43d7a3a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,2 @@
 [flake8]
-max-line-length = 120
+max-line-length = 90
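Putting the new pieces together, a sketch of the intended API (not part of the diff; the file name is hypothetical):

    from qurator.dinglehopper.ocr_files import extract

    gt = extract('gt.page2018.xml')    # ExtractedText, one segment per TextRegion
    print(gt.text)                     # normalized text, segments joined by '\n'
    print(gt.segment_id_for_pos(0))    # id of the segment covering position 0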