commit 89048bf55da0e23f60f31b6282e8342cfda8dede Author: Gerber, Mike Date: Wed Aug 14 15:32:50 2019 +0200 ➡ Move dinglehopper into its own directory diff --git a/README.md b/README.md new file mode 100644 index 0000000..817cac3 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +dinglehopper +============ + +dinglehopper is an OCR evaluation tool and reads ALTO, PAGE and text files. diff --git a/qurator/__init__.py b/qurator/__init__.py new file mode 100644 index 0000000..8d17c21 --- /dev/null +++ b/qurator/__init__.py @@ -0,0 +1,2 @@ +__import__('pkg_resources').declare_namespace(__name__) + diff --git a/qurator/dinglehopper/.gitignore b/qurator/dinglehopper/.gitignore new file mode 100644 index 0000000..e70d1f9 --- /dev/null +++ b/qurator/dinglehopper/.gitignore @@ -0,0 +1,6 @@ +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf diff --git a/qurator/dinglehopper/.idea/dinglehopper.iml b/qurator/dinglehopper/.idea/dinglehopper.iml new file mode 100644 index 0000000..e273926 --- /dev/null +++ b/qurator/dinglehopper/.idea/dinglehopper.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/.idea/misc.xml b/qurator/dinglehopper/.idea/misc.xml new file mode 100644 index 0000000..ba209a1 --- /dev/null +++ b/qurator/dinglehopper/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/.idea/modules.xml b/qurator/dinglehopper/.idea/modules.xml new file mode 100644 index 0000000..6035afb --- /dev/null +++ b/qurator/dinglehopper/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py new file mode 100644 index 0000000..0e8ee38 --- /dev/null +++ b/qurator/dinglehopper/__init__.py @@ -0,0 +1,5 @@ +from .ocr_files import * +from .substitute_equivalences import * +from .character_error_rate import * 
+from .word_error_rate import * +from .align import * diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py new file mode 100644 index 0000000..043db33 --- /dev/null +++ b/qurator/dinglehopper/align.py @@ -0,0 +1,34 @@ +from .edit_distance import * + +def align(s1, s2): + s1 = list(s1) + s2 = list(s2) + ops = seq_editops(s1, s2) + i = 0 + j = 0 + + while i < len(s1) or j < len(s2): + o = None + try: + ot = ops[0] + if ot[1] == i and ot[2] == j: + ops = ops[1:] + o = ot + except IndexError: + pass + + if o: + if o[0] == 'insert': + yield (None, s2[j]) + j += 1 + elif o[0] == 'delete': + yield (s1[i], None) + i += 1 + elif o[0] == 'replace': + yield (s1[i], s2[j]) + i += 1 + j += 1 + else: + yield (s1[i], s2[j]) + i += 1 + j += 1 diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py new file mode 100644 index 0000000..f63a15f --- /dev/null +++ b/qurator/dinglehopper/character_error_rate.py @@ -0,0 +1,21 @@ +from __future__ import division + +import unicodedata + +from uniseg.graphemecluster import grapheme_clusters + +from qurator.dinglehopper.edit_distance import distance + + +def character_error_rate(reference, compared): + d = distance(reference, compared) + if d == 0: + return 0 + + n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) + if n == 0: + return float('inf') + + return d/n + + # XXX Should we really count newlines here? diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py new file mode 100644 index 0000000..8c3186a --- /dev/null +++ b/qurator/dinglehopper/cli.py @@ -0,0 +1,83 @@ +import os + +import click +from jinja2 import Environment, FileSystemLoader + + +from qurator.dinglehopper import * + + +def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none): + gtx = '' + ocrx = '' + + def format_thing(t, css_classes=None): + if t is None: + t = none + css_classes += ' ellipsis' + if t == '\n': + t = '
' + + if css_classes: + return '{t}'.format(css_classes=css_classes, t=t) + else: + return '{t}'.format(t=t) + + for k, (g, o) in enumerate(align(gt_things, ocr_things)): + if g == o: + css_classes = None + else: + css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) + + gtx += joiner + format_thing(g, css_classes) + ocrx += joiner + format_thing(o, css_classes) + + return \ + ''' +
+
{}
+
{}
+
+ '''.format(gtx, ocrx) + + +@click.command() +@click.argument('gt', type=click.Path(exists=True)) +@click.argument('ocr', type=click.Path(exists=True)) +def process(gt, ocr): + """Check OCR result against GT""" + + gt_text = text(gt) + ocr_text = text(ocr) + + gt_text = substitute_equivalences(gt_text) + ocr_text = substitute_equivalences(ocr_text) + + cer = character_error_rate(gt_text, ocr_text) + wer = word_error_rate(gt_text, ocr_text) + uwer = unordered_word_error_rate(gt_text, ocr_text) + + char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') + + gt_words = words(gt_text) + ocr_words = words(ocr_text) + word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') + + env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) + for out_fn in ('report.html', 'report.json'): + template_fn = out_fn + '.j2' + template = env.get_template(template_fn) + template.stream( + gt=gt, ocr=ocr, + cer=cer, wer=wer, uwer=uwer, + char_diff_report=char_diff_report, + word_diff_report=word_diff_report + ).dump(out_fn) + + +def main(): + process() + + +if __name__ == '__main__': + main() diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py new file mode 100644 index 0000000..7322563 --- /dev/null +++ b/qurator/dinglehopper/edit_distance.py @@ -0,0 +1,95 @@ +from __future__ import division, print_function + +import unicodedata +from functools import partial + +import numpy as np +from uniseg.graphemecluster import grapheme_clusters + + +def levenshtein_matrix(seq1, seq2): + """Compute the matrix commonly computed to produce the Levenshtein distance. + + This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired + edit distance. + + This algorithm is implemented here because we need an implementation that can work with sequences other than + strings, e.g. 
lists of grapheme clusters or lists of word strings. + """ + m = len(seq1) + n = len(seq2) + + def from_to(start, stop): + return range(start, stop + 1, 1) + + D = np.zeros((m + 1, n + 1), np.int) + D[0, 0] = 0 + for i in from_to(1, m): + D[i, 0] = i + for j in from_to(1, n): + D[0, j] = j + for i in from_to(1, m): + for j in from_to(1, n): + D[i, j] = min( + D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution + D[i, j - 1] + 1, # Insertion + D[i - 1, j] + 1 # Deletion + ) + + return D + + +def levenshtein(seq1, seq2): + """Compute the Levenshtein edit distance between two sequences""" + m = len(seq1) + n = len(seq2) + + D = levenshtein_matrix(seq1, seq2) + return D[m, n] + + +def distance(s1, s2): + """Compute the Levenshtein edit distance between two Unicode strings + + Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme + clusters. This should be the correct way to compare two Unicode strings. + """ + s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) + s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) + return levenshtein(s1, s2) + + +def seq_editops(seq1, seq2): + seq1 = list(seq1) + seq2 = list(seq2) + m = len(seq1) + n = len(seq2) + D = levenshtein_matrix(seq1, seq2) + + def _tail_backtrace(i, j, accumulator): + if i > 0 and D[i - 1, j] + 1 == D[i, j]: + return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator) + if j > 0 and D[i, j - 1] + 1 == D[i, j]: + return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator) + if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: + return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator) + if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: + return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP + return accumulator + + def backtrace(i, j): + result = partial(_tail_backtrace, i, j, []) + while isinstance(result, partial): + result = 
result() + + return result + + b = backtrace(m, n) + return b + + +def editops(word1, word2): + # XXX Note that this returns indices to the _grapheme clusters_, not characters! + word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) + word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) + return seq_editops(word1, word2) diff --git a/qurator/dinglehopper/notebooks/Levenshtein.ipynb b/qurator/dinglehopper/notebooks/Levenshtein.ipynb new file mode 100644 index 0000000..f56d0d7 --- /dev/null +++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb @@ -0,0 +1,1037 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import unicodedata\n", + "import inspect" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Levenshtein edit distance" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def levenshtein_matrix(seq1, seq2):\n", + " \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n", + "\n", + " This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n", + " edit distance.\n", + "\n", + " This algorithm is implemented here because we need an implementation that can work with sequences other than\n", + " strings, e.g. 
lists of grapheme clusters or lists of word strings.\n", + " \"\"\"\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + "\n", + " def from_to(start, stop):\n", + " return range(start, stop + 1, 1)\n", + "\n", + " D = np.zeros((m + 1, n + 1), np.int)\n", + " D[0, 0] = 0\n", + " for i in from_to(1, m):\n", + " D[i, 0] = i\n", + " for j in from_to(1, n):\n", + " D[0, j] = j\n", + " for i in from_to(1, m):\n", + " for j in from_to(1, n):\n", + " D[i, j] = min(\n", + " D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution\n", + " D[i, j - 1] + 1, # Insertion\n", + " D[i - 1, j] + 1 # Deletion\n", + " )\n", + "\n", + " return D\n", + "\n", + "def levenshtein(seq1, seq2):\n", + " \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + "\n", + " D = levenshtein_matrix(seq1, seq2)\n", + " return D[m, n]\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import levenshtein_matrix, levenshtein\n", + "\n", + "print(inspect.getsource(levenshtein_matrix))\n", + "print(inspect.getsource(levenshtein))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "assert levenshtein('a', 'a') == 0\n", + "assert levenshtein('a', 'b') == 1\n", + "assert levenshtein('Foo', 'Bar') == 3\n", + "assert levenshtein('', '') == 0\n", + "assert levenshtein('Foo', '') == 3\n", + "assert levenshtein('', 'Foo') == 3\n", + "assert levenshtein('Fnord', 'Food') == 2\n", + "assert levenshtein('Müll', 'Mull') == 1\n", + "assert levenshtein('Abstand', 'Sand') == 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This fails for different representations of the \"same\" canonically equivalent string:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": 
[ + "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", + "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", + "levenshtein(word1, word2)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Same, but for grapheme clusters\n", + "from uniseg.graphemecluster import grapheme_clusters\n", + "\n", + "word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schlyñ')))\n", + "word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schlyñ')))\n", + "levenshtein(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Better." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define a edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def distance(s1, s2):\n", + " \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n", + "\n", + " Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n", + " clusters. 
This should be the correct way to compare two Unicode strings.\n", + " \"\"\"\n", + " s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n", + " s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n", + " return levenshtein(s1, s2)\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import distance\n", + "print(inspect.getsource(distance))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", + "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", + "\n", + "distance(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = 'Schlyñ'\n", + "word2 = 'Schlym̃'\n", + "#print('Lengths, as far as Python is concerned:', len(word1), len(word2)) # → gives 6 and 7!\n", + "distance(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Edit operations\n", + "\n", + "python-Levenshtein supports backtracing, i.e. 
giving a sequence of edit options that transforms a word to another word:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('insert', 5, 5), ('replace', 5, 6)]\n" + ] + } + ], + "source": [ + "import Levenshtein\n", + "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", + "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", + "print(Levenshtein.editops(word1, word2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def seq_editops(seq1, seq2):\n", + " seq1 = list(seq1)\n", + " seq2 = list(seq2)\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + " D = levenshtein_matrix(seq1, seq2)\n", + "\n", + " def _tail_backtrace(i, j, accumulator):\n", + " if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n", + " if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n", + " if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n", + " if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP\n", + " return accumulator\n", + "\n", + " def backtrace(i, j):\n", + " result = partial(_tail_backtrace, i, j, [])\n", + " while isinstance(result, partial):\n", + " result = result()\n", + "\n", 
+ " return result\n", + "\n", + " b = backtrace(m, n)\n", + " return b\n", + "\n", + "def editops(word1, word2):\n", + " # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n", + " word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n", + " word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n", + " return seq_editops(word1, word2)\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import seq_editops, editops\n", + "print(inspect.getsource(seq_editops))\n", + "print(inspect.getsource(editops))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('replace', 2, 2)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "editops('Foo', 'Fon')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('insert', 4, 4)]\n", + "[('insert', 4, 4)]\n" + ] + } + ], + "source": [ + "print(editops('Käptn', 'Käpt\\'n'))\n", + "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('delete', 6, 6)]\n", + "[('delete', 6, 6)]\n" + ] + } + ], + "source": [ + "print(editops('Delete something', 'Deletesomething'))\n", + "print(Levenshtein.editops('Delete something', 'Deletesomething'))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n", + "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n" + ] + } + ], + "source": [ + "print(editops('A more difficult example', 'Amore difficült exampl'))\n", + 
"print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XXX Note that our implementation returns different positions here for the 'insert'. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it with a difficult example that needs grapheme cluster handling:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('replace', 5, 5)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", + "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", + "\n", + "editops(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🎉" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Character error rate\n", + "\n", + "[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:\n", + "\n", + "$$\n", + "\\text{CER} = \\frac{i + s + d}{n}\n", + "$$\n", + "\n", + "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. 
(The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because our edit distance is equal to $i + s + d$, we can thus define:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def character_error_rate(reference, compared):\n", + " d = distance(reference, compared)\n", + " if d == 0:\n", + " return 0\n", + "\n", + " n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n", + " if n == 0:\n", + " return float('inf')\n", + "\n", + " return d/n\n", + "\n" + ] + } + ], + "source": [ + "from character_error_rate import character_error_rate\n", + "print(inspect.getsource(character_error_rate))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "assert character_error_rate('Foo', 'Bär') == 3/3\n", + "assert character_error_rate('Fnord', 'Food') == 2/5\n", + "assert character_error_rate('Food', 'Fnord') == 2/4\n", + "assert character_error_rate('Schlyñ', 'Schlym̃') == 1/6" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.\n", + "gt = \"\"\"115 über die vielen Sorgen wegen deſſelben vergaß Hartkopf, der Frau Amtmännin das ver⸗ ſprochene zu überliefern. — Ein Erpreſſer wurde an ihn abgeſchickt, um ihn ums Him⸗ melswillen zu ſagen, daß er das Verſprochene gleich den Augenblick überbringen möchte, die Frau Amtmännin hätte ſich auf ihn verlaſſen, und nun wüßte ſie nicht, was ſie anfangen ſollte. Den Augenblick ſollte er kommen, ſonſt vergieng ſie in ihrer Angſt. 
— Die Gäſte wären ſchon angekommen, und es fehlte ihr doch noch an allem. — Hartkopf mußte ſich erſt beſinnen, und endlich nach langem Nachdenken fiel es ihm erſt wieder ein. — Er langte den Zettel aus dem Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das, was da wäre, herbeyſchaffen möchte. Jndeß mangelten doch einige Generalia, die alſo wegfielen. — Hartkopf gieng ſelbſt mit und überbrachte es. — „Herr Jemine! er böſer Mann!“ — ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der voll gedrückt, gerüttelt und überflüſſig in ihren Schoos gegeben werden ſollte, mit Augen voller Freu⸗ H 2\"\"\"\n", + "tess = \"\"\"emm unmit; Lis Übey die vielen Sorgen wegen\" deſſelben vergaß Hartkopf, der Frau! Amimännin das- ver ſprochene zu überliefeen. ==\" Ein Epypreſſer- wurde an ihn abgeſchieet', um' ihn ums Hime melswillen zu ſagen, \"daß er das Verſyrochene leich den Augenblick \"überbringen möchte, die Frau Amtmännin hätte ſich auf ihn veriaſſen, und nun wüßte ſie- nicht, was ſie anfangen ſollte, =! 'Den Augenblick ſollte \"er kommen, ſonſt vergieng ſie in ihrer Angſt. == Die Säuaſie- wären. ſchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mußte ſich erſt TIM und endlich mach langem Rachdenken fiel es ihm erſt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das , was da wäre, herbeyſchaffen mschte. ZIudeß „mangelten doch einige Generalia, die alſo wegfielen. == ' Havrkopf gieng ſelbſt mit und überbrachte es == | „Herr Jemine! 
er böſer Mann 1-2 ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der - voll gedrückt, gerüttelt und überfirfſig in ihren Ss HEILE werden ſolite, mit Augen voller EE) Fron?\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1190\n" + ] + } + ], + "source": [ + "print('{:.4f}'.format(character_error_rate(gt, tess)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1190253045923149" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "levenshtein(gt, tess)/len(gt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "for w in gt, tess:\n", + " for g in grapheme_clusters(w):\n", + " assert len(g) == 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Maybe ocrevalUAtion doesn't count whitespace?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'115überdievielenSorgenwegendeſſelbenvergaßHartkopf,derFrauAmtmännindasver⸗ſprochenezuüberliefern.—EinErpreſſerwurdeanihnabgeſchickt,umihnumsHim⸗melswillenzuſagen,daßerdasVerſprochenegleichdenAugenblicküberbringenmöchte,dieFrauAmtmänninhätteſichaufihnverlaſſen,undnunwüßteſienicht,wasſieanfangenſollte.DenAugenblickſollteerkommen,ſonſtvergiengſieinihrerAngſt.—DieGäſtewärenſchonangekommen,undesfehlteihrdochnochanallem.—Hartkopfmußteſicherſtbeſinnen,undendlichnachlangemNachdenkenfielesihmerſtwiederein.—ErlangtedenZettelausdemAccisbucheheraus,undſagteſeinerFrau,daßſiedas,wasdawäre,herbeyſchaffenmöchte.JndeßmangeltendocheinigeGeneralia,diealſowegfielen.—Hartkopfgiengſelbſtmitundüberbrachtees.—„HerrJemine!erböſerMann!“—ſchrieihmdieFrauAmtmänninentgegen,undſchlugihnaufdieSchulterundblicktedenKorb,dervollgedrückt,gerütteltundüberflüſſiginihrenSchoosgegebenwerdenſollte,mitAugenvollerFreu⸗H2'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def remove_whitespace(s):\n", + " return s.replace(' ', '')\n", + "remove_whitespace(gt)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1324\n" + ] + } + ], + "source": [ + "print('{:.4f}'.format(character_error_rate(remove_whitespace(gt), remove_whitespace(tess))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it's larger than ocrevalUAtion 🤷‍♂️" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word error rate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word segmentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Naively split on spaces.\n", + "\n", + "(Note: ocrevalUAtion does confusing things here, like 
the Token splitting in a hash function, with an empty pattern?!)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def naive_word_split(s):\n", + " return s.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "example_text = \"The quick (“brown”) fox can't jump 32.3 feet, right?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['The',\n", + " 'quick',\n", + " '(“brown”)',\n", + " 'fox',\n", + " \"can't\",\n", + " 'jump',\n", + " '32.3',\n", + " 'feet,',\n", + " 'right?']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "naive_word_split(example_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's do it the Unicode way (Appendix UAX #29 on Unicode Text Segmentation): Split on word boundaries using the uniseg libraries and ignore words that contain only whitespace, punctuation \"and similar characters\":" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def words(s):\n", + " # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n", + " # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n", + " old_word_break = uniseg.wordbreak.word_break\n", + "\n", + " def new_word_break(c, index=0):\n", + " if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n", + " return 'ALetter'\n", + " else:\n", + " return old_word_break(c, index)\n", + " uniseg.wordbreak.word_break = new_word_break\n", + "\n", + " # Check if c is an unwanted character, i.e. 
whitespace, punctuation, or similar\n", + " def unwanted(c):\n", + "\n", + " # See https://www.fileformat.info/info/unicode/category/index.htm\n", + " # and https://unicodebook.readthedocs.io/unicode.html#categories\n", + " unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n", + " unwanted_subcategories = 'Cc', 'Cf'\n", + "\n", + " subcat = unicodedata.category(c)\n", + " cat = subcat[0]\n", + " return cat in unwanted_categories or subcat in unwanted_subcategories\n", + "\n", + " # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n", + " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n", + " for word in uniseg.wordbreak.words(s):\n", + " if all(unwanted(c) for c in word):\n", + " pass\n", + " else:\n", + " yield word\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['The', 'quick', 'brown', 'fox', \"can't\", 'jump', '32.3', 'feet', 'right']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from word_error_rate import words\n", + "print(inspect.getsource(words))\n", + "\n", + "list(words(example_text))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Der',\n", + " 'schnelle',\n", + " 'braune',\n", + " 'Fuchs',\n", + " 'kann',\n", + " 'keine',\n", + " '3,14',\n", + " 'Meter',\n", + " 'springen',\n", + " 'oder']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"list(words('Dies ist ein Beispielsatz. Oh, ja.'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['我', '很', '高', '興', '跟', '你', '見', '面']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('我很高興跟你見面')) # \"Pleased to meet you\" in Mandarin, Traditional writing" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['医', '者', 'を', '呼', 'ん', 'で', 'く', 'だ', 'さ', 'い']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('医者を呼んでください。'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word error rate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the word error rate, normalize again and compare sequences of words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def word_error_rate(reference, compared):\n", + " if isinstance(reference, str):\n", + " reference_seq = list(words_normalized(reference))\n", + " compared_seq = list(words_normalized(compared))\n", + " else:\n", + " reference_seq = list(reference)\n", + " compared_seq = list(compared)\n", + "\n", + " d = levenshtein(reference_seq, compared_seq)\n", + " if d == 0:\n", + " return 0\n", + "\n", + " n = len(reference_seq)\n", + " if n == 0:\n", + " return float('inf')\n", + "\n", + " return d / n\n", + "\n" + ] + } + ], + "source": [ + "from word_error_rate import word_error_rate\n", + "print(inspect.getsource(word_error_rate))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.25" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.75" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate('Fnord ist verdampfter Kräutertee!', 'Fnòrd ist verdmpfter Krautertee.')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.18823529411764706" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate(gt, tess)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a little larger than the ocrevalUAtion result!" 
+ ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb b/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb new file mode 100644 index 0000000..696fb4f --- /dev/null +++ b/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import unicodedata" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def list_characters(s):\n", + " \"\"\"List characters of string s, as seen by Python\"\"\"\n", + " for c in s:\n", + " print(c, end=' ')\n", + " if unicodedata.combining(c):\n", + " print(end=' ')\n", + " print(unicodedata.name(c))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing two Unicode strings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER 
Y\n", + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "n LATIN SMALL LETTER N\n", + "̃ COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "words = [unicodedata.normalize('NFC', 'Schlyñ'), unicodedata.normalize('NFD', 'Schlyñ')]\n", + "\n", + "for s in words:\n", + " list_characters(s)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These two strings are different:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "words[0] == words[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And yet they are the canonically equivalent:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unicodedata.normalize('NFC', words[0]) == unicodedata.normalize('NFC', words[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "→ Normalize to NFC (Normalization Form Composed) to compare. NFC is also composed, which is what we want. But it doesn't matter because we're not interested in the characters as Python sees them, but in grapheme clusters (see below.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Grapheme clusters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For evaluation we're interesting in what is perceived as \"characters\". 
But is \"ñ\" 1 character (LATIN SMALL LETTER N WITH TILDE) or 2 (LATIN SMALL LETTER N + COMBINING TILDE)?\n", + "\n", + "What we're probably want are [grapheme clusters](https://uniseg-python.readthedocs.io/en/latest/graphemecluster.html):" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['S', 'c', 'h', 'l', 'y', 'ñ']\n", + "['S', 'c', 'h', 'l', 'y', 'ñ']\n" + ] + } + ], + "source": [ + "from uniseg.graphemecluster import grapheme_clusters\n", + "\n", + "for w in words:\n", + " print(list(grapheme_clusters(w)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just looking at the interesting character – the last one - from both words:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "n LATIN SMALL LETTER N\n", + "̃ COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "for w in words:\n", + " list_characters(list(grapheme_clusters(w))[-1])\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "→ Work with grapheme clusters, not \"characters as Python sees them\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def unicode_name(c):\n", + " if 0xE000 <= ord(c) <= 0xF8FF:\n", + " return 'private use character 0x{:04X}'.format(ord(c))\n", + " else:\n", + " return unicodedata.name(c)\n", + " \n", + "\n", + "def list_grapheme_clusters(s):\n", + " \"\"\"List grapheme clusters of string s\"\"\"\n", + " for g in grapheme_clusters(s):\n", + " print(g, end=' ')\n", + " if len(g) > 1:\n", + " print('(multiple)', end=' ')\n", + " try:\n", + " print(', '.join(unicode_name(c) for c in g))\n", + " except ValueError:\n", + " print('ValueError')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "ñ (multiple) LATIN SMALL LETTER N, COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "for w in words:\n", + " list_grapheme_clusters(w)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "私 CJK UNIFIED IDEOGRAPH-79C1\n", + "は HIRAGANA LETTER HA\n", + "彼 CJK UNIFIED IDEOGRAPH-5F7C\n", + "女 CJK UNIFIED IDEOGRAPH-5973\n", + "が HIRAGANA LETTER GA\n", + "お HIRAGANA LETTER O\n", + "茶 CJK UNIFIED IDEOGRAPH-8336\n", + "を HIRAGANA LETTER WO\n", + "好 CJK UNIFIED IDEOGRAPH-597D\n", + "き HIRAGANA LETTER KI\n", + "な HIRAGANA LETTER NA\n", + "事 CJK UNIFIED IDEOGRAPH-4E8B\n", + "が HIRAGANA LETTER GA\n", + "分 CJK UNIFIED IDEOGRAPH-5206\n", + "か HIRAGANA LETTER KA\n", + "っ HIRAGANA LETTER SMALL TU\n", + "た HIRAGANA LETTER 
TA\n", + "。 IDEOGRAPHIC FULL STOP\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('私は彼女がお茶を好きな事が分かった。')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". FULL STOP\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "م ARABIC LETTER MEEM\n", + "ا ARABIC LETTER ALEF\n", + " SPACE\n", + "چ ARABIC LETTER TCHEH\n", + "ن ARABIC LETTER NOON\n", + "د ARABIC LETTER DAL\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "ا ARABIC LETTER ALEF\n", + " SPACE\n", + "ح ARABIC LETTER HAH\n", + "ر ARABIC LETTER REH\n", + "ف ARABIC LETTER FEH\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "و ARABIC LETTER WAW\n", + " SPACE\n", + "ف ARABIC LETTER FEH\n", + "ا ARABIC LETTER ALEF\n", + "ر ARABIC LETTER REH\n", + "س ARABIC LETTER SEEN\n", + "ی ARABIC LETTER FARSI YEH\n", + " SPACE\n", + "ه ARABIC LETTER HEH\n", + "س ARABIC LETTER SEEN\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ک ARABIC LETTER KEHEH\n", + "ه ARABIC LETTER HEH\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "و ARABIC LETTER WAW\n", + " SPACE\n", + "ع ARABIC LETTER AIN\n", + "ر ARABIC LETTER REH\n", + "ب ARABIC LETTER BEH\n", + "ی ARABIC LETTER FARSI YEH\n", + " SPACE\n", + "ن ARABIC LETTER NOON\n", + "ی ARABIC LETTER FARSI YEH\n", + "س ARABIC LETTER SEEN\n", + "ت ARABIC LETTER TEH\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('. اما چند تا حرف تو فارسی هست که تو عربی نیست')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". 
FULL STOP\n", + " SPACE\n", + "ل ARABIC LETTER LAM\n", + "ك ARABIC LETTER KAF\n", + "ن ARABIC LETTER NOON\n", + " SPACE\n", + "ك ARABIC LETTER KAF\n", + "م ARABIC LETTER MEEM\n", + " SPACE\n", + "ع ARABIC LETTER AIN\n", + "د ARABIC LETTER DAL\n", + "د ARABIC LETTER DAL\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ك ARABIC LETTER KAF\n", + "ل ARABIC LETTER LAM\n", + "م ARABIC LETTER MEEM\n", + "ا ARABIC LETTER ALEF\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ب ARABIC LETTER BEH\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ف ARABIC LETTER FEH\n", + "ا ARABIC LETTER ALEF\n", + "ر ARABIC LETTER REH\n", + "س ARABIC LETTER SEEN\n", + "ي ARABIC LETTER YEH\n", + "ة ARABIC LETTER TEH MARBUTA\n", + " SPACE\n", + "ه ARABIC LETTER HEH\n", + "ل ARABIC LETTER LAM\n", + " SPACE\n", + "أ ARABIC LETTER ALEF WITH HAMZA ABOVE\n", + "ن ARABIC LETTER NOON\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ب ARABIC LETTER BEH\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ل ARABIC LETTER LAM\n", + "غ ARABIC LETTER GHAIN\n", + "ة ARABIC LETTER TEH MARBUTA\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ع ARABIC LETTER AIN\n", + "ر ARABIC LETTER REH\n", + "ب ARABIC LETTER BEH\n", + "ي ARABIC LETTER YEH\n", + "ة ARABIC LETTER TEH MARBUTA\n", + "؟ ARABIC QUESTION MARK\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('. لكن كم عدد الكلمات بالفارسية هل أنت باللغة العربية؟')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "H LATIN CAPITAL LETTER H\n", + "e LATIN SMALL LETTER E\n", + "l LATIN SMALL LETTER L\n", + "l LATIN SMALL LETTER L\n", + "😀 GRINNING FACE\n", + " SPACE\n", + "W LATIN CAPITAL LETTER W\n", + "😀 GRINNING FACE\n", + "r LATIN SMALL LETTER R\n", + "l LATIN SMALL LETTER L\n", + "d LATIN SMALL LETTER D\n", + "! 
EXCLAMATION MARK\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Hell😀 W😀rld!')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅ (multiple) LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING DOUBLE ACUTE ACCENT, COMBINING TILDE, COMBINING GRAVE ACCENT, COMBINING LEFT ANGLE ABOVE, COMBINING MACRON, COMBINING COMMA ABOVE, COMBINING DOUBLE OVERLINE, COMBINING NOT TILDE ABOVE, COMBINING DOUBLE MACRON BELOW, COMBINING GRAVE TONE MARK, COMBINING DOUBLE BREVE BELOW, COMBINING LONG STROKE OVERLAY, COMBINING DOUBLE MACRON BELOW, COMBINING LEFT HALF RING BELOW, COMBINING X BELOW, COMBINING CARON BELOW, COMBINING DOWN TACK BELOW, COMBINING DOUBLE RING BELOW, COMBINING ASTERISK BELOW, COMBINING BRIDGE BELOW, COMBINING TILDE BELOW, COMBINING X BELOW, COMBINING INVERTED BREVE BELOW, COMBINING LOW LINE, COMBINING UP TACK BELOW, COMBINING CARON BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE LOW LINE, COMBINING SEAGULL BELOW, COMBINING EQUALS SIGN BELOW, COMBINING GREEK YPOGEGRAMMENI\n", + "ņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚ (multiple) LATIN SMALL LETTER N, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING OVERLINE, COMBINING GREEK DIALYTIKA TONOS, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER V, COMBINING LATIN SMALL LETTER A, COMBINING REVERSED COMMA ABOVE, COMBINING GRAVE ACCENT, COMBINING CARON, COMBINING GREEK PERISPOMENI, COMBINING MACRON, COMBINING BRIDGE ABOVE, COMBINING LEFT HALF RING ABOVE, COMBINING SHORT SOLIDUS OVERLAY, COMBINING CEDILLA, COMBINING LEFT ARROWHEAD BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING ACUTE ACCENT BELOW, COMBINING LEFT TACK BELOW, COMBINING MINUS SIGN BELOW, COMBINING COMMA BELOW, COMBINING COMMA BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING PLUS 
SIGN BELOW, COMBINING LEFT ANGLE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD BELOW, COMBINING CARON BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING RIGHT TACK BELOW, COMBINING LOW LINE, COMBINING LOW LINE\n", + "i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟ (multiple) LATIN SMALL LETTER I, COMBINING VERTICAL LINE ABOVE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER U, COMBINING GRAVE ACCENT, COMBINING LATIN SMALL LETTER I, COMBINING LATIN SMALL LETTER T, COMBINING BREVE, COMBINING LATIN SMALL LETTER A, COMBINING HOOK ABOVE, COMBINING RIGHT ARROWHEAD ABOVE, COMBINING BRIDGE ABOVE, COMBINING RING ABOVE, COMBINING HOMOTHETIC ABOVE, COMBINING ZIGZAG ABOVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING FERMATA, COMBINING TILDE OVERLAY, COMBINING RETROFLEX HOOK BELOW, COMBINING DOUBLE MACRON BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING DOUBLE LOW LINE, COMBINING DOT BELOW, COMBINING RIGHT TACK BELOW, COMBINING RIGHT ARROWHEAD BELOW\n", + "c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘ (multiple) LATIN SMALL LETTER C, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER V, COMBINING VERTICAL LINE ABOVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER H, COMBINING COMMA ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING CANDRABINDU, COMBINING GREEK DIALYTIKA TONOS, COMBINING OVERLINE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER E, COMBINING DOT ABOVE RIGHT, COMBINING TILDE BELOW, COMBINING PLUS SIGN BELOW, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LOW LINE, COMBINING EQUALS SIGN BELOW, COMBINING INVERTED BRIDGE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING SEAGULL BELOW, COMBINING COMMA BELOW\n", + "o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ (multiple) LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER A, COMBINING INVERTED BREVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER M, COMBINING DIAERESIS, 
COMBINING MACRON, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER I, COMBINING GREEK KORONIS, COMBINING DOUBLE MACRON BELOW, COMBINING TILDE OVERLAY, COMBINING GRAPHEME JOINER, COMBINING DOT ABOVE RIGHT, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING MINUS SIGN BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING DIAERESIS BELOW, COMBINING RING BELOW\n", + "ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕ (multiple) LATIN SMALL LETTER D, COMBINING LATIN SMALL LETTER U, COMBINING DOUBLE OVERLINE, COMBINING LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING DIAERESIS, COMBINING LEFT HALF RING ABOVE, COMBINING DOT ABOVE RIGHT, COMBINING COMMA ABOVE RIGHT, COMBINING HORN, COMBINING DOT BELOW, COMBINING RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING X BELOW, COMBINING BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING INVERTED BREVE BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING MACRON BELOW, COMBINING LEFT TACK BELOW, COMBINING ASTERISK BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW\n", + "e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞ (multiple) LATIN SMALL LETTER E, COMBINING HOMOTHETIC ABOVE, COMBINING TURNED COMMA ABOVE, COMBINING BREVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING DOUBLE GRAVE ACCENT, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER R, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING INVERTED BREVE, COMBINING DOT ABOVE, COMBINING VERTICAL TILDE, COMBINING BREVE, COMBINING GREEK KORONIS, COMBINING LATIN SMALL LETTER R, COMBINING REVERSED COMMA ABOVE, COMBINING CANDRABINDU, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER T, COMBINING ACUTE TONE MARK, COMBINING HORN, COMBINING DOUBLE MACRON, COMBINING INVERTED BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING TILDE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING GRAVE 
ACCENT BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE RING BELOW, COMBINING DOUBLE VERTICAL LINE BELOW\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z LATIN CAPITAL LETTER Z\n", + "e LATIN SMALL LETTER E\n", + "u LATIN SMALL LETTER U\n", + "g LATIN SMALL LETTER G\n", + "n LATIN SMALL LETTER N\n", + "uͤ (multiple) LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E\n", + "ß LATIN SMALL LETTER SHARP S\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Zeugnuͤß')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z LATIN CAPITAL LETTER Z\n", + "e LATIN SMALL LETTER E\n", + "u LATIN SMALL LETTER U\n", + "g LATIN SMALL LETTER G\n", + "n LATIN SMALL LETTER N\n", + " private use character 0xE72B\n", + "ß LATIN SMALL LETTER SHARP S\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Zeugnß')" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": 
true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py new file mode 100644 index 0000000..a65e03a --- /dev/null +++ b/qurator/dinglehopper/ocr_files.py @@ -0,0 +1,105 @@ +from __future__ import division, print_function + +from lxml import etree as ET +import sys + +from lxml.etree import XMLSyntaxError + + +def alto_namespace(tree): + """Return the ALTO namespace used in the given ElementTree. + + This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not + check if the files uses any valid ALTO namespace. + """ + root_name = ET.QName(tree.getroot().tag) + if root_name.localname == 'alto': + return root_name.namespace + else: + raise ValueError('Not an ALTO tree') + + +def alto_text(tree): + """Extract text from the given ALTO ElementTree.""" + + nsmap = {'alto': alto_namespace(tree)} + + lines = ( + ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) + for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) + text_ = '\n'.join(lines) + + return text_ + + +def page_namespace(tree): + """Return the PAGE content namespace used in the given ElementTree. + + This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We + do not check if the files uses any valid PAGE namespace. 
+ """ + root_name = ET.QName(tree.getroot().tag) + if root_name.localname == 'PcGts': + return root_name.namespace + else: + raise ValueError('Not a PAGE tree') + + +def page_text(tree): + """Extract text from the given PAGE content ElementTree.""" + + nsmap = {'page': page_namespace(tree)} + + def region_text(region): + try: + return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + except AttributeError: + return None + + region_texts = [] + reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) + if reading_order is not None: + for group in reading_order.iterfind('./*', namespaces=nsmap): + if ET.QName(group.tag).localname == 'OrderedGroup': + region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap) + for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: r.attrib['index']): + region_id = region_ref_indexed.attrib['regionRef'] + region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) + if region is not None: + region_texts.append(region_text(region)) + else: + raise ValueError('Invalid region id "%s" in file' % region_id) + else: + raise NotImplementedError + else: + for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): + region_texts.append(region_text(region)) + + # XXX Does a file have to have regions etc.? region vs lines etc. + # Filter empty region texts + region_texts = (t for t in region_texts if t) + + text_ = '\n'.join(region_texts) + + return text_ + + +def text(filename): + """Read the text from the given file. + + Supports PAGE, ALTO and falls back to plain text. 
+ """ + + try: + tree = ET.parse(filename) + except XMLSyntaxError: + with open(filename, 'r') as f: + return f.read() + try: + return page_text(tree) + except ValueError: + return alto_text(tree) + + +if __name__ == '__main__': + print(text(sys.argv[1])) diff --git a/qurator/dinglehopper/pytest.ini b/qurator/dinglehopper/pytest.ini new file mode 100644 index 0000000..c56273f --- /dev/null +++ b/qurator/dinglehopper/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: integration tests + serial diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py new file mode 100644 index 0000000..9d5daa9 --- /dev/null +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -0,0 +1,32 @@ +def substitute_equivalences(s): + + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ſſ', + "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I + '': 'ä', + '': 'ch', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ck', + '': 'll', + '': 'ö', + '': 'ſi', + '': 'ſt', + 'fi': 'fi', + 'ff': 'ff', + 'fl': 'fl', + 'ffi': 'ffi', + '': 'ct', + '’': '\'', + '⸗': '-', + '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E + } + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 new file mode 100644 index 0000000..5e56c73 --- /dev/null +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -0,0 +1,61 @@ + + + + + + + + + + + + + +
+ +{{ gt }}
+{{ ocr }} + + +

Metrics

+

CER: {{ cer|round(4) }}

+

WER: {{ wer|round(4) }}

+ + +

Character differences

+{{ char_diff_report }} + +

Word differences

+{{ word_diff_report }} + + +
+ + + + + + + + + + + + diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js new file mode 100644 index 0000000..ac43676 --- /dev/null +++ b/qurator/dinglehopper/templates/report.html.js @@ -0,0 +1,14 @@ +function find_diff_class(classes) { + return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); +} + +$(document).ready(function() { + $('.diff').mouseover(function() { + let c = find_diff_class($(this).attr('class')) + $('.' + c).addClass('diff-highlight') + }); + $('.diff').mouseout(function() { + let c = find_diff_class($(this).attr('class')) + $('.' + c).removeClass('diff-highlight') + }); +}); diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 new file mode 100644 index 0000000..8619cd8 --- /dev/null +++ b/qurator/dinglehopper/templates/report.json.j2 @@ -0,0 +1,6 @@ +{ + "gt": "{{ gt }}", + "ocr": "{{ ocr }}", + "cer": {{ cer|round(6) }}, + "wer": {{ wer|round(6) }} +} diff --git a/qurator/dinglehopper/tests/__init__.py b/qurator/dinglehopper/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/qurator/dinglehopper/tests/data/00000119.tif b/qurator/dinglehopper/tests/data/00000119.tif new file mode 100644 index 0000000..b831bd0 Binary files /dev/null and b/qurator/dinglehopper/tests/data/00000119.tif differ diff --git a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml new file mode 100644 index 0000000..2e57619 --- /dev/null +++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml @@ -0,0 +1,5610 @@ + + + + doculibtopagexml + 2019-01-08T01:56:06 + 2019-04-11T08:41:58 + + + + + + + + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + ſ + + + + + + + + e + + + + i + + + + n + + + + e + + + + n + + ſeinen + + + + + + u + + + + n + + + + s + + uns + + + + + + b + + + + a + + + + l + + + + d + 
+ bald + + + + + + k + + + + l + + + + e + + + + i + + + + n + + + + e + + kleine + + + + + + H + + + + + + + + g + + + + e + + + + l + + + + , + + Hgel, + + + + + + b + + + + a + + + + l + + + + d + + bald + + + + + + H + + + + + + + + t + + + + t + + + + e + + + + n + + + + , + + Htten, + + + + + + Z + + + + e + + + + l + + + + t + + + + e + + + + n + + Zelten + + + + + + u + + + + n + + + + d + + und + + + + + + b + + + + a + + + + l + + + + d + + bald + + Die ſeinen uns bald kleine Hgel, bald Htten, Zelten und bald + + + + + + + + W + + + + e + + + + + + + + e + + + + n + + Ween + + Ween + + + + + + + + D + + + + e + + + + n + + Den + + + + + + B + + + + l + + + + i + + + + + + + + e + + + + n + + + + , + + Blien, + + + + + + w + + + + e + + + + l + + + + + + + + e + + wele + + + + + + + + + + e + + e + + + + + + d + + + + u + + + + r + + + + + + + + l + + + + a + + + + u + + + + f + + + + e + + + + n + + + + , + + durlaufen, + + + + + + v + + + + o + + + + n + + von + + + + + + w + + + + t + + + + e + + + + n + + + + e + + + + i + + weiten + + + + + + + + + + f + + + + t + + + + e + + + + r + + + + s + + fters + + + + + + v + + + + o + + + + r + + + + z + + + + u + + + + + + + + e + + + + + + + + e + + + + n + + + + . + + vorzueen. + + Den Blien, wele e durlaufen, von weiten fters vorzueen. 
+ + + + + + + + S + + + + i + + + + e + + + + h + + + + t + + Sieht + + + + + + m + + + + a + + + + n + + man + + + + + + e + + + + i + + + + n + + ein + + + + + + ſ + + + + o + + + + l + + + + + + ſol + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + s + + gemhtes + + + + + + F + + + + e + + + + l + + + + d + + + + , + + Feld, + + + + + + v + + + + o + + + + n + + von + + + + + + o + + + + b + + + + e + + + + n + + + + , + + oben, + + Sieht man ein ſol gemhtes Feld, von oben, + + + + + + + + S + + + + o + + So + + + + + + g + + + + l + + + + e + + + + i + + + + + + + + t + + gleit + + + + + + e + + + + s + + es + + + + + + e + + + + e + + + + m + + + + i + + + + n + + einem + + + + + + w + + + + e + + + + i + + + + t + + + + e + + + + n + + weiten + + + + + + M + + + + e + + + + e + + + + r + + + + , + + Meer, + + + + + + w + + + + o + + + + r + + + + a + + + + u + + + + f + + worauf + + + + + + e + + + + r + + + + h + + + + a + + + + b + + + + n + + + + e + + erhabne + + + + + + W + + + + e + + + + + + + + e + + + + n + + Ween + + + + + + t + + + + o + + + + b + + + + e + + + + n + + + + , + + toben, + + So gleit es einem weiten Meer, worauf erhabne Ween toben, + + + + + + + + J + + + + e + + + + d + + + + o + + + + + + Jedo + + + + + + m + + + + i + + + + t + + mit + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + m + + dieſem + + + + + + U + + + + n + + + + t + + + + e + + + + r + + + + ſ + + + + + + + + e + + + + i + + + + d + + + + , + + Unterſeid, + + + + + + d + + + + a + + + + ß + + + + , + + daß, + + + + + + d + + + + a + + da + + + + + + + + + + + +  + + + + + + d + + + + i + + + + e + + die + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + r + + + + + + + + h + + + + r + + + + e + + + + n + + + + : + + rhren: + + Jedo mit dieſem Unterſeid, daß, da  die bendig rhren: + + + + + + + + V + + + + o + + + + n + + Von + + + + + + e + + + + i + + 
+ + n + + + + i + + + + g + + + + e + + + + r + + einiger + + + + + + B + + + + e + + + + w + + + + e + + + + g + + + + u + + + + n + + + + g + + Bewegung + + + + + + h + + + + i + + + + e + + + + r + + + + , + + hier, + + + + + + i + + + + n + + in + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + n + + dieſen + + + + + + W + + + + e + + + + + + + + e + + + + n + + + + , + + Ween, + + + + + + n + + + + i + + + + + + + + t + + + + s + + nits + + + + + + z + + + + u + + zu + + + + + + ſ + + + + p + + + + + + + + h + + + + r + + + + e + + + + n + + + + . + + ſphren. + + Von einiger Bewegung hier, in dieſen Ween, nits zu ſphren. + + Die ſeinen uns bald kleine Hgel, bald Htten, Zelten und bald +Ween +Den Blien, wele e durlaufen, von weiten fters vorzueen. +Sieht man ein ſol gemhtes Feld, von oben, +So gleit es einem weiten Meer, worauf erhabne Ween toben, +Jedo mit dieſem Unterſeid, daß, da  die bendig rhren: +Von einiger Bewegung hier, in dieſen Ween, nits zu ſphren. + + + + + + + + + + + D + + + + a + + Da + + + + + + C + + + + a + + + + p + + + + o + + + + . + + Capo. + + Da Capo. + + Da Capo. 
+ + + + + + + + + + + G + + + + e + + + + h + + + + t + + Geht + + + + + + m + + + + a + + + + n + + man + + + + + + a + + + + u + + + + f + + auf + + + + + + e + + + + i + + + + n + + + + e + + + + n + + einen + + + + + + ſ + + + + o + + + + l + + + + + + + + e + + + + n + + ſolen + + + + + + F + + + + e + + + + l + + + + d + + + + e + + + + , + + Felde, + + + + + + ſ + + + + o + + ſo + + + + + + e + + + + b + + + + e + + + + n + + eben + + + + + + e + + + + r + + + + + + er + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + , + + gemht, + + + + + + ſ + + + + p + + + + a + + + + + + + + i + + + + e + + + + r + + + + e + + + + n + + + + , + + ſpaieren, + + Geht man auf einen ſolen Felde, ſo eben er gemht, ſpaieren, + + + + + + + + D + + + + a + + + + s + + Das + + + + + + m + + + + a + + + + n + + man + + + + + + g + + + + e + + + + w + + + + o + + + + h + + + + n + + + + t + + gewohnt + + + + + + v + + + + o + + + + + + vo + + + + + + K + + + + o + + + + r + + + + n + + Korn + + + + + + z + + + + u + + zu + + + + + + ſ + + + + e + + + + h + + + + n + + + + ; + + ſehn; + + + + + + ſ + + + + o + + ſo + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + kommen + + + + + + w + + + + i + + + + r + + wir + + + + + + u + + + + n + + + + s + + uns + + + + + + g + + + + r + + + + + + + + + + + + e + + + + r + + grer + + + + + + f + + + + + + + + r + + + + , + + fr, + + Das man gewohnt vo Korn zu ſehn; ſo kommen wir uns grer fr, + + + + + + + + D + + + + a + + + + s + + Das + + + + + + F + + + + e + + + + l + + + + d + + Feld + + + + + + h + + + + i + + + + n + + + + g + + + + e + + + + g + + + + e + + + + n + + hingegen + + + + + + n + + + + i + + + + e + + + + d + + + + r + + + + i + + + + g + + + + e + + + + r + + + + . + + niedriger. 
+ + + + + + A + + + + u + + + + + + Au + + + + + + n + + + + i + + + + m + + + + m + + + + t + + nimmt + + + + + + ſ + + + + o + + + + d + + + + e + + + + n + + + + n + + ſodenn + + + + + + e + + + + i + + + + n + + ein + + + + + + n + + + + e + + + + u + + + + e + + + + r + + neuer + + + + + + S + + + + + + + + e + + + + i + + + + n + + + + , + + Sein, + + Das Feld hingegen niedriger. Au nimmt ſodenn ein neuer Sein, + + + + + + + + U + + + + n + + + + d + + Und + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + n + + + + e + + + + u + + + + e + + neue + + + + + + F + + + + a + + + + r + + + + b + + + + e + + + + n + + Farben + + + + + + Z + + + + i + + + + e + + + + r + + Zier + + Und eine neue Farben Zier + + + + + + + + D + + + + e + + + + n + + Den + + + + + + e + + + + r + + + + + + er + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + n + + gemhten + + + + + + A + + + + + + + + e + + + + r + + Aer + + + + + + e + + + + i + + + + n + + + + . + + ein. + + Den er gemhten Aer ein. 
+ + + + + + + + D + + + + e + + + + r + + Der + + + + + + G + + + + r + + + + u + + + + n + + + + d + + Grund + + + + + + i + + + + + + i + + + + + + g + + + + r + + + + + + + + n + + + + , + + grn, + + + + + + d + + + + i + + + + e + + die + + + + + + S + + + + t + + + + o + + + + p + + + + p + + + + e + + + + l + + + + n + + Stoppeln + + + + + + g + + + + e + + + + l + + + + b + + + + , + + gelb, + + + + + + u + + + + n + + + + d + + und + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + + + + + + + + + +  + + + + + + u + + + + n + + + + ſ + + + + r + + + + e + + + + r + + unſrer + + + + + + S + + + + o + + + + n + + + + + + Son⸗ + + Der Grund i grn, die Stoppeln gelb, und wenn  unſrer Son⸗ + + + + + + + + n + + + + e + + + + n + + nen + + + + + + L + + + + i + + + + + + + + t + + Lit + + nen Lit + + + + + + + + A + + + + n + + An + + + + + + i + + + + h + + + + r + + + + e + + ihre + + + + + + r + + + + u + + + + n + + + + d + + + + e + + runde + + + + + + g + + + + l + + + + a + + + + t + + + + t + + + + e + + glatte + + + + + + R + + + + + + + + h + + + + r + + + + e + + + + n + + + + , + + Rhren, + + + + + + z + + + + u + + + + m + + + + a + + + + h + + + + l + + + + e + + + + n + + zumahlen + + + + + + f + + + + r + + + + + + + + h + + frh + + + + + + u + + + + n + + + + d + + und + + + + + + A + + + + b + + + + e + + + + n + + + + d + + + + s + + + + , + + Abends, + + + + + + b + + + + r + + + + i + + + + + + + + t + + + + : + + brit: + + An ihre runde glatte Rhren, zumahlen frh und Abends, brit: + + + + + + + + S + + + + o + + So + + + + + + k + + + + a + + + + n + + + + n + + kann + + + + + + e + + + + i + + + + n + + ein + + + + + + G + + + + o + + + + l + + + + d + + Gold + + + + + + k + + + + a + + + + u + + + + m + + kaum + + + + + + + + + + + + + + r + + + + + + + + e + + + + r + + rer + + + + + + g + + + + l + + + + + + + + n + + + + + + + + e + + + + n + + + + . + + glnen. 
+ + + + + + D + + + + i + + + + e + + + + s + + Dies + + + + + + m + + + + a + + + + + + + + t + + mat + + + + + + e + + + + i + + + + n + + ein + + + + + + l + + + + i + + + + e + + + + b + + + + l + + + + i + + + + + + + + e + + + + s + + lieblies + + So kann ein Gold kaum rer glnen. Dies mat ein lieblies + + + + + + + + G + + + + e + + + + m + + + + i + + + + ſ + + + + + + + + e + + + + , + + Gemiſe, + + Gemiſe, + + + + + + + + Z + + + + u + + + + m + + + + a + + + + h + + + + l + + Zumahl + + + + + + w + + + + e + + + + n + + + + n + + + + , + + wenn, + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + r + + der + + + + + + N + + + + a + + + + + + + + b + + + + a + + + + r + + + + ſ + + + + + + + + a + + + + f + + + + t + + + + , + + Nabarſaft, + + + + + + e + + + + i + + + + n + + ein + + + + + + d + + + + u + + + + n + + + + + + + + e + + + + l + + + + + + + + g + + + + r + + + + + + + + n + + + + e + + + + n + + + + d + + + + e + + + + s + + dunel⸗grnendes + + + + + + G + + + + e + + + + b + + + + + + + + e + + + + , + + + + ſ + + + + + + Gebſe, + + Zumahl wenn, in der Nabarſaft, ein dunel⸗grnendes Gebſe, + + + + + + + + D + + + + e + + + + n + + Den + + + + + + g + + + + e + + + + l + + + + b + + + + e + + + + n + + gelben + + + + + + S + + + + + + + + i + + + + m + + + + m + + + + e + + + + r + + Simmer + + + + + + n + + + + o + + + + + + no + + + + + + e + + + + r + + + + h + + + + + + + + h + + + + t + + + + . + + erhht. + + + + + + W + + + + i + + + + e + + Wie + + + + + + i + + + + + + i + + + + + + n + + + + u + + + + n + + nun + + + + + + j + + + + + + + + n + + + + g + + + + + + + + , + + jng, + + + + + + z + + + + u + + + + r + + zur + + + + + + A + + + + b + + + + e + + + + n + + + + d + + Abend + + + + + + Z + + + + e + + + + i + + + + t + + + + , + + Zeit, + + Den gelben Simmer no erhht. 
Wie i nun jng, zur Abend Zeit, + + + + + + + + D + + + + u + + + + r + + + + + + Dur + + + + + + ſ + + + + o + + ſo + + + + + + v + + + + i + + + + e + + + + l + + viel + + + + + + ſ + + + + + + + + w + + + + e + + + + r + + + + e + + ſwere + + + + + + S + + + + e + + + + e + + + + g + + + + e + + + + n + + + + s + + + + + + + + B + + + + e + + + + r + + + + g + + + + e + + + + , + + Seegens⸗Berge, + + + + + + m + + + + i + + + + t + + mit + + + + + + ſ + + + + a + + + + n + + + + f + + + + t + + + + e + + + + n + + ſanften + + + + + + S + + + + + + + + r + + + + i + + + + t + + + + t + + + + e + + + + n + + + + , + + Sritten, + + + + + + h + + + + i + + + + n + + hin + + + + + + u + + + + n + + + + d + + und + + Dur ſo viel ſwere Seegens⸗Berge, mit ſanften Sritten, hin und + + + + + + + + w + + + + d + + + + , + + + + i + + + + e + + + + e + + + + r + + wieder, + + wieder, + + + + + + + + G + + + + e + + + + r + + + + + + + + h + + + + r + + + + e + + + + t + + Gerhret + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + d + + + + e + + + + s + + des + + + + + + F + + + + e + + + + l + + + + d + + + + e + + + + s + + Feldes + + + + + + S + + + + + + + + m + + + + u + + + + + + + + , + + Smu, + + + + + + g + + + + e + + + + r + + + + + + + + h + + + + r + + + + e + + + + t + + gerhret + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + d + + + + i + + + + e + + die + + + + + + F + + + + r + + + + u + + + + + + + + t + + + + b + + + + a + + + + r + + + + k + + + + e + + + + i + + + + t + + + + , + + Frutbarkeit, + + Gerhret dur des Feldes Smu, gerhret dur die Frutbarkeit, + + + + + + + + V + + + + e + + + + r + + + + g + + + + n + + + + + + + + g + + + + t + + Vergngt + + + + + + a + + + + u + + + + f + + auf + + + + + + m + + + + e + + + + i + + + + n + + + + e + + + + m + + meinem + + + + + + A + + + + + + + + e + + + + r + + Aer + + + + + + g + + + + i + + + + e + + + + n + + + + g + + + + , + + gieng, + + + + + + e + + 
+ + r + + + + t + + + + + + + + n + + + + t + + + + e + + + + n + + ertnten + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + dieſe + + + + + + m + + + + e + + + + e + + + + i + + + + n + + meine + + + + + + L + + + + i + + + + e + + + + d + + + + e + + + + r + + + + : + + Lieder: + + Vergngt auf meinem Aer gieng, ertnten dieſe meine Lieder: + + Geht man auf einen ſolen Felde, ſo eben er gemht, ſpaieren, +Das man gewohnt vo Korn zu ſehn; ſo kommen wir uns grer fr, +Das Feld hingegen niedriger. Au nimmt ſodenn ein neuer Sein, +Und eine neue Farben Zier +Den er gemhten Aer ein. +Der Grund i grn, die Stoppeln gelb, und wenn  unſrer Son⸗ +nen Lit +An ihre runde glatte Rhren, zumahlen frh und Abends, brit: +So kann ein Gold kaum rer glnen. Dies mat ein lieblies +Gemiſe, +Zumahl wenn, in der Nabarſaft, ein dunel⸗grnendes Gebſe, +Den gelben Simmer no erhht. Wie i nun jng, zur Abend Zeit, +Dur ſo viel ſwere Seegens⸗Berge, mit ſanften Sritten, hin und +wieder, +Gerhret dur des Feldes Smu, gerhret dur die Frutbarkeit, +Vergngt auf meinem Aer gieng, ertnten dieſe meine Lieder: + + + + + + + + + + + + + 1 + + + + 1 + + + + 5 + + 115 + + 115 + + 115 + + + + + + + + + + H + + H + + + + + + 2 + + 2 + + H 2 + + H 2 + + + + + + + + + + + . + + + + A + + + + R + + + + A + + + + I + + ARIA. + + ARIA. + + ARIA. + + + + + + + + + + + A + + + + R + + + + I + + + + A + + + + . + + ARIA. + + ARIA. + + ARIA. 
+ + + + + + + + + + + W + + + + a + + + + s + + Was + + + + + + e + + + + r + + + + h + + + + e + + + + b + + + + t + + erhebt + + + + + + d + + + + e + + + + s + + des + + + + + + S + + + + + + + + + + + + p + + + + f + + + + e + + + + r + + + + s + + Spfers + + + + + + G + + + + + + + + t + + + + e + + Gte + + Was erhebt des Spfers Gte + + + + + + + + M + + + + e + + + + h + + + + r + + + + , + + Mehr, + + + + + + a + + + + l + + + + s + + als + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + s + + dieſes + + + + + + S + + + + e + + + + e + + + + g + + + + e + + + + n + + + + s + + Seegens + + + + + + M + + + + e + + + + e + + + + r + + + + ? + + Meer? + + Mehr, als dieſes Seegens Meer? + + + + + + + + K + + + + o + + + + m + + + + m + + + + t + + Kommt + + + + + + d + + + + i + + + + e + + + + s + + dies + + + + + + w + + + + o + + + + h + + + + l + + wohl + + + + + + v + + + + o + + + + n + + von + + + + + + u + + + + n + + + + g + + + + e + + + + f + + + + e + + + + h + + + + r + + + + ? + + ungefehr? + + Kommt dies wohl von ungefehr? 
+ + + + + + + + N + + + + e + + + + i + + + + n + + + + , + + Nein, + + + + + + r + + + + u + + + + f + + + + t + + ruft + + + + + + m + + + + e + + + + i + + + + n + + mein + + + + + + e + + + + r + + + + f + + + + r + + + + e + + + + u + + + + t + + erfreut + + + + + + G + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + : + + Gemhte: + + Nein, ruft mein erfreut Gemhte: + + + + + + + + N + + + + u + + + + r + + Nur + + + + + + v + + + + o + + + + n + + von + + + + + + G + + + + O + + + + T + + + + T + + GOTT + + + + + + k + + + + o + + + + m + + + + m + + + + t + + kommt + + + + + + a + + + + + + + + e + + + + s + + aes + + + + + + h + + + + e + + + + r + + + + ; + + her; + + Nur von GOTT kommt aes her; + + + + + + + + I + + + + h + + + + m + + Ihm + + + + + + ſ + + + + e + + + + y + + ſey + + + + + + e + + + + i + + + + ß + + + + P + + + + r + + Preiß + + + + + + u + + + + n + + + + d + + und + + + + + + D + + + + a + + + + n + + + + + + Dan + + + + + + u + + + + n + + + + d + + und + + + + + + E + + + + h + + + + r + + + + ! + + Ehr! + + Ihm ſey Preiß und Dan und Ehr! + + Was erhebt des Spfers Gte +Mehr, als dieſes Seegens Meer? +Kommt dies wohl von ungefehr? +Nein, ruft mein erfreut Gemhte: +Nur von GOTT kommt aes her; +Ihm ſey Preiß und Dan und Ehr! 
+ diff --git a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml new file mode 100644 index 0000000..b60d0f7 --- /dev/null +++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml @@ -0,0 +1,289 @@ + + + + OCR-D/core 1.0.0b11 + 2019-08-01T15:03:17.741679 + 2019-08-01T15:03:17.741679 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald + + + + + + „Bellen + + + + + + Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen, + + + + + + Sieht man ein ſolch gemähtes Feld - von oben, + + + + + + Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny + + + + + + Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren: + + + + + + Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren, + + + + Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald +„Bellen +Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen, +Sieht man ein ſolch gemähtes Feld - von oben, +Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny +Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren: +Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren, + + + + + + + + + + + + + + Was erhebt des Schöpfers Güte + + + + + + Mehr , als dieſes Seegens Meer? + + + + + + Kommt dies wohl von ungefehv? + + + + + + Nein , rüſt mein erfreut Gemühte + + + + + + Nur von GOTT komint alles hers + + + + + + Ihm ſey Preiß und Dan und Ehr! + + + + Was erhebt des Schöpfers Güte +Mehr , als dieſes Seegens Meer? +Kommt dies wohl von ungefehv? +Nein , rüſt mein erfreut Gemühte +Nur von GOTT komint alles hers +Ihm ſey Preiß und Dan und Ehr! 
+ + + + + + + + Da Capo, + + + + Da Capo, + + + + + + + + Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny + + + + + + Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für, + + + + + + Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz + + + + + + Und eine neue Farben Zier + + + + + + Den erſt gemähten Aker ein, + + + + + + Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son- + + + + + + nen B;Of + + + + + + Un ihre runde glatte Röhren , zumahlen früh und Abends bricht; + + + + + + So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches + + + + + + Gemiſche, | + + + + + + Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche + + + + + + Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif, + + + + + + Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und + + + + + + Wieder; + + + + + + Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz + + + + + + Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder: + + + + Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny +Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für, +Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz +Und eine neue Farben Zier +Den erſt gemähten Aker ein, +Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son- +nen B;Of +Un ihre runde glatte Röhren , zumahlen früh und Abends bricht; +So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches +Gemiſche, | +Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche +Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif, +Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und +Wieder; +Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz +Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder: + + + + + + + + 5) 2 + + + + + + ARIA. + + + + 5) 2 +ARIA. 
+ + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml new file mode 100644 index 0000000..c28161b --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml @@ -0,0 +1,47 @@ + + + + + 2019-07-26T13:59:00 + 2019-07-26T14:00:29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt +ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo +dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit +amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor +invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et +justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum +dolor sit amet. 
diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml new file mode 100644 index 0000000..1fd8377 --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml @@ -0,0 +1,139 @@ + + + + pixel + + + + + + + tesseract 4.1.0-rc4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf new file mode 100644 index 0000000..da97e0e Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif new file mode 100644 index 0000000..42b3d23 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml new file mode 100644 index 0000000..c28161b --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml @@ -0,0 +1,47 @@ + + + + + 2019-07-26T13:59:00 + 2019-07-26T14:00:29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt +ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo +dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit +amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor +invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et +justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum +dolor sit amet. diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml new file mode 100644 index 0000000..d4a79a0 --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml @@ -0,0 +1,138 @@ + + + + pixel + + + + + + + tesseract 4.1.0-rc4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf new file mode 100644 index 0000000..38564d7 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif new file mode 100644 index 0000000..39f11d6 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt new file mode 100644 index 0000000..ce93bfd Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt differ diff --git a/qurator/dinglehopper/tests/data/order.page.xml b/qurator/dinglehopper/tests/data/order.page.xml 
new file mode 100644 index 0000000..a1e058f --- /dev/null +++ b/qurator/dinglehopper/tests/data/order.page.xml @@ -0,0 +1,4204 @@ + + + + doculibtopagexml + 2018-11-20T05:00:14 + 2019-04-17T10:47:36 + + + + + + + + + + + + + + + + + + + + 7 + + + + 5 + + + + . + + 75. + + 75. + + + + + + + + E + + + + t + + + + w + + + + a + + + + s + + Etwas + + + + + + f + + + + r + + + + + + fr + + + + + + W + + + + i + + + + t + + + + t + + + + w + + + + e + + + + n + + + + . + + Wittwen. + + Etwas fr Wittwen. + + 75. +Etwas fr Wittwen. + + + + + + + + + + + 7 + + + + 6 + + + + . + + 76. + + + + + + D + + + + i + + + + e + + Die + + 76. Die + + 76. Die + + + + + + + + + + + m + + + + e + + + + n + + + + . + + men. + + + + + + D + + + + e + + + + n + + Den + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + n + + andern + + + + + + T + + + + a + + + + g + + Tag + + + + + + s + + + + + + + + i + + + + e + + + + n + + + + e + + + + r + + ersien + + + + + + d + + + + e + + + + r + + der + + + + + + e + + + + i + + + + n + + + + g + + + + e + + + + l + + + + a + + + + d + + + + e + + + + n + + + + e + + eingeladene + + men. 
Den andern Tag ersien der eingeladene + + + + + + + + b + + + + i + + + + s + + bis + + + + + + d + + + + r + + + + e + + + + y + + drey + + + + + + T + + + + a + + + + g + + + + e + + Tage + + + + + + h + + + + i + + + + n + + + + t + + + + e + + + + r + + + + e + + + + i + + + + n + + + + a + + + + n + + + + d + + + + e + + + + r + + hintereinander + + + + + + j + + + + e + + + + d + + + + e + + + + s + + + + m + + + + a + + + + l + + jedesmal + + + + + + z + + + + u + + + + m + + zum + + + + + + M + + + + i + + + + t + + + + + + Mit⸗ + + bis drey Tage hintereinander jedesmal zum Mit⸗ + + + + + + + + G + + + + a + + + + + + Ga + + + + + + m + + + + i + + + + t + + mit + + + + + + d + + + + e + + + + n + + den + + + + + + S + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + + + n + + + + , + + Seinigen, + + + + + + u + + + + n + + + + d + + und + + + + + + k + + + + a + + + + m + + kam + + + + + + n + + + + a + + + + + + + + h + + + + e + + + + r + + naher + + + + + + z + + + + w + + + + e + + + + y + + zwey + + Ga mit den Seinigen, und kam naher zwey + + + + + + + + t + + + + a + + + + g + + + + s + + + + e + + + + + + + + e + + + + n + + + + . + + tagseen. + + + + + + D + + + + e + + + + r + + Der + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + e + + andere + + + + + + h + + + + i + + + + e + + + + r + + + + + + + + b + + + + e + + + + r + + + + , + + hierber, + + + + + + u + + + + n + + + + d + + und + + + + + + w + + + + u + + + + n + + + + d + + + + e + + + + r + + + + t + + + + e + + wunderte + + + + + + + + + + + +  + + tagseen. 
Der andere wunderte  hierber, und + + + + + + + + H + + + + e + + + + r + + + + r + + Herr + + + + + + K + + + + o + + + + n + + + + f + + + + r + + + + a + + + + t + + + + e + + + + r + + Konfrater + + + + + + w + + + + i + + + + r + + + + d + + + + , + + wird, + + + + + + n + + + + e + + + + b + + + + + + neb + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + l + + + + i + + + + e + + + + b + + + + e + + + + n + + lieben + + + + + + F + + + + a + + + + m + + + + i + + + + l + + + + i + + + + e + + + + , + + Familie, + + Herr Konfrater wird, neb ſeiner lieben Familie, + + + + + + + + H + + + + e + + + + r + + + + r + + + + n + + Herrn + + + + + + K + + + + o + + + + n + + + + f + + + + r + + + + a + + + + t + + + + e + + + + r + + + + , + + Konfrater, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + s + + es + + + + + + i + + + + h + + + + m + + ihm + + + + + + z + + + + w + + + + a + + + + r + + zwar + + + + + + ſ + + + + a + + + + g + + + + e + + + + t + + + + e + + ſagete + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + m + + ſeinem + + ſagete ſeinem Herrn Konfrater, daß es ihm zwar + + + + + + + + v + + + + o + + + + m + + vom + + + + + + H + + + + e + + + + r + + + + z + + + + e + + + + n + + Herzen + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + + + + e + + + + r + + er + + + + + + i + + + + h + + + + n + + ihn + + + + + + b + + + + e + + + + y + + bey + + + + + + + + + + + +  + + + + + + a + + + + n + + + + g + + + + e + + + + n + + + + e + + + + h + + + + m + + angenehm + + + + + + ſ + + + + y + + + + e + + + + , + + ſey, + + vom Herzen angenehm ſey, wenn er ihn bey  + + + + + + + + ſ + + + + o + + ſo + + + + + + o + + + + f + + + + t + + + + e + + ofte + + + + + + h + + + + a + + + + b + + + + e + + + + . + + habe. 
+ + + + + + D + + + + e + + + + r + + Der + + + + + + G + + + + a + + + + + + Ga + + + + + + a + + + + n + + + + t + + + + w + + + + o + + + + r + + + + t + + + + e + + + + t + + + + e + + + + , + + antwortete, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + ſo ofte habe. Der Ga antwortete, daß er + + + + + + + + e + + + + s + + es + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + , + + komme, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + + + + + j + + + + e + + + + + + + + t + + jet + + + + + + d + + + + i + + + + e + + die + + + + + + E + + + + h + + + + r + + + + e + + Ehre + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + s + + ſeines + + + + + + Z + + + + u + + + + ſ + + + + p + + + + r + + + + u + + + + + + + + s + + Zuſprus + + es komme, daß er jet die Ehre ſeines Zuſprus + + + + + + + + b + + + + e + + + + i + + + + r + + + + t + + + + h + + + + e + + + + n + + + + w + + bewirthen + + + + + + e + + + + r + + er + + + + + + i + + + + n + + + + z + + + + w + + + + i + + + + ſ + + + + + + + + e + + + + n + + inzwiſen + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + o + + + + h + + + + e + + + + r + + + + w + + woher + + + + + + w + + + + i + + + + + + + + e + + wie + + + + + + k + + + + + + + + n + + + + n + + + + e + + + + ; + + knne; + + bewirthen knne; er wie inzwiſen nit, woher + + + + + + + + t + + + + + + + + g + + + + l + + + + i + + + + + + tgli + + + + + + a + + + + n + + an + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + T + + + + h + + + + + + + + r + + + + e + + Thre + + + + + + a + + + + n + + + + g + + + + e + + + + ſ + + + + + + + + r + + + + i + + + + e + + + + b + + + + e + + + + n + + angeſrieben + + + + + + + + + + n + + + + d + + + + e + + + + , + + finde, + + + + + + m + + + + o + + + + r + + + + + + mor⸗ + + tgli an ſeiner Thre angeſrieben finde, mor⸗ + + + + + + + + g + + 
+ + e + + + + n + + gen + + + + + + z + + + + u + + zu + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + . + + kommen. + + gen zu kommen. + + + + + + + + g + + + + e + + + + b + + + + e + + + + t + + + + e + + + + n + + + + , + + gebeten, + + + + + + m + + + + o + + + + r + + + + g + + + + e + + + + n + + morgen + + + + + + z + + + + u + + zu + + + + + + k + + + + o + + + + + + + + m + + kom⸗ + + + + + + z + + + + u + + zu + + + + + + m + + + + i + + + + r + + mir + + + + + + z + + + + u + + + + m + + zum + + + + + + M + + + + i + + + + a + + + + g + + + + s + + + + e + + + + + + + + e + + + + n + + + + t + + + + t + + Mittagseen + + gebeten, morgen zum Mittagseen zu mir zu kom⸗ + + Herr Konfrater wird, neb ſeiner lieben Familie, +gebeten, morgen zum Mittagseen zu mir zu kom⸗ +men. Den andern Tag ersien der eingeladene +Ga mit den Seinigen, und kam naher zwey +bis drey Tage hintereinander jedesmal zum Mit⸗ +tagseen. Der andere wunderte  hierber, und +ſagete ſeinem Herrn Konfrater, daß es ihm zwar +vom Herzen angenehm ſey, wenn er ihn bey  +bewirthen knne; er wie inzwiſen nit, woher +es komme, daß er jet die Ehre ſeines Zuſprus +ſo ofte habe. Der Ga antwortete, daß er +tgli an ſeiner Thre angeſrieben finde, mor⸗ +gen zu kommen. + + + + + + + + + + + + + 7 + + + + 9 + + 79 + + 79 + + 79 + + + + + + + + + + + H + + + + a + + + + n + + + + d + + + + , + + Hand, + + + + + + M + + + + y + + + + l + + + + o + + + + r + + + + d + + + + ? + + Mylord? + + + + + + f + + + + r + + + + a + + + + g + + + + t + + + + e + + fragte + + + + + + d + + + + e + + + + r + + der + + + + + + G + + + + r + + + + a + + + + f + + Graf + + + + + + v + + + + o + + + + n + + von + + + + + + R + + + + o + + + + + + + + e + + + + + + + + e + + + + r + + + + . + + Roeer. + + Hand, Mylord? fragte der Graf von Roeer. 
+ + + + + + + + A + + + + l + + + + s + + Als + + + + + + e + + + + r + + er + + + + + + e + + + + i + + + + n + + + + m + + + + a + + + + l + + + + s + + + + s + + einsmals + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + m + + dem + + + + + + O + + + + b + + + + e + + + + r + + + + h + + + + a + + + + u + + + + ſ + + + + e + + Oberhauſe + + + + + + w + + + + e + + + + + + we⸗ + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + B + + + + i + + + + + + Bi + + Als er einsmals in dem Oberhauſe eine Bi we⸗ + + + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + d + + + + i + + + + e + + die + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + H + + + + a + + + + n + + + + d + + Hand + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + n + + den + + + + + + H + + + + o + + + + ſ + + + + e + + + + n + + Hoſen + + + + + + h + + + + a + + + + t + + + + t + + + + e + + + + . + + hatte. + + bendig die eine Hand in den Hoſen hatte. 
+ + + + + + + + i + + + + n + + + + E + + Ein + + + + + + g + + + + e + + + + w + + + + i + + + + + + + + e + + + + r + + gewier + + + + + + L + + + + o + + + + r + + + + d + + Lord + + + + + + h + + + + a + + + + t + + + + t + + + + e + + hatte + + + + + + d + + + + i + + + + e + + die + + + + + + G + + + + e + + + + w + + + + o + + + + h + + + + n + + + + h + + + + e + + + + i + + + + t + + + + , + + Gewohnheit, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + Ein gewier Lord hatte die Gewohnheit, daß er + + + + + + + + g + + + + e + + + + n + + gen + + + + + + V + + + + e + + + + r + + + + ſ + + + + o + + + + r + + + + g + + + + u + + + + n + + + + g + + Verſorgung + + + + + + a + + + + r + + + + m + + + + e + + + + r + + armer + + + + + + O + + + + + + + + i + + + + c + + + + i + + + + e + + + + r + + + + w + + + + i + + + + w + + + + e + + + + n + + + + t + + + + t + + Officierwittwen + + + + + + e + + + + i + + + + n + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + einbringen + + gen Verſorgung armer Officierwittwen einbringen + + + + + + + + w + + + + o + + + + + + + + t + + + + e + + + + , + + wote, + + + + + + ſ + + + + o + + ſo + + + + + + + + + + b + + + + e + + + + r + + + + r + + + + e + + + + i + + + + + + + + t + + + + e + + berreite + + + + + + e + + + + r + + er + + + + + + + + + + e + + + + , + + e, + + + + + + i + + + + n + + + + d + + + + e + + + + m + + indem + + + + + + e + + + + r + + er + + + + + + d + + + + i + + + + e + + die + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + H + + + + a + + + + n + + + + d + + Hand + + wote, ſo berreite er e, indem er die eine Hand + + + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + n + + den + + + + + + H + + + + o + + + + ſ + + + + e + + + + n + + + + , + + Hoſen, + + + + + + u + + + + n + + + + d + + und + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + r + + der + + + + + + a + + + + n + + + + d + 
+ + + e + + + + r + + + + n + + andern + + + + + + H + + + + a + + + + n + + + + d + + Hand + + + + + + d + + + + i + + + + e + + die + + + + + + B + + + + i + + + + + + Bi + + in den Hoſen, und in der andern Hand die Bi + + + + + + + + + + + + i + + i + + + + + + e + + + + t + + + + w + + + + a + + + + s + + etwas + + + + + + f + + + + + + + + r + + fr + + + + + + a + + + + r + + + + m + + + + e + + arme + + + + + + O + + + + + + + + i + + + + c + + + + i + + + + e + + + + r + + + + w + + + + i + + + + w + + + + e + + + + n + + + + t + + + + t + + + + . + + Officierwittwen. + + + + + + J + + + + n + + Jn + + + + + + w + + + + e + + + + l + + + + + + + + e + + + + r + + weler + + i etwas fr arme Officierwittwen. Jn weler + + + + + + + + h + + + + a + + + + t + + + + t + + + + e + + + + . + + hatte. + + + + + + e + + + + r + + + + : + + er: + + + + + + h + + + + a + + + + b + + + + e + + habe + + + + + + H + + + + i + + + + e + + + + r + + + + b + + + + e + + + + y + + Hierbey + + + + + + ſ + + + + a + + + + g + + + + e + + + + t + + + + e + + ſagete + + + + + + H + + + + i + + + + e + + + + r + + + + , + + Hier, + + + + + + M + + + + l + + + + o + + + + r + + + + d + + + + s + + + + , + + + + y + + Mylords, + + hatte. Hierbey ſagete er: Hier, Mylords, habe + + Ein gewier Lord hatte die Gewohnheit, daß er +bendig die eine Hand in den Hoſen hatte. +Als er einsmals in dem Oberhauſe eine Bi we⸗ +gen Verſorgung armer Officierwittwen einbringen +wote, ſo berreite er e, indem er die eine Hand +in den Hoſen, und in der andern Hand die Bi +hatte. Hierbey ſagete er: Hier, Mylords, habe +i etwas fr arme Officierwittwen. Jn weler +Hand, Mylord? fragte der Graf von Roeer. 
+ diff --git a/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml new file mode 100644 index 0000000..0e62647 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. — + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. 
+ + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + + glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + 
+ + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. 
— Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verfproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augembli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. — + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. 
— + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test-gt.page2018.xml b/qurator/dinglehopper/tests/data/test-gt.page2018.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test-gt.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test.alto1.xml b/qurator/dinglehopper/tests/data/test.alto1.xml new file mode 100644 index 0000000..ac2a50b --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto1.xml @@ -0,0 +1,20186 @@ + + + + inch1200 + + + \\libdpsrv1\storage\root\projects\NDNP\win_19101017-19101231\win_19101017-19101231-scan-1-\ocr\0070.tif + + + + + useAbbyy4:0 + dictionaryFlag.D:American-English + ImgPrep Component Count High Estimate:31557 + useCaere:0 + Abbyy6OCREngine Character Error Ratio:0.1398 + splitwords:0 + ScansoftOCREngine Character Count:21156 + conjoinWords:1 + dictionaryOn:1 + IrisOCREngine Character Count:24661 + Abbyy6OCREngine Character Count:24501 + Abbyy6OCREngine STAT BLOCK:STATBLOCK_Abbyy6OCREngine;24501;3425;43.9106;93.2252 + language:en + ScansoftOCREngine Predicted Accuracy:65.12% + page-reoriented:UP + loadfromfile:true + suppressPunctuation:false + multipleEngineWeight:0 + suggestionCount:1 + Node Count:8562 + ScansoftOCREngine Character Error Ratio:0.4152 + spawned:1 + IrisOCREngine Character Error Ratio:0 + IrisOCREngine STAT BLOCK:STATBLOCK_IrisOCREngine;24661;0;0;0 + cachePath:/jobq/caches/newpah_legacy + monkeyTimeout:1800 + Predicted Word Accuracy:93.23% + lexicondirectory:\jobq\caches\lex\wintertree\ + text-orientation:UP + IrisOCREngine Predicted Accuracy:0% + verboseOutput:false + 
trimwords:1 + Abbyy6OCREngine Predicted Accuracy:93.23% + ImgPrep Component Range:31451,31557 + version:Newpah v2.07 Apocalypse in 7/4 + ScansoftOCREngine STAT BLOCK:STATBLOCK_ScansoftOCREngine;21156;8785;0.770562;65.1163 + noPunctuation:1 + ImgPrep Component Count Low Estimate:31451 + DictionaryFlagsUsed:D + configFile:/jobq/caches/newpah_legacy/config/newpah_acwi_x.xml + + + iArchives + Newpah + v2.07 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Jails + + + + + + TIIE + TIE + + + + + + + + + + + + + + + inlurie + inure + + + + W1S + ams + sis + aims + + + + mncle + manacle + + + + Ik + Ir + + + + Reynold + + + + + + + + + pliitlist + littlest + + + + aftei + + + + tlle + elite + tulle + + + + exalnina + examine + exanimate + + + + + + + + ti1n + ion + + + + al11lC1l11wcd + auuoluleed + + + + Hie + tile + Hide + + + + + + lCllI1aJwl1tly + pern1nuentl + + + + + + + + 11IlJII1cd + imijIlVCl + + + + jid + id + + + + + + + + + + goneh + + + + + + + hilt + bat + + + + + + lie + + + + + + lint + + + + + + + + + + + + + + + + + + + hilds + child + holds + + + + appearal1ec + appeuuuc + + + + + + + + + XIIIis + Morris + + + + entile + entitle + + + + hucl + huyer + hull + + + + tints + + + + + + + + + weekend + + + + + + m1ny + + + + + + + + + anwnA + auung + annA + faun + + + + + + l5n + 150 + + + + + + or + + + + JlIlI1eS + Jaines + Gaines + + + + + + + Bigstaf + Briggs + + + + avelage + + + + + + + + pOllnd + pollen + + + + + + + + + G + + + + + + 10 + 150 + lobo + + + + or + + + + + + 1 + + + + lIighltnel + Ilighhutd + lighten + + + + + + + iverlge + diverge + + + + weigh + + + + + + pou1d + pottn1 + + + + + + + + + + + ancl + 8111 + encl + + + + GII + + + + + + Hcl1IY + Ilenly + Silently + + + + + + lIall + hail + leally + + + + 1el + + + + + + agc + ac + + + + + + 14111 + + + + + + bounds + + + + + + 6 + G + + + + ccnts + accents + + + + + 1lwfl + + + + cantle + + + + Wlrc + Welch + + + + honght + Hong + + + + 
rm + + + + flu + + + + Iltl + Ill + + + + + It'll + + + + + timole + tlnldre + timor + insole + landed + + + + + + + + verc + veer + + + + + + tll + Cu + tall + + + + + + + 1olris + Mnrizs + Minis + + + + purchasecl + purchase + + + + + + OB + + + + + + + purebased + pUlcohased + purebred + + + + + + + + + + + + + + cattlc + + + + + ncmge + avemage + income + avenge + + + + 1100 + + + + pound + ponnds + ponds + + + + att + + + + lhe + he + + + + lrcail + lyrical + + + + + ing + King + + + + pries + + + + lllCrc + + + + arc + + + + + + dICHlt + snout + ditch + + + + IOOn + 1001 + IOn + + + + + + + 10 + + + + rtlw + Elbe + rattle + + + + txPwt + + + + nutrkct + Utrecht + + + + unolcl + Unocal + + + + + + + lltis + Allis + + + + eOllnty + felinity + + + + + countywide + + + + + + + + + + + + + + 01 + tkL + TLC + + + + 3lTlle + nTh + + + + + + + husincss + huskiness + + + + + + + + + + + + + Lexiugion + + + + + + ninny + + + + + + + + + + + omplettl + coIUIfeted + completely + cloudiest + + + + M1on1UV + + + + + + + + finn + lna + ulna + + + + + paicr + pair + + + + + + + + + + + + 01 + + + + time + + + + Centra + + + + + lientueky + alienate + + + + rohareo + threw + O'Hare + + + + + + Con + + + + + pamiy + panky + amity + + + + n1ll + + + + ue + + + + signel + signal + + + + arfolcling + aceo11II + refilling + + + + + + + + + welJnnthcntientlcl + ellauthenticated + + + + + + cullnt + enrrent + Cullen + errant + + + + Snt + S8t + Sent + + + + + nrday + hurdy + nerdy + + + + + + Trite + + + + derl + ler1 + del + + + + inYohci + ii1VOlcs + Kiyoshi + + + + + + + cxhange + exehange + change + + + + oi + + + + + + 9S000 + + + + + + + + + + + + + + l1Icl + 8111 + + + + + + bfcn + bf + + + + mulcr + mulct + + + + consielert + consider + consulter + + + + + + + + + + ion + + + + + + eCnl + eNcl + + + + + + past + + + + + fhc + hoc + + + + pl1IC1tnSCIB + pnre1Iase + + + + + + + + ullling + + + + + + + + + + + largcst + + + + + + walc + wail + + + + + + + + + + + + + + + + + 
Loniile + Lenities + + + + + + + WHrchons + Horehounds + + + + Compan + + + + of + + + + + + + whih + + + + OWI1S + onus + + + + cight + + + + + + tho + thou + + + + + + + + twe1c + + + + similnl + swimmingly + + + + mouses + mousers + + + + + + busincs + busniess + buskins + + + + + + + thlt + halt + + + + eity + deity + + + + nnd + nd + + + + whicht + + + + + + + + ns + + + + + + + + + ltle4 + + + + warehol1se + tvarehmise + starchier + + + + enmpuy + empty + + + + + + + + + + + t11aeo + + + + + + + + + + + + + + + + + 1ollals + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Steck + Stcck + Stack + + + + + + Gcod + Cod + + + + + + + + + + + + + Kenry + Kerry + + + + + + + + + YtItCl + 11tlter + Wailer + Title + + + + Benn + Beata + Benin + Bata + + + + + + pnrchlsed + purchased + + + + + + + + + Hcnry + 1Ienry + + + + Phillip + + + + + + + + 11lc1 + + + + + + + i7ine + + + + statuI + st8nl + statue + + + + non + + + + + + + + Phimp + 1hilipS + Pimp + + + + + + + Irng + Airing + + + + storl + stol + + + + + + + + + + + + + sion + scion + + + + + + morninA + + + + rte + + + + + + + + + + + tho + thou + + + + + + paper + + + + + + 11l1gazincs + niagazinis + + + + + + + Mimi + + + + Pltillilis + Policlinics + + + + hnl + hl + + + + condncted + emuluCte1 + + + + + + + + + + + 1nuy + + + + year + + + + + + tins + + + + + + + + + + + jood + Jody + + + + pattonagl + atonal + + + + + + + + Bcnl + BBC + + + + + + + + + + + + + kClP + kcal + + + + + + 10 + + + + inercase + interclass + + + + lime + + + + + + + + + + + itid + pitied + + + + + + priyaic + prismatic + + + + + + + NEW + NI + + + + SECRHARY + SCRIARY + SECTARY + CRAY + + + + + + + r + + + + + TIlE + + + + + + + + + + + + + + + + + + + + SIJhar + Spar + Sitar + + + + + + + + Madcl + Madly + + + + + + + anil + anile + + + + + + + + + + I + + + + + + + Churched + + + + + + + morninA + + + + ml + + + + + + H + + + + SplInt + Spar + + + + + + + vas + + + + elccted + + + + sCllctary + 
oscillatory + + + + + + tlC1FIUll + + + + + + + 0f + + + + thc + + + + Firsl + Firs + + + + Baptisl + + + + + + + + itl + + + + + + + + + + + + + + + + + + + Er1p + Escarp + + + + wll + win + wall + + + + 10 + + + + + + Rigncd + Rind + + + + + + + + + + + + + + IHE + HE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ChuIch + Chichi + + + + + + + + + + + + ncw + near + nw + + + + lurlUtee + lulu + + + + + + + + instnllcd + + + + ill + iii + + + + + 11Ie + + + + + + + + churdl + churl + + + + + + + + 01U + emir + + + + + 0 + + + + the + + + + + + impIocd + implied + + + + + + + + + + + + + UlldJih1i1 + tliebuilding + libidinal + + + + bettel + betel + + + + limn + + + + tlu + tlir + talus + lair + + + + + + + 011e + + + + 1Je + Tme + Tome + + + + wrk + + + + + + imtstrlling + installing + mottling + + + + tit + tho + thou + + + + + IllatinA + Illation + + + + + + ivlil + will + livelily + + + + be + + + + ompletcl + completely + + + + he + + + + + + tore + + + + + + + + whilh + whieh + whirl + + + + + + + + + + + + cltlltl + + + + hy + + + + Ihy + Hay + + + + + + 11 + + + + + + + + Lox + Alex + + + + + iUgton + Kingston + rigatoni + + + + hegins + + + + + + + + + + + + + + + + + + + + + + HORSED + + + + + + + + + + + + + + + + mlc + mc + + + + + + fontana + lontalla + flotilla + + + + horse + + + + + + + + + + + + 31t + + + + + + + + Whartou + Whnrton + + + + + + + + uril1Y + hurdy + + + + + + tl1 + + + + Broadwav + + + + stockyard + + + + + + + slot + + + + + + hut + + + + tix + ix + + + + + + + + war + + + + + 50111 + + + + Thirtninc + Thirty + Titanic + + + + + + offercd + + + + fOl + foci + foal + + + + + + + nid + 1111 + Enid + + + + Ihosc + Hoc + + + + + + + + + + hrought + brough + + + + + price + + + + rangin + + + + + + + + + + + + + + + + + + + + Hon1d + + + + + + 3uctioncer + + + + + + + + + it + + + + + + + + ourthouser + + + + + + + + sley + sllY + sly + + + + amid + + + + Banns + + + + + + 01 + cld + clad + + + + Beckley + + + + + + + 
+ + + + + + + + + + Becknerville + Becker + Belleville + + + + tom + + + + + It + + + + + + Sconce + + + + + + + + opera + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IIousl + IOUs + + + + whre + twhere + whore + + + + + + spnt + + + + 8aitmlty + + + + + + + + + + + atHi + anti + Kathie + + + + Sandty + Sandy + + + + Mlorgm + Mellor + + + + mH1 + and + + + + + + + + + 1JIy + Cry + + + + whu + twho + hub + + + + Ire + lrc + rc + + + + huth + hutch + + + + ahont + Mahomet + + + + Clio + + + + tell + + + + unt + aunt + + + + + + + 8 + + + + + + + + nutttee + + + + + + + + + wns + awns + + + + cmployud + compound + + + + + + + + + + + whcn + + + + + + eune + equine + + + + + + time + + + + cttJilll + + + + timc + iinu + tic + in + + + + + + + Caiy + Cay + + + + letueel + lettuce + + + + + + Jay + pad + + + + 1lig1I + + + + walnut + + + + Ill + + + + + + + thollgh + though + + + + + + clue + + + + hind + + + + foran + oran + + + + + + + + + + steadstead + steadied + + + + + + 1eJeclin + procoedimtg + + + + + + + + nj + + + + + tn + td + + + + + + + + + + stirted + storied + + + + + + + + + whieh + + + + cast + + + + hint + + + + + + + + iii + + + + + + + ourt + court + ort + + + + + + + Goy + + + + anc1 + + + + Dottily + + + + Bales + + + + + + + Maher + + + + occludes + + + + 111 + + + + + + Oer + Boer + + + + nnwcl + unwell + + + + + IllUtHJ + mounters + Oilcloth + Mounties + + + + + + I1ftcl1lOin + ftflemvoon + + + + + + + + + Gny + Gunny + + + + thupst + alamps + thrust + Alamos + + + + + + bit + + + + + + Butcs + Buts + + + + + + + + + Balds + Bald + + + + wno + iis + wino + is + + + + Jren + Jean + + + + + + tlte + tilted + + + + icromarr + micrometer + + + + lion + + + + + + + eWI + WI + + + + + + heiclcs + chemicals + + + + + + + + carry + + + + + hael + heal + + + + iiI + + + + 110 + + + + twill + + + + h1e + + + + + + + + + + + + + + + + + Inllills + Mulleins + Instills + + + + Noose + + + + + + + + + + + + + + 30000 + + + + + Jft + 1a 
+ + + + + + H + + + + CUl1lingham + unningham + + + + + + + solcl + soc + + + + + + + + + + lnJ + nJ + + + + + + + + + + Jmwtion + Jnuetiou + Inaction + Jamestown + Neutrino + + + + + + + Ihout + Shout + + + + + + + + + + Ml + + + + + Alhlt + therm + Allot + thermo + + + + Stofel + Solver + Steel + + + + or + + + + Ifontgomneiy + Fronton + + + + + + + + + + + + NFVf + + + + PUY + PLUMY + + + + DY + + + + + + + II + + + + COLLEGE + + + + + + + + + + + + + + + + + + + + Novemher + + + + Untie + Under + + + + + + + + + + + + + Dahl + + + + + + + + Eueouraged + + + + by + + + + + + Sl1eeess + + + + + + + + + + + cflorts + florets + + + + + + collcgc + + + + + + imard + hUd + Mardi + + + + ut + + + + + + + + + + + II + n + + + + + + 11111elll + atnalear + antler + + + + jilny + jitney + + + + lhi3 + 19la + + + + piny + piney + + + + + + u + + + + + + + + + + 101 + + + + time + + + + henefit + + + + + + tic + + + + atlaletai + Athletic + Valletta + + + + + + + H + + + + hwItulc + inclulcS + wistful + + + + + + bey + bees + bevy + + + + + + + + + tdirn + din + + + + uul + usual + + + + + + be + + + + w11 + iye11 + + + + + + 1 + < + + + + + + + + + + stmdara + Tamara + + + + + + Ims + Sims + + + + heen + hen + + + + + + + + tlm + + + + + + + otnr + toner + oftener + + + + + + Pllt + Pelt + + + + + + nuder + nude + + + + + + direc + direct + + + + + Lion + ion + + + + + LionProfessor + + + + + + + + + Dalgct + Daltretry + Aglet + Dialect + Maltreat + + + + hill + + + + ngJin + azgain + aging + angina + again + + + + htl + luty + have + html + lusty + + + + + elmrge + emerge + + + + + + + + woll + woolly + + + + uncI + nod + ncI + + + + ho1es + + + + + + dui + + + + + + + + + + + or + + + + + + exccl + excl + + + + + + + + + + + + + + + + mrit + + + + + + + + lde + + + + take + + + + IICII + Koh + bosh + IC + Koch + + + + + or + + + + + + + + + + ivitli + civil + + + + + + addel + Adele + + + + cx + + + + + + + peieucc + prince + penile + + + + + + ilOm + Dillon + + + + + + 
sllcccssfu + sneeessttt + success + + + + + sununcI + umnmer + Sunni + + + + enmgetncnt + penmen + + + + ivitlm + civil + + + + + + Vl11g11 + Vaugll + Vaughn + Vault + + + + + + + Glasse + Glassed + + + + Compnny + + + + + + + + 1 + + + + + time + thou + + + + nwv + meat + wv + + + + + + plnyhollse + + + + + + Hoch + Itoclm + Rob + Hooch + Ito + + + + + + + N + + + + + + + + + + + soleefod + sleeved + + + + + + + + ocr + + + + + + + + + cometly + comely + + + + + + fthe + + + + lime + + + + ut + + + + Ute + time + + + + Xapoleon + + + + + i0 + + + + podoll + poliod + pool + polio + + + + + + + + bright + + + + sparklin + + + + anis + + + + + + + + + Cltmint + Culminate + + + + ihmtions + inhumations + + + + + + + + + + + WM + + + + + + fo1 + + + + nnd + nd + + + + netell + retell + + + + hy + + + + OJ1 + omt + OH + omit + + + + + + + + + + + tho + thou + + + + forlmost + toremost + + + + shus + shuns + + + + + + HIe + Hide + + + + Ameriem + Ameruam + Aerie + American + + + + + + + Etrge + starch + Deterge + + + + + Etrgefhe + starched + Deterge + + + + + Tlw + fhe + Tel + he + + + + + + + + time + + + + picce + + + + + + PhocbeS + Ihoehe + Phoebes + Hoe + + + + + nomanc + Romaine + Norman + + + + ill + + + + + + Stloet + Sloe + + + + + Thc + Ihc + Ic + + + + + + + + + + papilla + + + + + + + + + 1111 + + + + eostnnted + costumed + resented + + + + It + + + + + + nppcnt + appeau + Nippon + + + + + Lime + + + + littler + + + + pert + + + + 01 + + + + Xoc1l1hcr + + + + + + + + + + + + + + + thl + hl + + + + stndnnl + standard + stunningly + + + + + + Imil + quil + equal + Simile + quail + + + + + city + + + + thc + + + + Anclitoiuni + Antitoxin + + + + theatto + threat + + + + wilt + + + + pn + + + + + + + + ienL + enL + + + + 011 + all + + + + mfotlday + moldy + + + + amid + + + + TueFclay + Tuscany + + + + + + + + + subwoofer + + + + mcre + nacre + + + + + + + + + + + + + tiro + tire + + + + + + Hts + Hats + + + + atul + amid + actual + + + + + + only + + + + Audis 
+ + + + + nicutrcs + incurs + + + + + + appear + + + + + + + + + + + + + Cmson + Mason + + + + + + Comlany + + + + + + + + King + + + + Thc + + + + + + + + thc + + + + + + + + + + + + + trlCular + trmeular + tricolor + trammel + + + + nu1 + + + + tmt5formntion + + + + + + + + + + + + + 011 + can + + + + speciul + + + + secIlcry + secularly + + + + + 1150 + + + + g6rgeous + A6rjeons + + + + efTelt + Ethel + + + + anti + + + + magl1ifieenl + magnifemil + magnified + + + + + eostnmcs + esteems + + + + Anothcr + Anothe + + + + stir + + + + net + + + + 011 + + + + + + + + + + + + + + Brows + + + + + + theit + theist + + + + cnmcd + comic + conc + + + + + + + skil + skill + + + + XooIUCNOOlll + NoodleNoodle + Neocolonial + + + + + + act + + + + + + 1tl1 + omi + + + + + doubted + + + + 0110 + coo + + + + + + + + + + nct + aet + ct + abet + + + + + + + thc + + + + meliclu + mellifluent + + + + stagc + stag + + + + mid + + + + + + nl + + + + + + + + Canse1 + + + + + + riol + roil + + + + + + + + + + + + + + + + + + + + + + + thi + Lid + thin + + + + + + + + + + + dtc + flit + dc + + + + + + plice + p11CC + plaice + + + + + + mI + + + + + missiO + 111S510t1 + + + + + + conts + emits + counts + + + + iyill + idyll + + + + hic + chic + + + + smuetl1inv + s0methin + + + + + + + + + + regrets + + + + + + + + + + + + + + + + Stoke + + + + agemtts + + + + fl + far + + + + + + + + + lTadl + Hal + ladle + + + + + + + + + + + + + + + Olive + + + + + + + + r1J1t + + + + + + + + Yan + 1an + Yang + + + + + + + + + cdiltainiug + dilettanti + + + + + + acrcs + + + + f0i + foci + + + + + 35 + + + + + + + + + + + + + loc + + + + + + Asla + Al + Ala + + + + + GnllUlo + Allan + Gluon + + + + nt + + + + time + + + + opern + open + + + + + + Xoem + Noven + Novena + Oem + Oven + + + + + + + ber + 11 + beer + + + + + + + OATIIS + COATIS + + + + + + + + + + + + + + + + + + + + Wilrox + + + + + + 70 + + + + + + + nt + + + + het + lice + heat + + + + + + + + + + Satnr1ay + + + + of + 01 + + + + + 
geueral + + + + + + rhe + rhea + + + + + + + + + + + + + Aulioeh + Calliope + + + + hunh + hunch + + + + hy + + + + + + EltIcr + Celtic + + + + + + + + + Lnwry + Lowery + Landry + + + + Sunray + + + + morningorning + + + + Burinl + Burin + + + + + + + nt + + + + Thomts + Ihoma + Homage + + + + burinl + burin + + + + greunl + gruel + + + + + + + Strivers + + + + + + + + + + Hwn + Hewn + + + + S + + + + yeal + yea + + + + 01 + + + + + + + + + + + mid + utd + td + + + + + + + + Ernes + + + + + + + Sticgrs + Strivers + Tigers + + + + lied + + + + inexpressiveness + + + + iii + + + + Titck + Tick + + + + + + + + Sol1 + + + + + + + + + + onyphoid + o1Ytyplioid + boyhood + + + + + + + + + Thc + Time + + + + + + Terre + + + + Inouglit + Inoculate + + + + Jtere + Terre + + + + + + + + + + fm + + + + hint + + + + + + + + + + + tool + + + + buttal + bunnies + brutal + + + + + + + + + + + + + + + FAMOUS + + + + + + + AND + + + + UGTURER + FUTURE + + + + + jOie + + + + + + + Opine + + + + + + + + anrl + ami + Carl + amid + + + + Philosophe + + + + + + + + + + + + + + + Liam + + + + + + + Limit + + + + + Buckling + + + + + + + + + + + Thl + Hl + + + + folloing + + + + tCll + tell + + + + retons + Bretons + + + + are + + + + gim1 + give + + + + + Cll + + + + feU + feud + + + + + + + + or + + + + Opine + + + + Kead + Knead + + + + + + + 1 + + + + JJCl1lll + 1s3e1mi1 + + + + + + + + + tie + + + + + + + + + + + + + + + + caption + + + + tllC + tlC + + + + mtlwl + tall + + + + 11 + of1 + + + + morc + marc + + + + wideiti + widget + + + + + resid + lcl + reside + local + + + + books + + + + tlwn + lawn + + + + + + + + liyimm + Elysium + + + + + Anleli1Ii + + + + 11thol + anther + + + + + + + Jc + Ilc + + + + + + out + + + + or + + + + thc + + + + + + souhl + soul + + + + + + + 11111 + + + + hest + chest + + + + + + cuntiibuiors + + + + + + + + + + rcnt + recant + + + + 1Jeriodicnls + periodieals + + + + + + + Iris + + + + + + + + familiiu + familial + + + + + + mosi + mos + + + + + uf 
+ 1t + + + + till + + + + htl1lcgoill + Ieoturegoing + lecture + + + + pnblic + + + + lIf + + + + + + + + + auel + amid + laurel + + + + dty + duty + + + + + + + IIe + + + + + + + + + + iuheltis + advertiso + unhealthiest + + + + + 0 + > + + + + III + Ito + + + + + + 4110 + + + + + + + + hst + hest + hast + chest + + + + elmw + elm + + + + + + King + + + + aUractions + abreactions + + + + 011 + do + + + + ll1UlY + mimumy + mommy + + + + + + + + 1tlI + lair + + + + + guest + + + + + + tbis + tJis + tbs + + + + pat + + + + seon + Seton + + + + + 0 + + + + He + + + + mint + + + + Hilly + + + + glls + galls + + + + + + crowL + cowL + + + + + lie + + + + plesses + lessees + + + + thiml + Ihem + thimble + Hem + + + + + i + + + + lIe + Ile + + + + docs + 1005 + + + + lot + + + + loll + + + + + + + + OJ + 0r + + + + + JIM + + + + + Icltah + Clash + + + + oilier + + + + mens + means + + + + 11015 + + + + + + + tic + + + + + + iviih + iii + + + + eflllal + + + + iaein9 + fteiua + fascia + ferial + + + + + + + + + + + elasses + lasses + + + + 0t + + + + jcople + couple + + + + + O + + + + Ill + 1Ic + + + + Imows + Mows + + + + whcJeof + wher0ot + + + + + + spenlt + spank + spelt + + + + + 0J + + + + + + htR + hats + hR + + + + + + aunty + + + + + + + + + + + + + + + expc1ncc + + + + + + + Ill + lie + + + + + + + + + + + + yoe + yoke + + + + + + + + + unO1 + 11101 + union + + + + + + + + + + + + nt + + + + + lie + + + + + + + + + lit + + + + Read + + + + + + + + appeu + ape + + + + + + + + lectl1lt + leetur + lecture + fleeter + + + + + at + + + + + + Lint + Liam + + + + luekliu + lucking + clerklier + + + + nt + + + + Kcntucki + Kentuckian + + + + + + + + + + + of + + + + + + ecuinz + cumin + + + + of + + + + + omlI + onember + Somali + condemner + + + + + + + + + VIILL + VILLA + + + + Of + + + + + + + + + fORCf + forceful + + + + + + + + + + + + + + Accord + + + + + + + + + + + + + + + + + + + + + + + + + + MonM + MoM + + + + + Mon + + + + + + + + + + + + or + + + + + + 
fCord + McCord + Record + Cord + Accord + + + + ivill + civilly + + + + + + + Otis + + + + ntire + + + + fore + forcC + + + + ont + tint + + + + Mloulay + Modula + + + + + + is + + + + + proh + Drool + pro + + + + + + lHopelty + properly + lonely + + + + 1111 + + + + + + + + the + + + + + + + + + + Hntloe11 + + + + ptics + optics + + + + arc + + + + nrl1crly + + + + + + + + + fated + tatted + + + + + + 11101nillg + + + + + + + + mini + + + + + tcnded + ended + + + + + + poi + + + + + + + + + + mime + + + + defncinpI + defining + + + + + + + + + nod + nd + + + + ottcr + otter + + + + roivcly + richly + + + + IInd + amid + Indo + + + + nu + + + + + + + llIct + alled + licit + allied + + + + tor + tore + + + + + + + + it + + + + hy + + + + incur + + + + + + + nnd + l1td + nd + + + + Dogs + + + + on + + + + + + uighi + eight + ugh + + + + tie + + + + + + ill + iu + + + + + + + trmted + tructed + trucked + termed + truncated + + + + + + + + + + arret + garret + + + + anti + + + + plact + plaec + pact + place + + + + + + + jrd + dial + rd + + + + + + olle + 000 + ole + + + + + + drfacing + + + + 0r + + + + de + + + + + tloing + straying + loping + + + + pIopelty + poetry + + + + + + + + amy + + + + + WOODMEr + Woodier + + + + + + + + + Tle + 110 + Tale + + + + lOlleln + Allen + + + + Woodmcn + 1Fondmen + + + + 01 + + + + AmCICf + Ametier + America + Pacify + Ammeter + + + + + + + + + + + it + + + + + + + + + + theii + Heidi + + + + + + + + + don + + + + + + thc + time + + + + + + b1i1diti + + + + I + + + + + 1omlay + + + + + + + + + + + + + urce + Vance + cure + + + + + + + + trnnHted + tansacfed + tormented + + + + mill + dud + + + + nlso + atlsn + nelson + atlas + + + + worb + orb + + + + + + + + + tel1m + teanm + + + + + tamper + + + + + + + + + + + + + + + + + + + + + + Hcrald + + + + + + pIny + piney + + + + + + + + + + + + + Winche5tcr + + + + + + + + + of + + + + + + + + opera + + + + + + The + Thc + + + + Housc + + + + or + + + + 1 + + + + rhOUSfind + rosined + + 
+ + Caai + Cain + + + + + tllea + dales + tulle + + + + + + + + + + + + + + opcrn + OCR + + + + housr + houso + hours + + + + + + + mntitHe + amniotes + + + + aul + Gaul + + + + + + + + + + + + + + + goodsizcc1 + goodie + + + + + + + + + + mensch + densely + mesh + + + + + + + + + + + + + + pJay + Jay + + + + + + + + Tought + Thought + + + + + + + + + + + + entertainiup + entertain + + + + + + + + + iyithottt + + + + + + iu + + + + + + 5C10 + + + + II + m + + + + + tronA + tonal + + + + storv + Astor + + + + suffer + + + + + + + + + + it + + + + + time + + + + dramatizction + + + + an1 + + + + iii + + + + + + hmul + imand + html + viand + + + + + + + + + thorouglIIY + + + + competcnt + + + + + + + + + + + + + + + + + + + + Time + + + + + + + + + + + + book + + + + + + + + + + + + + thc + + + + + + + + + + + + + + tees + + + + + + + imptrtl1rbahle + + + + tacituru + + + + tesource + + + + + + rnl + foul + rn + + + + + + + + thc + + + + + + + + Glenallll + Gleam + + + + + + + + + + + + + + + Ixst + prst + pet + Kist + prost + + + + + + conccnl + conc + + + + nud + nude + + + + + + + + + + + + + + + + + + + + + + + + + + + + nerontc + necromantic + necrotic + + + + leo1 + revolt + + + + + + + llntcF + lilt + + + + nnd + nd + + + + + + actiitte + ratite + + + + + + + n + + + + + + + + conlail1 + containn + contain + + + + + + + mlny + Maloney + + + + + + tinill + infill + + + + + + + + Wag + vas + + + + eon + + + + + + + petclt1 + petcntly + patently + + + + trken + Turkmen + + + + + + + + + + Inn + + + + + + + irho + biro + + + + + + + + + + compaitsot + comparison + complaints + + + + + cmincnt + + + + + + + + + + It + + + + + + + dth + doth + + + + + + + + + + + HoiIanl + Haitian + + + + tiho + Jo + thou + + + + Inst + Inset + + + + + + + + + + Alex + + + + + + + Kingston + + + + nCClthelc5 + + + + pOlllaR + + + + + + nrl + + + + + + + milnhlY + ntirally + amiably + smilingly + naturally + + + + 111 + + + + + + + + eritie + verities + + + + + + 11lwe + nerve + + + + 
+ hod + hood + + + + ne + + + + pleasnre + pleasuie + + + + offsetting + + + + 1tr + + + + Hol + Holy + + + + + + + lie + + + + world + + + + he + + + + moo + + + + + + acclpt + aceept + + + + + + + land + + + + + 11110 + + + + + + + Pmabclle + Respelled + Marble + + + + Lcslic + Cystic + + + + + + plnocl + playa + pinochle + + + + + lhe + he + + + + pHt + Ht + + + + + + L1on + lariot + lariat + + + + Devereraix + Coverer + Severer + + + + unfor + unfol + uniform + unfold + + + + + tunntcly + ttnately + truncate + tunnel + attenuate + + + + wns + awns + + + + + + + + + + + + + prai1ing + + + + nmlady + nullady + milady + + + + lhat + hat + + + + + + + + + + + + + + firt + + + + anhim11 + + + + rold + old + + + + wac + ware + ac + + + + + + + wns + awns + + + + nble + ahlC + ail + noble + Dahl + + + + + + seak + peak + + + + + + Iiiti + lines + Iii + + + + on1y + + + + + + + + + diliculty + + + + + + + + + + + + + 1oarsines + + + + + hoarseness + + + + liner + + + + + + iloue + loupe + + + + + + + + + + + + fircgoly + fiercely + + + + + + iyafh + Riyadh + + + + + + dolly + + + + Irish + + + + + fiord + + + + + + Donoan + Donorau + Donovan + Donor + + + + portrayed + + + + + + + + + + + + + + + + + + + + + + + + + + pot + + + + + + + + + + + + + H1ul + + + + + + oldtimc + olcltime + oldie + voltaic + Mollie + + + + Xorth + NOJ1h + + + + + + Irclnucl + Micronuclei + + + + + + + + + lime + + + + + + aet + abet + + + + whell + hell + + + + time + + + + + erouwcd + crOutned + crowed + rowdy + counted + + + + + + + + + + + + attuckc + attack + + + + + + + GlCU1lUl + Glenarni + Gleam + Learn + + + + + + + + tlw + lime + law + + + + + + + + + + + + + dispossesitlg + + + + + + Gll + Gleu + Gall + Glue + + + + + Iun1 + return + + + + + + + + + + milnsion + minion + + + + + Thc + fhe + he + + + + + + + + + + + + + nollgh + knoll + + + + tll + + + + + + hoth + hot + + + + + + gll1ery + + + + 111 + 8111 + + + + + Iuwel + luircr + Unwell + blurry + + + + lloor + 1Ioor + + + + in 
+ + + + 1 + + + + + + + + excitemcnt + + + + + + + morn + + + + anisanci + Stances + Nissan + + + + + + oat + + + + ul + anal + + + + leay + leave + leafy + + + + + IIi + + + + + + deeidedl + deeded + + + + + + + + + + time + + + + mouth + + + + + + + + + inditor + indictor + + + + it + + + + thl + hl + + + + + + + AIRPLANE + + + + + + + + + + + + + + + + + + + + + + Far + + + + + + + + + IVASII1GI0 + + + + Olt + Bolt + + + + 31Iol + Ovoid + + + + + + + fitct + fit + + + + + + + + ompilers + + + + + + + + tall + + + + + + + trale + tale + + + + + + havy + heavy + + + + toned + + + + + + + lleeuQ + lee + + + + + + mnJxC + mnJw + minx + NJ + + + + + + pernlaueul + prelature + + + + + + + plfCC + pelf + + + + ill + + + + + + + + 101 + + + + acroplull + teroplaue + airplane + Carroll + aeropause + + + + + amI + amid + + + + ailshij + ails + + + + X1J01tS + + + + aecutdulti + acidulating + + + + tft + ie + + + + + ConsuJ1 + Consuular + + + + rcpOlts + repots + + + + + + + + Om + flat + + + + + log + King + + + + t110 + + + + + + + + Septemhcr + + + + shipmcnt + + + + + + + + + + + mllounted + anointed + mullioned + + + + + + + + + nnll + fund + null + + + + + + + + pnions + onions + + + + mottlm + mottle + + + + + + 33870 + + + + + + + + + + + + + + Iliac + + + + + + + + + + Oet + Moet + + + + + + + + + + + + + + rnion + iut0t + Orion + + + + + + IIaeke11ctek + Hackie + + + + + ndopte + dope + + + + resolutioll + resolution + + + + 8unday + Stuulny + Truly + + + + + + + mngrtding + engirding + + + + Wilhur + 1lilbur + + + + Hlul + Hull + + + + 01111e + Oriille + + + + 11riht + + + + + fo + + + + rfusillg + remusitmg + fusil + reusing + + + + + + aUow + nlloiv + avow + unloved + + + + + + employe + employees + employed + + + + + + + + + tly + tally + + + + + + neroplanc + airplanes + necromancy + + + + 011 + of + + + + Snuday + + + + + + + + + + + i1st1lcte + + + + + + + + + t + + + + eopy + + + + oi + + + + + + + + + + + + + Priggish + + + + + + + + + + + + W1IEEIJNG + 
+ + + + + + + + + + Yhit + 11hile + Hit + + + + + + + + rhcI + Archie + + + + Snulay + Snugly + + + + + eyening + + + + + + lfotndsrille + Louisville + + + + + + + + skitT + skit + + + + + Jane + + + + DowllinA + Dolling + + + + + + 1rOnted + droimied + domed + + + + + + + + + + + + + + + hnc1 + 1181 + + + + + + ccnp + Cscap + escarp + cc + Escape + + + + + + + whcn + + + + lime + + + + + + captizcll + captized + capsized + chaotically + baptized + + + + Downinp + flownin2 + Owning + Downing + + + + + + + + + it + + + + r0sident + + + + + + + + Rellaie + Blair + Relative + + + + + + + + + + AMERICAN + + + + + + + + + + + + + + + + + + + + AboutKid + Kidnaping + About + Kidnapping + + + + + + + napping + + + + Qf + + + + + + + + + Inflexion + + + + + + + + + + + hown + + + + + + teleInm + telexing + + + + eonrelni + concern + reorienting + + + + JUte + time + + + + lad + + + + + naping + nnping + napping + inning + + + + + + + + Holph + ltolph + Kop + Hop + lop + + + + + + fender + Spender + + + + + lebraskn + Nebraskan + + + + + + Ambassadoi + Ambnssac101 + + + + + teny + teeny + + + + + + Wilsou + + + + saic1 + + + + Snnday + + + + + niglit + niggling + + + + lw + + + + IUH1 + lord + + + + + + informuaiiot + + + + + + tit + + + + + ubject + + + + Mis5 + hiss + + + + Hnlph + Rolpli + Ralph + Roll + + + + + + kidnapped + + + + + + + hy + + + + n + + + + Jlcxican + Lucian + + + + Ieon + pent + Eon + + + + + + + + + + + Hcgnnda + icrgunda + Spondaic + Hacienda + Nicaraguan + + + + + + + + + + + + ncnl + encl + + + + Cheeoy + Chicory + Cheney + + + + + + + whcre + + + + + + hall + + + + heen + heed + hen + + + + 5penclinp + + + + + + + + + sllmmcr + slimmer + + + + + + + + + + + + + + + + TIIOIASYILLE + + + + Gn + + + + OcL + Ocala + + + + + + + + + + + + + 511Ot + + + + + + Iiilled + Initialed + + + + h0rm + + + + + + + aftrnoon + aftenoou + + + + + + + + 111 + + + + + + wIll + tvlmc + TLC + + + + + + + + + + + + + hnd + hand + + + + + + this + + + + clos + cloSI + 
clods + celosia + + + + + + + BId + + + + SllICllClcc1 + surretdered + + + + pantsuit + + + + + + + + + tired + + + + + + sclfcfcnsi + + + + + facsimile + + + + + + + + + + + + + + CIIICAGO + + + + + + + + + + WHlianv + WtiliaID + + + + + 11 + u + + + + + + mnn + Mann + + + + + + + + + + killct + killei + kill + killed + + + + + tt + tit + + + + + + + + + + + + SundaT + + + + + aight + alight + + + + Iilliams + + + + + + + + htirglnr + htaglar + twirl + Hagar + + + + Cr + + + + + tering + tarring + tearing + + + + + + + + + + am + + + + windo + window + + + + + 0111 + + + + nttcmpted + + + + + + + + + + Thr + Thor + + + + + inltltde + Gillette + + + + + + + + + + + + + + + + + + + + + + + + Prcparlng + + + + + Tablcs + + + + + + + + + + + + + + + + + + + + + + + + af01y + + + + + + t11nt + + + + + + op + + + + + utsille + iutsile + utile + inutile + + + + + + + + oUlce + dolce + + + + bas + has + + + + + + + + + ontire + putlre + pustule + + + + + + + + trom + tromp + + + + + + + + + + + + + + + + + + + + + + Ilrinleel + Irvine + + + + almamc + + + + ia + + + + + + + + officee + + + + + exnmlnell + exantlned + exampled + + + + tlo + lo + + + + + + + + + + + + + + + tolal + toll + + + + + + + + OJures + Juries + + + + + + + + + + + + + + + + + + + numbcr + + + + 19 + + + + + it + + + + + + trilling + + + + + + + + + + numb + + + + + + + + + calc + + + + + rations + lotions + + + + + + + + + + + + prepare + + + + + + + + + + + + + + + + + + + + + + + + + tor + tore + + + + itn + in + + + + calcul + calculi + + + + + tion + ti0m + ion + + + + + + + + + + + + ami + amid + + + + + + hair + bait + + + + + or + + + + + + + + slmll1rly + + + + + + otber + October + + + + + + + + + + + + + + + ns + + + + + + sett + set + + + + + tthe + Lithe + the + + + + planets + + + + + + + + + + + + + + + feral + + + + pIDio1 + oplnioa + olio + + + + practicall + practical + + + + + + + + + in + + + + + + + + + + + + trom + tromp + + + + ypar + par + + + + + + + + + + + + + + + + + + + 
aft + aim + + + + + tho + thou + + + + + + + + + + hae + hade + + + + + + orlg + orig + org + rig + + + + + lnally + finally + lineally + + + + + + + + + + laborn + labor + + + + at + + + + + + + astronomlcat + + + + + + + + + + + + + Lithe + + + + + largr + + + + + + + + + + + + + + + lhe + he + + + + + + + + + + planets + planers + + + + marle + marble + + + + + + + + + clue + + + + Horal + Oral + + + + + + + + + + + + escapes + scopes + + + + + + + + + + appall + + + + + + + + + + anccs + nnces + panics + minces + + + + aro + are + aero + + + + consllcuousl + + + + + + + + + + + + + + + + + statt + stat + + + + + + IlUrely + Laurel + + + + mathematic + + + + + + + meal + legal + + + + anti + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tbose + Boise + + + + + + Ulen + Len + + + + Risk + + + + + tlous + Cious + ious + locus + + + + + + + + + + politb + + + + atten + alten + attend + Alton + + + + + + + Uon + tiomi + ion + Upon + timid + + + + + + theft + + + + wles + wiles + + + + + + tho + thou + + + + rcat + feat + cat + + + + + + + I + + + + + + + + + + he + + + + ecpected + + + + + + + + + + + + + + + + + + + ft + + + + + + + + + + + + + + + ehance + enhance + + + + + Ionic + + + + drear + + + + xI + + + + + + + + + + + + + + ii + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CENTS + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + and + + + + + + + + + + + + + + + + + + + + + toPcwell + locally + topically + + + + + + + + + Voters + + + + + kola + + + + + + + + + + + + Hou + Hour + + + + + + + floyd + + + + Lyrd + Yard + + + + lllwCIatie + ileuluctatic + + + + nomince + iiumilice + iniquities + + + + + jour + + + + + + frum + 1tnnl + forum + + + + tIe + + + + Tejilh + Tl1th + + + + COH + OH + + + + + + + Messina + + + + itrid + nitride + + + + tIHI + Thai + + + + RuhlJt + Ruth + + + + H + It + + + + + + + + f1il1d + ivied + + + + + + Jryine + Juried + + + + + + Sat1lliay 
+ Sntmday + Stoma + + + + + + + dressed + + + + 111 + thC + + + + v1ters + + + + + + Ptavell + Lowell + Travel + + + + + + + + + ntStanton + tagalong + instant + + + + tlul + au1 + lull + + + + lay + + + + + + + + spCak + + + + + cr + + + + + + vcr + + + + gIcetcl + iciest + + + + + + + + + + + + + iinIes + biomes + Winnies + + + + tht + + + + + + Itcmlcd + tte1lccl + Sitcom + + + + + + + + + 8pelkmJr + + + + iii + + + + + + eunnty + aunty + + + + + + year + years + + + + + Ireie + Fibre + Retie + + + + + + eleuiire + elsewhere + eclair + + + + lllmbel + 11umhers + + + + or + + + + Hc + Re + + + + + + + + + cXTlrcssatl + cxrrresscl + + + + + + + + + + + Snpnorting + Sporting + + + + + + + + ill + + + + + Jlit + liar + Lit + + + + loch + + + + minions + + + + + + + + ctfeetlc + octet + + + + + + + their + + + + + + + + + rssiic + Rossini + + + + 0 + + + + + + + + camuaiun + causation + + + + mll + 5111 + mall + + + + + + + + + + + nreliele1 + + + + Oil + 0t + + + + all + + + + + + + + tlw + law + + + + + alolylutl + ally + + + + Dmnocratie + Democratize + + + + l1lujoliiv + + + + + + + + + + + + + + + + cH11tv + + + + tvill + till + + + + + + matlialh + matcrially + martial + + + + increase + + + + + ell + el + + + + + + + + + + + + + + + + LEiXJXGrON + LEl1XGTON + + + + Jy + + + + ot + + + + + + + iSales + Snles + Sales + Singles + + + + + + + + + + nnt + nt + + + + + + OIl + omi + + + + + time + + + + + + markct + + + + lionday + Lindsay + + + + O5 + + + + + + hut + 1111 + + + + + vtill + mU + till + + + + + + + + mud + + + + 011 + + + + + + + + + + + + + + + time + + + + Hurley + Hurly + + + + Society + + + + + 1000 + + + + looled + lolled + + + + C101 + + + + + + alo + Palo + + + + be + + + + rcsnm + resnm + resume + racism + resin + + + + + ed01 + econ + + + + + + + + + + + + + + + Whrehol15c + + + + nftcr + FTC + + + + h1ing + halving + + + + + + 8n5 + subs + + + + + + + + + fOI + foci + + + + seyeral + + + + + + + + + + + tecson + Tucson + + + + + + + + + + + 
+ + + + + + looc + loc + + + + lenf + len + + + + + + + + flint + + + + + + + + + + + + + + + conlillg + coning + + + + + + + + + + + + + + + conferelHe + confer + + + + + + + + + ic1110on + tenon + + + + + + + + deeilci + deicing + + + + + + + + + wonld + + + + + + best + chest + + + + + + + + + + eidetic + + + + + + + Timesday + Immediacy + + + + + + olller + + + + flint + + + + + + + + + pncco + bilcco + bacon + Nucor + bicorn + + + + + + bc + he + + + + + + + + + + TIll + Tin + + + + + + + + + compnrtilv + comparative + + + + smi + snmal + semi + + + + Lt + hi + + + + + Thrsduy + + + + + + 7riday + + + + + + + + + + + + Qrs + IQs + + + + secm + Secom + + + + in + + + + he + + + + anxions + anions + + + + + + 1sposc + + + + or + + + + + fhe + he + + + + host + + + + + + + + ill + iii + + + + + + h1rn + + + + + + + + + + + + + + OFT + + + + + + + + + + + F018IRI + + + + + + + + + + + + + + Omens + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Stutc + Stale + Stout + + + + Fcdemtiou + Fe1erltiion + Facetious + + + + + + romcns + 1lonicmis + Omens + Romans + + + + + + + + + + + outlincs + ontliues + continues + + + + folic + + + + + + + + + Rtudy + Trudy + + + + ill + + + + + + + + l1l + mist + + + + + + + iii + + + + + + + + omens + + + + + + + t + + + + + + importanco + itnportance + ignorance + + + + + + thc + + + + studyof + sullenly + study + + + + + + + + + + + + + ill + + + + lite + t111 + elite + + + + + + + P1ioclamntio1 + 1noclamatioi + + + + + + Presideut + + + + Hoose + Itoose + Loose + Hose + Mitoses + + + + + + + + veldt + + + + while + + + + + + ol1ie + offce + notice + office + + + + + + time + + + + 8choo + sdtoo + school + stood + + + + + + + + + + + llnitc1 + + + + + + + + + + + wllieh + Willie + + + + menns + menus + + + + + + + T1ce + + + + + + + + + + obc1c + observe + + + + + + + + + + + + + out + + + + eniou + Cniom + envious + CIO + + + + + + + + iu + + + + time + + + + + + + schools + + + + + elmoolsTt + Schools + 
embolus + + + + + Tt + + + + + + + + lhl + hit + All + + + + + + sho1111 + + + + clehm + celehraft + celebrate + clam + + + + + + + ArLoI + Carlo + + + + + + thOllghtfully + ilmottghffnlly + + + + fu1 + + + + WWI + ivitli + civil + + + + + ih + in + ii + + + + you + + + + lifetiil1c + lifetiine + lifeline + + + + till + flue + + + + + + + + + or + + + + + + + + heeome + hereon + + + + CtioU5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + nnd + nd + + + + respond + + + + + ibilHies + sibiltlies + 1sibiltlies + sibilates + ribaldries + sibylline + + + + whichm + + + + + + npom + nom + + + + + + ir + it + + + + + Vp + + + + + + + + lelrn + + + + tlw + law + + + + hings + hinges + + + + YOll + Yell + + + + wil + wail + + + + + + + iiecd + ivied + + + + + + + + whn + + + + + + 5011011 + + + + + + I + + + + + + + oel + Joel + + + + + + bill + + + + smiler + miler + + + + fhe + he + + + + cone + + + + + + + + + ijuei1CCS + finances + + + + + + my + + + + + + whieh + ivhielm + civil + + + + iu + iii + + + + H + + + + + + + + + Jouth + youth + Joust + + + + + + + + + + he + + + + dny + tiny + deny + + + + + + + 7xirilliotit + + + + solving + + + + + + COllunH + Collin + + + + + + + jlWl1t + oilfield + + + + Iillsballding + Dillydallying + + + + + + CXPct + + + + + + PC1 + + + + + attn + + + + + + tho + thou + + + + + + whoe + in1ose + whole + + + + InblJ + lIbo + labor + limbo + + + + + + + diflieulty + difficulty + + + + filHl + fool + fill + + + + hint + + + + + + bare + + + + + + + + + + + + + + + + + + + + tnie + tie + + + + + + + + + + + + a + + + stOIl + Aston + tOIl + + + + + litsc + flits + + + + + + + + wool + + + + + + + + + + + + + + + + + fiictoiy + fiction + + + + + + X001 + + + + anti + + + + + + lime + + + + + + timl + rims + + + + + n + + + + + + + + + + Wlcn + Wllel + Wiener + Lcm + Wale + + + + yon + + + + hII + helil + II + hell + + + + + + + + + OUl + Foul + + + + fOlests + foests + fogless + + + + 01 + + + + plint + plaint + + + + der + nevi + 
deer + + + + + cues + + + + you + + + + arc + + + + aeting + eating + + + + UI + + + + + + + + + + + + + rhe + rhea + + + + + + + + + + + + + + + there1olc + + + + + + lie + + + + + + + + thr + thru + + + + + sciool + ccilools + schools + school + Cyclops + + + + whidl + while + + + + + + + + mnlw + ml + + + + + + + itizcns + + + + + + you + + + + + + + + Arhor + + + + lay + + + + + xcrciscs + crises + + + + + + yon + + + + + + realie + renlizc + relic + + + + wh1 + whiff + + + + + + + elch + Welch + + + + onc + conc + + + + cotton + + + + recio + teeeiyes + recoil + + + + + froin + groin + + + + + + + + aud + Maud + + + + + + hy + + + + Reiji + + + + + siJtanee + sitailcC + distance + shitake + shirttail + + + + + + + + + + contin1H + enmmtinne + emptiness + + + + + + + + + + + hill + + + + scree + screed + + + + + + Aood + Wood + + + + ellil + elli + + + + + + + + + + + + + + + + + + + + + + + + + + + STEIUrnG + STEIILING + Steering + STENCILING + + + + + + Ort + + + + n + 31 + + + + + Iittle + + + + Ollie + Colic + Mollie + + + + Caytvood + Cawed + Cato + + + + Smouthsolll + 1Smontlmsold + + + + + qmghtel + might + + + + + + + + + + ll1s + Julies + + + + Onno + Guano + Ono + + + + Cis + CPIs + + + + + + + + + + + + + + + pJatyin + playin + partying + + + + Joni + + + + + + + + + whell + rhea + hell + + + + Hie + Hide + + + + + + + + tn + + + + + + + + + 1100J + hoar + + + + + + herai + Hebraic + + + + strstck + + + + + + nnll + amid + null + + + + + alit + + + + + + + + + + + + + + + + hcad + chad + + + + + + + + + + + eausint + Austin + + + + thc + time + + + + + + + + time + + + + + + + + + Missy + + + + + + + + lestroycd + + + + TJic + Tic + + + + + + + + + + + + + + skirted + + + + + + + + + + lnonght + lining + + + + + + + + + + + + + + + + + ykerc + Ayer + + + + + + examination + + + + + + + + + + + + + + + + + + + + + + + + + + NfW + NW + + + + + + + WARLHOUSE + Warehouse + + + + + + + + + + + + + + + + + + Stcckyards + Stcckyartls + + + + + + + + + + Nex 
+ + + + + Week + + + + + Weeklime + Seethe + Weeklies + + + + + iht + lime + ht + + + + tobweo + towhee + + + + uarchnnse + urchins + + + + iehiefl + Ethel + + + + + + + + + IetNl + creeted + Viet + created + + + + + + + + Hroadway + Roadway + + + + + + + + yaids + yauls + aids + hauls + + + + + + IwaliuA + ueating + Italia + eating + + + + + + 1111 + + + + iifll + if + + + + + Iw + + + + + + 10 + + + + retcive + + + + tohaco + lobaceo + cloacae + + + + + + + + + Hlt + limst + list + Halt + + + + or + + + + + + wlek + + + + + + bouse + hOllC + blouse + holly + + + + + + he + + + + + ued + cued + + + + + + Stwnrt + + + + raylor + + + + + + are + + + + + huyillg + + + + tohaeo + Theo + + + + + + + + oullly + + + + + + + + + + + + + + + + + + MING + + + + + + + + trice + + + + + + + + Kas + Keas + + + + + + + + + + + + Repair + + + + + Jhe + lime + He + + + + + + + + + + hu1mo + + + + Ht + + + + + + + I0iwcr + + + + + + + + 11lllS + ttuts + tufts + + + + thu + thou + + + + strcet + + + + + ctr + ctrl + + + + + + hecn + hen + + + + + + amid + + + + lomitlay + gloomily + + + + + ftcl1ool1 + Iftcrnoon + + + + time + + + + ear + + + + + + tmted + tempted + + + + + + + + + + + + + hcen + chen + + + + + + oml11issil11 + conmission + + + + + + + ntarly + ueani + quean + + + + nice + + + + + + ntul + au1 + null + + + + tbis + tbs + + + + + + till + + + + + + + frollblc + rollback + + + + + + ompnny + + + + 111s + hts + hats + + + + cx + + + + + pericncecl + peieneed + prince + princely + preened + + + + + + + + Ahont + AlOUt + Hon + LOUt + + + + + + weekm + week + + + + + + + + + + + + + + + Boils + + + + hurned + btuncd + churned + obtund + + + + uut + ut + + + + + + + + + rcpleed + rippled + + + + + + oing + doing + + + + + + time + + + + + + + ondition + + + + + + + + uiiut + utility + + + + cOih + couch + + + + + + + + + + + + + supi + upi + + + + + + maehiuc + Mathieu + + + + + + + nil + + + + neiv + nevi + + + + + + whih + ivhiclt + vehicle + + + + tuck + + + + + + + + 
+ + + davs + days + Davis + + + + + + + + + + + RfV + RV + + + + + + + + + + + + + WATIS + WAITS + + + + + COMPlIMfNUO + Compliment + + + + + Whitcsburg + Whites + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Itey + Pitney + + + + Bd + ld + Emil + + + + + + 11atts + + + + hns + hens + + + + + + seen + + + + + + + tho + thou + + + + rcthltist + ltethe1ist + recitalist + + + + Confcrenec + + + + + + + + + elmarge + enlarge + + + + + + thc + time + + + + ell11rcb + clutmeh + clutch + + + + + + llhitcs1urg + Whites + + + + + + + + + + + thtt + + + + + + puhlislmes + + + + thl + tin + hl + + + + + + + + + + + liini + iii + Hindi + + + + + + + + + + + nfto + nastnr + info + nastier + + + + 01 + + + + time + + + + + + + + + h1t1ch + chnrdt + chard + + + + Sonth + Sonilm + Soil + + + + + + 11riiesbnrg + Writs + + + + 11Iil + 5111 + + + + Cool + + + + + Sot + + + + + + + + ntthc + natiyC + nth + nattily + + + + + + Clnrk + Clank + + + + eonnty + entry + + + + + + + Arter + Carter + + + + + + feiv + five + + + + yur + yers + errs + your + Ayers + + + + Ipcnt + Pct + + + + + + + + distritt + + + + + + + 0f + + + + + + connty + comiy + comity + + + + h0 + + + + bcram + cram + + + + + asnsciatelt + asusciateei + associated + associate + + + + iiith + + + + + + 1i1111 + irnt + dint + + + + + + eontrnctor + + + + + Iud + 11111 + + + + + + + + Avhich + Avouch + + + + + + + + + + + + + nlmher + number + lamer + + + + lieitmg + alienist + + + + rllsiluUS + + + + + + gain + game + + + + + + King + + + + + + + + h0 + + + + agaiu + + + + entcrelJ + encore + + + + + sel1001 + + + + iind + rind + bind + + + + prept1el1 + + + + + + foi + fm + foci + + + + + tcnching + eaching + cinching + beaching + + + + + + + + he + + + + + + + + + fot + foot + + + + thrce + throe + thrice + + + + + + mudding + + + + + + fils + fills + + + + + + + ccrliticde + critic + + + + Xot + Oat + + + + + + satisfic + satisiiec + artistic + satisfied + + + + + + + + + tmininA + minima + + + + 
hc + be + + + + catered + + + + hentuckr + + + + + Weslcynn + Ileslevan + Leslie + + + + Collcg + + + + + + 1900 + + + + hlking + takimmg + hulking + + + + tit + + + + + ourse + e0t15C + bourse + + + + lending + + + + + + + + + + + + + + + + + iu + + + + + + + + + + 1910 + + + + + + + was + + + + + + + + + + thal + hal + + + + lie + + + + wns + ivps + awns + VPs + + + + conic + + + + + + + + + lie + + + + + + culled + + + + + + tlu + the + talus + + + + + + + lie + + + + + + licnsell + license + + + + + + prcach + + + + + SeptcmhN + Septic + + + + I + + + + 1001 + 1005 + Flock + + + + + + + + + + tall + + + + + snme + name + + + + + + ime + + + + + + 1lrecd + + + + + + + + + + + + + + + BonHyitlC + Beattyville + Calycle + Bony + Betty + + + + + + Campton + Camplol + Champion + Compton + Campo + + + + Junc + Junco + + + + + tiOI1 + ion + + + + tissio1 + + + + + + thc + + + + nnunal + anunal + + + + corer + + + + + cnce + fence + cancel + + + + + + + + It + + + + Ptris + Tries + + + + + + 1000 + 1OOtl + + + + hI + + + + + + + aail1 + aaimi + asian + aim + + + + assi11cd + + + + + + + + + + whcn + wher + where + + + + + + + sotYCIl + sooty + + + + + + scut + scout + + + + + + + + + + + 110 + + + + extcnd + + + + Talus + + + + + + coidial + conidia + + + + irel + weal + ire + + + + + + eons + + + + monA + aulong + along + + + + + + + SUPPLEIIWl + SUPPlEMfNT + Supple + Supplement + + + + + Supplemental + + + ION + + + + + + + + + + + + + Tuestray + Testacy + + + + + + + + + + + + + + + + tlay + lay + + + + + + + + + + + thc + + + + lirpt + flirt + + + + + + + + thr + flit + thru + + + + + supplemcnt1 + su11lemental + supplemental + + + + + + Tl1ol + + + + wlic + laic + + + + + we1C + + + + + + + + + + + + tile + time + + + + regulttl + rebuttal + + + + Greg + + + + + istmtiol1 + istatinti + distraction + instating + + + + dtty + ditty + + + + + + + + sie1 + + + + + + nnnhlt + + + + + + + + + + + fhe + he + + + + + + + + registe + + + + + tithe + + + + To1day + + + + 
fucsday + lncscfay + fuchsia + + + + dr + rr + + + + WNlnei + 11ed11e + Swedes + Winnie + + + + + tIny + + + + Ai + st + + + + nonn + non + + + + Monlay + + + + 32 + + + + Dl11l0erat + Iemoerat + Immoderate + + + + + ohd + Ihrd + hill + od + Hard + + + + + + + + Hpnhlinns + Rcpuhlicnus + Pangolins + + + + + + + + + + + + POLICE + + + + + + + + + + + 1181 + acre + + + + then + + + + r01111d + + + + + + time + thou + + + + + poJicc + polico + poi + polio + + + + gnttrt + gantry + + + + + + niglmt + Nigel + + + + anl + anal + + + + + + + + + + + + + + + + + + + + + + timefirs + tilefish + + + + + + + + + + + Alva + + + + + + + + lulins + Billings + Luis + + + + + + + + + + + life + + + + + + + + + + + + IliSlE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Walked + + + + + + + Lancaster + + + + + Thl + Hl + + + + Sllciill + + + + + + or + + + + + + + + Ch + + + + + + cubit + + + + + + lens + + + + + + 3fonlny + + + + 11Wl11 + ntoin + Antoine + + + + + King + + + + llw + Time + allow + + + + tCl1U + + + + + + alled + allied + + + + fOl + foal + + + + 1110 + + + + + + + + + frying + + + + + + LiIt + Licit + + + + wilt + + + + Chl + Hl + + + + + + + ndtieli + Intel + + + + + + Esthel + Estlmer + Ethel + Settler + + + + C01whmnn + Cotdtman + Coachman + Ottoman + + + + + + + + + + + + + hreak + hreal + real + + + + time + + + + + + Qf + + + + Mr + + + + + Clnih0l11c + Clarion + + + + LiHlc + Lilac + + + + Lessrs + Messes + Lessors + + + + lhos + Ethos + los + + + + lisOt + list + + + + + + + + fnd1J1ts + femmdants + fondants + + + + Jucle + Julge + Joule + + + + + + ml + + + + + + + + tUiS + thIi + thefts + etuis + Tahiti + + + + + + + + + + + + + file + + + + ease + + + + + + Julge + Joule + + + + + 11arlker + + + + + + Luucastel + Lucas + + + + Imit3 + + + + cc + + + + + + + + + + + thc + + + + GocJlUr + + + + 1l + + + + wiJI + ivill + WWII + civilly + + + + take1 + + + + + Limo + + + + + + + + tul + amid + truly + + + + pIubaby + piquancy + + 
+ + + + of + ROTC + + + + + ncxt + + + + + + + + heal + heu + hue + + + + Idle + Hide + + + + testimony3 + + + + + loJHlay + aloha + + + + nioruing + snoring + + + + + + cons11IDCd + + + + ih + iii + + + + + seJccting + ejecting + + + + + + + + lnd + 1111 + land + + + + inul1cdinleh + immncdiatelr + immediate + + + + + + + time + + + + + + tlJjoUl1lment + tdjoutatment + + + + theaat + ethicist + threat + + + + + journeys + + + + 101 + + + + + + sids + + + + + + tliti + tho + lit + thou + + + + + ease + + + + + PolloinA + Follotving + Polloi + + + + nre + re + + + + + + + + seietel + scented + semimetal + + + + + Elins + Ellis + + + + + + + + + + Horton + + + + QliIT + Licit + + + + + + + + + + 1erryman + Ferryman + + + + + + + + Gill + + + + + + + + + 8kn1I + Skinnier + + + + + + + + Heflin + Gillian + Hellion + + + + Chic + + + + lIndiov + Radler + Ladle + indigo + Adler + + + + + + + ii + + + + Ada111s + + + + n + It + + + + + + + + + FLANGE + + + + + + + + + Stcpp + Step + TCP + + + + + + Iias + Ixias + + + + heron + + + + + + + + + dipthelin + hptherla + diphtheria + dithering + patella + + + + b + + + + iniproing + indisposing + + + + + + + Margatret + + + + Bralley + + + + or + 01 + + + + 11iui + + + + + elmester + semester + + + + isited + iisite1 + + + + + + + + + + + + + + + + + H + + + + KinA + + + + + + Sunjlfty + Sunlit + + + + + + + + + + + + Joc + Jock + + + + + + atIll + tIll + + + + + + nnd + nd + + + + tWQ + ttiyp + IV + TRW + tip + + + + + + + hl + + + + + hen + den + + + + + + + + isitel + + + + foresails + + + + + herc + her + + + + + + + + + + Hie + idle + Hide + + + + Pl1st + + + + weep + + + + + II + + + + Shcrmtn + Shermun + Sherman + Shorten + + + + + + + + Diarists + + + + + + + + + + + hb + + + + sistel + sisterly + + + + + + + + + + + lung + + + + + + + + + + + Wntts + 1latls + Wants + + + + mind + + + + Mrs + + + + C + + + + + IcDonald + + + + + + rOoleiill + Mooesville + Doorsill + ocotillo + + + + spimi + simi + + + + + + + + + + + 
cdncsday + 11ednesday + cadences + + + + + + + + + + + + Dn4n + Unsung + + + + kb + lo + + + + + + + rahonc + Mahoue + Mahoney + aphonic + Madhouse + + + + 111 + + + + + + + + + tenderl + tenderly + + + + + + + + MI + + + + Sterliuj + Sterling + + + + ilaSl + villas + + + + + Momiday + + + + + llrs + alls + + + + + + OamJw + Gamboled + Foam + + + + aocl + aol + + + + littlo + + + + cleric + + + + + Eizaheth + + + + Insku + ltsko + Minsk + also + + + + arc + + + + + + + + + nt + + + + + + + Eddo + + + + Iiughcs + + + + tlCl + amid + tlC + + + + IJUlc + Ilttib + Kiltie + + + + sni + sri + snip + + + + Wood + + + + + + + + + Vinehcster + Vilichestee + Filches + + + + + + + + + + + + + + + friendti + friend + + + + + + + + + + + + + + + + MondayMess + Monday + + + + Bamc + Bmltltei + Banc + Mullein + + + + Daily + + + + + + + + HieJ + siik + Hide + Sikh + + + + + vith + itch + + + + fnm + nm + + + + + + + + + H + + + + + + + + + + + + Ler + Alex + Leer + + + + + + + + + reccntll + recent + + + + + + + + + + tho + thou + + + + smtl + smut + + + + + clen + clean + + + + denth + dent + + + + + + + + fnther + fatlmer + nether + falter + + + + + + + + + + + + + + + pcople + + + + + + hwf + haw + + + + solid + + + + + + + + + + + + + amid + + + + 12ti + + + + cetit + centrist + + + + + + + + + Iowell + Lowell + + + + hought + honght + thought + Hong + + + + a + + + + pnh + Penh + + + + + + trifles + + + + + + + riddletowll + Middletowlt + Middlebrow + riddle + + + + + + wcek + + + + 101 + + + + + + + WELKIN + + + + + + + GeOJgo + + + + + + + + Mrfi + Ht + Mari + + + + + + + FostCl + Foist + + + + Incl + and + Inc + + + + + + SOl1 + + + + Emme15Q + Hemmers + + + + + + + + + Lcxington + + + + returnccl + return + + + + hObte + hotmic + hoi + hotbed + rhombic + + + + Fiiday + + + + + aftcr + + + + n + + + + plnsant + pliant + + + + visit + + + + + + het + liar + heat + + + + sisal + + + + + + + + nnd + nd + + + + hlother + loather + + + + mfrs + irs + frs + + + + Alie + + + + 
Blye + Belo + Lye + + + + rand + + + + altar + + + + + + + IIardy + Hard + Iliad + + + + + Mir + + + + + + Mrs + + + + + + Embank + + + + attest + + + + + + + fl1nerul + + + + + + Inver + Invert + + + + sistCl + sisterly + + + + + + 1ait + Leah + + + + + + + + + + + Beldam + + + + + muss + + + + ESRic + Bessie + ERic + + + + + + + + frielhr + fricdtis + Africans + filcher + friction + + + + + + + 11inehcster + + + + + + sccrnl + scorn + + + + dnys + dens + + + + + + + Proof + + + + + + + + + + mchiiol + chili + + + + + of + + + + neconnt + nescient + + + + + + dipthcria + diptheria + dithering + diphtheria + + + + nnd + nd + + + + sacHet + + + + + + + + feverish + + + + + Iiss + Ibises + + + + + + HodJildn + lndglau + Hodgkin + Shoji + landlady + + + + + + 011 + ou + + + + tho + thou + + + + liik + eiek + Reek + like + geek + + + + + Iit + It + + + + + m1s + + + + AlieC + Alec + + + + ntye + Blc + Rally + nye + Bloc + + + + + + tho + time + thou + + + + nuts + + + + + + + + + + + + + + + + + + + + + + Embank + + + + + + + + + + + + time + + + + pleannt + leant + + + + + + + + dmiS + admits + + + + + + + llr + + + + + + Ilodgkin + Lodging + + + + + + + + + + + + + + Mts + Mats + + + + + + lllyc + Lye + + + + wns + awns + + + + rlsitingg + listing + + + + frigid + + + + + + + Winchcstel + wiuchestcr + + + + Flday + Flay + + + + + + Saturn + + + + + + + Urother + UlOthel + Further + Loathe + + + + Banderols + + + + 01 + + + + + + + + + nt + + + + + + Elkins + Eking + Welkins + + + + + + Stiii + Siam + + + + + d1y + + + + mOl1ling + + + + + + + + + + + + + + + + + + + + + + + + + + + + VOTES + + + + + + + + AH + + + + + + memhets + nletnheis + mementos + menthes + + + + + + time + + + + IndoH9nd + Indepgnd + Indeed + Independent + + + + + ent + Kent + + + + + + + + ClulHiro + Clumsier + Collier + + + + Lori + + + + + + + + + + + mat + + + + + + + + + + + + + It + + + + + + + + + + + + + Prcideht + Pride + + + + + WU + + + + COPIlEIl + COPIIEIt + COPIER + Coquille + 
Copyedit + + + + + + + + + + Alwayz + + + + + + + + + + + + + + + + + + + + + + + + + success + + + + + soar + + + + + + cen + cent + + + + + + + + + + Baltimore + + + + + + + + + + + + Iti + + + + CUSS + + + + + + + + + Cottngc + Clotting + + + + + + Browmi + BlOUU + Brownie + Lou + + + + + Louie + + + + + + + Beatty + Pertly + Batty + + + + Ewes + + + + officc + + + + + 10Utf + + + + + + + + + fiat + + + + + + + + + + + + + + + C111 + + + + Ifome + ionic + Biome + + + + + + 84BA + 898A + + + + + i0313t + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/tests/data/test.alto2.xml b/qurator/dinglehopper/tests/data/test.alto2.xml new file mode 100644 index 0000000..67d3537 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto2.xml @@ -0,0 +1,64 @@ + + + +pixel +2017-03-27ABBYYABBYY FineReader Engine11 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/tests/data/test.alto3.xml b/qurator/dinglehopper/tests/data/test.alto3.xml new file mode 100644 index 0000000..6986560 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto3.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/test.page2018.xml b/qurator/dinglehopper/tests/data/test.page2018.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + 
+ + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. — + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. 
— Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + + glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau 
Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. 
— Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. — + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. 
— + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test.txt b/qurator/dinglehopper/tests/data/test.txt new file mode 100644 index 0000000..41bfe81 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.txt @@ -0,0 +1 @@ +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. \ No newline at end of file diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py new file mode 100644 index 0000000..1f29290 --- /dev/null +++ b/qurator/dinglehopper/tests/test_align.py @@ -0,0 +1,63 @@ +from .util import unzip +from .. 
import align + + +def test_left_empty(): + result = list(align('', 'foo')) + expected = [(None, 'f'), (None, 'o'), (None, 'o')] + assert result == expected + + +def test_right_empty(): + result = list(align('foo', '')) + expected = [('f', None), ('o', None), ('o', None)] + assert result == expected + + +def test_left_longer(): + result = list(align('food', 'foo')) + expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)] + assert result == expected + + +def test_right_longer(): + result = list(align('foo', 'food')) + expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')] + assert result == expected + + +def test_some_diff(): + result = list(align('abcde', 'aaadef')) + left, right = unzip(result) + assert list(left) == ['a', 'b', 'c', 'd', 'e', None] + assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f'] + + +def test_longer(): + s1 = 'Dies ist eine Tst!' + s2 = 'Dies ist ein Test.' + + result = list(align(s1, s2)) # ; diffprint(*unzip(result)) + expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '), + ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '), + ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '), + ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')] + assert result == expected + + +def test_completely_different(): + assert len(list(align('abcde', 'fghij'))) == 5 + + +def test_with_some_fake_ocr_errors(): + result = list(align('Über die vielen Sorgen wegen desselben vergaß', + 'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab')) + left, right = unzip(result) + + # Beginning + assert list(left[:18]) == [None]*18 + assert list(right[:18]) == list('SomeJunk MoreJunk ') + + # End + assert list(left[-1:]) == ['ß'] + assert list(right[-1:]) == ['b'] diff --git a/qurator/dinglehopper/tests/test_character_error_rate.py b/qurator/dinglehopper/tests/test_character_error_rate.py new file mode 100644 index 0000000..b16d37c --- /dev/null +++ b/qurator/dinglehopper/tests/test_character_error_rate.py 
@@ -0,0 +1,37 @@ +from __future__ import division, print_function + +import math +import unicodedata + +from .. import character_error_rate + + +def test_character_error_rate(): + assert character_error_rate('a', 'a') == 0 + assert character_error_rate('a', 'b') == 1/1 + assert character_error_rate('Foo', 'Bar') == 3/3 + + assert character_error_rate('Foo', '') == 3/3 + + assert character_error_rate('', '') == 0 + assert math.isinf(character_error_rate('', 'Foo')) + + assert character_error_rate('Foo', 'Food') == 1/3 + assert character_error_rate('Fnord', 'Food') == 2/5 + assert character_error_rate('Müll', 'Mull') == 1/4 + assert character_error_rate('Abstand', 'Sand') == 4/7 + + +def test_character_error_rate_hard(): + s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.') + s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed! + assert character_error_rate(s1, s2) == 1/19 + + s1 = 'Schlyñ' + assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + s2 = 'Schlym̃' + assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + + # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. + assert character_error_rate(s2, s1) == 1/6 + assert character_error_rate(s1, s2) == 1/6 diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py new file mode 100644 index 0000000..fa901a8 --- /dev/null +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -0,0 +1,40 @@ +from __future__ import division, print_function + +import unicodedata + +from .. 
import levenshtein, distance + + +def test_levenshtein(): + assert levenshtein('a', 'a') == 0 + assert levenshtein('a', 'b') == 1 + assert levenshtein('Foo', 'Bar') == 3 + + assert levenshtein('', '') == 0 + assert levenshtein('Foo', '') == 3 + assert levenshtein('', 'Foo') == 3 + + assert levenshtein('Foo', 'Food') == 1 + assert levenshtein('Fnord', 'Food') == 2 + assert levenshtein('Müll', 'Mull') == 1 + assert levenshtein('Abstand', 'Sand') == 4 + + +def test_levenshtein_other_sequences(): + assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1 + assert levenshtein(['a', 'ab'], ['a', 'c']) == 1 + + +def test_distance(): + assert distance('Fnord', 'Food') == 2 + assert distance('Müll', 'Mull') == 1 + + word1 = unicodedata.normalize('NFC', 'Schlyñ') + word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed! + assert distance(word1, word2) == 0 + + word1 = 'Schlyñ' + assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + word2 = 'Schlym̃' + assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + assert distance(word1, word2) == 1 diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py new file mode 100644 index 0000000..ce22377 --- /dev/null +++ b/qurator/dinglehopper/tests/test_editops.py @@ -0,0 +1,38 @@ +from .. 
import seq_editops, editops + + +def test_trivial(): + assert seq_editops('abc', 'abc') == [] + assert seq_editops('', '') == [] + + +def test_insert(): + assert seq_editops('bc', 'abc') == [('insert', 0, 0)] + assert seq_editops('ac', 'abc') == [('insert', 1, 1)] + assert seq_editops('ab', 'abc') == [('insert', 2, 2)] + assert seq_editops('', 'a') == [('insert', 0, 0)] + + +def test_multiple(): + assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)] + + +def test_delete(): + assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)] + assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)] + assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)] + assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)] + assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)] + assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)] + + +def test_ambiguous(): + assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)] + + +def test_editops(): + """Test editops() in cases where dealing with grapheme clusters matters""" + + # In these cases, one of the words has a composed form, the other one does not. + assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)] + assert editops('oͤde', 'öde') == [('replace', 0, 0)] diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py new file mode 100644 index 0000000..df1e230 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -0,0 +1,23 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. 
import align, page_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_align_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. + # → 4 elements in the alignment should be different. + # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. + + gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + + result = list(align(gt, ocr)) + assert sum(left != right for left, right in result) == 4 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py new file mode 100644 index 0000000..c27cd31 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -0,0 +1,35 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import character_error_rate, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_character_error_rate_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. 
+ gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + +@pytest.mark.integration +def test_character_error_rate_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert character_error_rate(gt, ocr) == 0 + + +@pytest.mark.integration +def test_character_error_rate_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert character_error_rate(gt, ocr) == 8/591 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py new file mode 100644 index 0000000..2857d56 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -0,0 +1,35 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import distance, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_distance_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. 
+ gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert distance(gt, ocr) == 4 + + +@pytest.mark.integration +def test_distance_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert distance(gt, ocr) == 0 + + +@pytest.mark.integration +def test_distance_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert distance(gt, ocr) == 8 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py new file mode 100644 index 0000000..1d2dead --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -0,0 +1,43 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import word_error_rate, words, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_word_error_rate_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. 
→ 3 changed words + gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + + gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line + assert len(list(words(gt))) == gt_word_count + + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert word_error_rate(gt, ocr) == 3/gt_word_count + + +@pytest.mark.integration +def test_word_error_rate_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert word_error_rate(gt, ocr) == 0 + + +@pytest.mark.integration +def test_word_error_rate_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + + gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line + assert len(list(words(gt))) == gt_word_count + + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py new file mode 100644 index 0000000..694d548 --- /dev/null +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -0,0 +1,99 @@ +import os +import re + +import lxml.etree as ET +import textwrap + +from .. 
import alto_namespace, alto_text, page_namespace, page_text, text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +def test_alto_namespace(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#' + + +def test_alto_text(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + result = alto_text(tree) + expected = textwrap.dedent("""\ + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern.""") + assert result == expected + + +def test_alto_text_ALTO1(): + tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml')) + assert "being erected at the Broadway stock" in alto_text(tree) + + +def test_alto_text_ALTO2(): + tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml')) + assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree) + + +def test_alto_text_ALTO3(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree) + + +def test_page_namespace(): + tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) + assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15' + + +def test_page_test(): + tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) + result = page_text(tree) + expected = textwrap.dedent("""\ + ber die vielen Sorgen wegen deelben vergaß + Hartkopf, der Frau Amtmnnin das ver⸗ + ſproene zu berliefern. — Ein Erpreer + wurde an ihn abgeſit, um ihn ums Him⸗ + melswien zu ſagen, daß er das Verſproene + glei den Augenbli berbringen mte, die + Frau Amtmnnin htte  auf ihn verlaen, + und nun wßte e nit, was e anfangen + ſote. Den Augenbli ſote er kommen, + ſon vergieng e in ihrer Ang. — Die + Ge wren ſon angekommen, und es fehlte + ihr do no an aem. 
— + Hartkopf mußte  er bennen, und + endli na langem Nadenken fiel es ihm er + wieder ein. — Er langte den Zettel aus dem + Accisbue heraus, und ſagte ſeiner Frau, daß + e das, was da wre, herbeyſaffen mte. + Jndeß mangelten do einige Generalia, die + alſo wegfielen. — Hartkopf gieng ſelb + mit und berbrate es. —""") + assert result == expected + + +def test_page_with_empty_region(): + # This file contains an empty TextRegion: + # + # + # + # + # + # + # + tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml')) + result = page_text(tree) + assert result + + +def test_page_order(): + # This file contains TextRegions where file order is not the same as reading order. + tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) + result = page_text(tree) + + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + + +def test_text(): + assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) + assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) diff --git a/qurator/dinglehopper/tests/test_word_error_rate.py b/qurator/dinglehopper/tests/test_word_error_rate.py new file mode 100644 index 0000000..a707229 --- /dev/null +++ b/qurator/dinglehopper/tests/test_word_error_rate.py @@ -0,0 +1,45 @@ +from __future__ import division, print_function + +import math + +from .. 
import word_error_rate, words, unordered_word_error_rate + + +def test_words(): + result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?')) + expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder'] + assert result == expected + + +def test_words_private_use_area(): + result = list(words( + 'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n' + 'ſproene zu berliefern.')) + expected = [ + 'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf', + 'der', 'Frau', 'Amtmnnin', 'das', 'ver', + 'ſproene', 'zu', 'berliefern'] + assert result == expected + + +def test_word_error_rate(): + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 + assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 + assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0 + + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4 + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4 + + assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4 + assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!')) + assert word_error_rate('', '') == 0 + + assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6 + + +def test_unordered_word_error_rate(): + assert unordered_word_error_rate('abc def ghi', 'ghi abc def') == 0 + assert unordered_word_error_rate('abc def ghi', 'ghi abcX def') == 1/3 + assert unordered_word_error_rate('abc def ghi jkl', 'abc ghi def jkl') == 0 + assert unordered_word_error_rate('abc def ghi jkl', 'abc ghi defX jkl') == 1/4 + # XXX There seem to be some cases where this does not work diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py new file mode 100644 index 
0000000..cb4dc13 --- /dev/null +++ b/qurator/dinglehopper/tests/util.py @@ -0,0 +1,24 @@ +from itertools import zip_longest +from typing import Iterable + +import colorama + + +def diffprint(x, y): + """Print elements or lists x and y, with differences in red""" + + def _diffprint(x, y): + if x != y: + print(colorama.Fore.RED, x, y, colorama.Fore.RESET) + else: + print(x, y) + + if isinstance(x, Iterable): + for xe, ye in zip_longest(x, y): + _diffprint(xe, ye) + else: + _diffprint(x, y) + + +def unzip(l): + return zip(*l) diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py new file mode 100644 index 0000000..3ccfdfc --- /dev/null +++ b/qurator/dinglehopper/word_error_rate.py @@ -0,0 +1,69 @@ +from __future__ import division + +import unicodedata + +import uniseg.wordbreak + +from .edit_distance import levenshtein + + +def words(s): + # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also + # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt + old_word_break = uniseg.wordbreak.word_break + + def new_word_break(c, index=0): + if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area + return 'ALetter' + else: + return old_word_break(c, index) + uniseg.wordbreak.word_break = new_word_break + + # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar + def unwanted(c): + + # See https://www.fileformat.info/info/unicode/category/index.htm + # and https://unicodebook.readthedocs.io/unicode.html#categories + unwanted_categories = 'O', 'M', 'P', 'Z', 'S' + unwanted_subcategories = 'Cc', 'Cf' + + subcat = unicodedata.category(c) + cat = subcat[0] + return cat in unwanted_categories or subcat in unwanted_subcategories + + # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using + # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." 
+ for word in uniseg.wordbreak.words(s): + if all(unwanted(c) for c in word): + pass + else: + yield word + + +def words_normalized(s): + return words(unicodedata.normalize('NFC', s)) + + +def word_error_rate(reference, compared): + if isinstance(reference, str): + reference_seq = list(words_normalized(reference)) + compared_seq = list(words_normalized(compared)) + else: + reference_seq = list(reference) + compared_seq = list(compared) + + d = levenshtein(reference_seq, compared_seq) + if d == 0: + return 0 + + n = len(reference_seq) + if n == 0: + return float('inf') + + return d / n + + +def unordered_word_error_rate(reference, compared): + reference_seq = sorted(words_normalized(reference)) + compared_seq = sorted(words_normalized(compared)) + return word_error_rate(reference_seq, compared_seq) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..074aede --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +click +jinja2 +lxml +uniseg +numpy +colorama diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b613b4f --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from io import open +from setuptools import find_packages, setup + +with open('requirements.txt') as fp: + install_requires = fp.read() + +setup( + name='dinglehopper', + author_email='qurator@sbb.spk-berlin.de', + description='The OCR evaluation tool', + long_description=open('README.md', 'r', encoding='utf-8').read(), + long_description_content_type='text/markdown', + keywords='qurator ocr', + license='Apache', + namespace_packages=['qurator'], + packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']), + install_requires=install_requires, + entry_points={ + 'console_scripts': [ + 'dinglehopper=qurator.dinglehopper.cli:main', + ] + } +)