diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 3fafd07..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -__pycache__ -*.egg-info diff --git a/.screenshots/dinglehopper.png b/.screenshots/dinglehopper.png new file mode 100644 index 0000000..794ebe1 Binary files /dev/null and b/.screenshots/dinglehopper.png differ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..dc15c9a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +dist: xenial # required for Python >= 3.7 +language: python +python: + - "3.5" + - "3.6" + - "3.7" + - "3.8" + + +install: + - pip install -r requirements.txt + +script: + - pytest diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 20681e3..0000000 --- a/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3 - -ADD requirements.txt / -RUN pip install --proxy=http-proxy.sbb.spk-berlin.de:3128 -r requirements.txt - -COPY . /usr/src/sbb_textline_detector -RUN pip install /usr/src/sbb_textline_detector - -ENTRYPOINT ["sbb_textline_detector"] diff --git a/LICENSE b/LICENSE index 261eeb9..9b7a833 100644 --- a/LICENSE +++ b/LICENSE @@ -178,7 +178,7 @@ APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" + boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2019 qurator Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/README.md b/README.md index 46a8295..c14a799 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,49 @@ -# Textline Detection +dinglehopper +============ -## Introduction -This tool performs textline detection from document image data and returns the results as PAGE-XML. +dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. -## Installation +[![Build Status](https://travis-ci.org/qurator-spk/dinglehopper.svg?branch=master)](https://travis-ci.org/qurator-spk/dinglehopper) -`pip install .` +Goals +----- +* Useful + * As a UI tool + * For an automated evaluation + * As a library +* Unicode support -## Models -In order to run this tool you also need trained models. You can download our pre-trained models from here: -https://file.spk-berlin.de:8443/textline_detection/ - -## Usage +Installation +------------ +It's best to use pip, e.g.: +~~~ +sudo pip install . +~~~ -`sbb_textline_detector -i -o -m ` +Usage +----- +~~~ +dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml +~~~ +This generates `report.html` and `report.json`. -## Usage with OCR-D +As an OCR-D processor: ~~~ -ocrd-example-binarize -I OCR-D-IMG -O OCR-D-IMG-BIN -ocrd-sbb-textline-detector -I OCR-D-IMG-BIN -O OCR-D-SEG-LINE-SBB \ - -p '{ "model": "/path/to/the/models/textline_detection" }' +ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL ~~~ +This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup. -Segmentation works on raw RGB images, but respects and retains -`AlternativeImage`s from binarization steps, so it's a good idea to do -binarization first, then perform the textline detection. The used binarization -processor must produce an `AlternativeImage` for the binarized image, not -replace the original raw RGB image. 
+ +![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) + +Testing +------- +Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests): +~~~ +virtualenv -p /usr/bin/python3 venv +. venv/bin/activate +pip install -r requirements.txt +pip install pytest +pytest +~~~ diff --git a/ocrd-tool.json b/ocrd-tool.json index a1e5650..f45153c 120000 --- a/ocrd-tool.json +++ b/ocrd-tool.json @@ -1 +1 @@ -qurator/sbb_textline_detector/ocrd-tool.json \ No newline at end of file +qurator/dinglehopper/ocrd-tool.json \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c56273f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: integration tests + serial diff --git a/qurator/__init__.py b/qurator/__init__.py index b0d6433..8d17c21 100644 --- a/qurator/__init__.py +++ b/qurator/__init__.py @@ -1 +1,2 @@ -__import__('pkg_resources').declare_namespace(__name__) \ No newline at end of file +__import__('pkg_resources').declare_namespace(__name__) + diff --git a/qurator/dinglehopper/.gitignore b/qurator/dinglehopper/.gitignore new file mode 100644 index 0000000..e70d1f9 --- /dev/null +++ b/qurator/dinglehopper/.gitignore @@ -0,0 +1,6 @@ +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf diff --git a/qurator/dinglehopper/.idea/dinglehopper.iml b/qurator/dinglehopper/.idea/dinglehopper.iml new file mode 100644 index 0000000..e273926 --- /dev/null +++ b/qurator/dinglehopper/.idea/dinglehopper.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/.idea/misc.xml b/qurator/dinglehopper/.idea/misc.xml new file mode 100644 index 0000000..ba209a1 --- /dev/null +++ b/qurator/dinglehopper/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/.idea/modules.xml 
b/qurator/dinglehopper/.idea/modules.xml new file mode 100644 index 0000000..6035afb --- /dev/null +++ b/qurator/dinglehopper/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py new file mode 100644 index 0000000..0e8ee38 --- /dev/null +++ b/qurator/dinglehopper/__init__.py @@ -0,0 +1,5 @@ +from .ocr_files import * +from .substitute_equivalences import * +from .character_error_rate import * +from .word_error_rate import * +from .align import * diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py new file mode 100644 index 0000000..ab44760 --- /dev/null +++ b/qurator/dinglehopper/align.py @@ -0,0 +1,43 @@ +from .edit_distance import * + + +def align(t1, t2): + """Align text.""" + s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1))) + s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2))) + return seq_align(s1, s2) + + +def seq_align(s1, s2): + """Align general sequences.""" + s1 = list(s1) + s2 = list(s2) + ops = seq_editops(s1, s2) + i = 0 + j = 0 + + while i < len(s1) or j < len(s2): + o = None + try: + ot = ops[0] + if ot[1] == i and ot[2] == j: + ops = ops[1:] + o = ot + except IndexError: + pass + + if o: + if o[0] == 'insert': + yield (None, s2[j]) + j += 1 + elif o[0] == 'delete': + yield (s1[i], None) + i += 1 + elif o[0] == 'replace': + yield (s1[i], s2[j]) + i += 1 + j += 1 + else: + yield (s1[i], s2[j]) + i += 1 + j += 1 diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py new file mode 100644 index 0000000..f63a15f --- /dev/null +++ b/qurator/dinglehopper/character_error_rate.py @@ -0,0 +1,21 @@ +from __future__ import division + +import unicodedata + +from uniseg.graphemecluster import grapheme_clusters + +from qurator.dinglehopper.edit_distance import distance + + +def character_error_rate(reference, compared): + d = distance(reference, compared) + if d == 
0: + return 0 + + n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) + if n == 0: + return float('inf') + + return d/n + + # XXX Should we really count newlines here? diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py new file mode 100644 index 0000000..7f9ea8f --- /dev/null +++ b/qurator/dinglehopper/cli.py @@ -0,0 +1,106 @@ +import os + +import click +from jinja2 import Environment, FileSystemLoader +from markupsafe import escape + + +from qurator.dinglehopper import * + + +def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): + gtx = '' + ocrx = '' + + def format_thing(t, css_classes=None): + if t is None: + html_t = none + css_classes += ' ellipsis' + elif t == '\n': + html_t = '
' + else: + html_t = escape(t) + + if css_classes: + return '{html_t}'.format(css_classes=css_classes, html_t=html_t) + else: + return '{html_t}'.format(html_t=html_t) + + for k, (g, o) in enumerate(align(gt_things, ocr_things)): + if g == o: + css_classes = None + else: + css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) + + gtx += joiner + format_thing(g, css_classes) + ocrx += joiner + format_thing(o, css_classes) + + return \ + ''' +
+
{}
+
{}
+
+ '''.format(gtx, ocrx) + + +def process(gt, ocr, report_prefix): + """Check OCR result against GT. + + The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use + Click on a wrapper. + """ + + gt_text = text(gt) + ocr_text = text(ocr) + + gt_text = substitute_equivalences(gt_text) + ocr_text = substitute_equivalences(ocr_text) + + cer = character_error_rate(gt_text, ocr_text) + wer = word_error_rate(gt_text, ocr_text) + + char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) + + gt_words = words_normalized(gt_text) + ocr_words = words_normalized(ocr_text) + word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) + + def json_float(value): + """Convert a float value to an JSON float. + + This is here so that float('inf') yields "Infinity", not "inf". + """ + if value == float('inf'): + return 'Infinity' + elif value == float('-inf'): + return '-Infinity' + else: + return str(value) + + env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) + env.filters['json_float'] = json_float + + for report_suffix in ('.html', '.json'): + template_fn = 'report' + report_suffix + '.j2' + out_fn = report_prefix + report_suffix + + template = env.get_template(template_fn) + template.stream( + gt=gt, ocr=ocr, + cer=cer, wer=wer, + char_diff_report=char_diff_report, + word_diff_report=word_diff_report + ).dump(out_fn) + + +@click.command() +@click.argument('gt', type=click.Path(exists=True)) +@click.argument('ocr', type=click.Path(exists=True)) +@click.argument('report_prefix', type=click.Path(), default='report') +def main(gt, ocr, report_prefix): + process(gt, ocr, report_prefix) + + +if __name__ == '__main__': + main() diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py new file mode 100644 index 0000000..8ca24d3 --- 
/dev/null +++ b/qurator/dinglehopper/edit_distance.py @@ -0,0 +1,122 @@ +from __future__ import division, print_function + +import unicodedata +from functools import partial, lru_cache +from typing import Sequence, Tuple + +import numpy as np +from uniseg.graphemecluster import grapheme_clusters + + +def levenshtein_matrix(seq1: Sequence, seq2: Sequence): + """Compute the matrix commonly computed to produce the Levenshtein distance. + This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired + edit distance. + + This algorithm is implemented here because we need an implementation that can work with sequences other than + strings, e.g. lists of grapheme clusters or lists of word strings. + """ + + # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input + # sequences to tuples to make them hashable. + return _levenshtein_matrix(tuple(seq1), tuple(seq2)) + + +@lru_cache(maxsize=10) +def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): + """Compute the matrix commonly computed to produce the Levenshtein distance. + + This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead. + """ + m = len(seq1) + n = len(seq2) + + def from_to(start, stop): + return range(start, stop + 1, 1) + + D = np.zeros((m + 1, n + 1), np.int) + D[0, 0] = 0 + for i in from_to(1, m): + D[i, 0] = i + for j in from_to(1, n): + D[0, j] = j + for i in from_to(1, m): + for j in from_to(1, n): + D[i, j] = min( + D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution + D[i, j - 1] + 1, # Insertion + D[i - 1, j] + 1 # Deletion + ) + + return D + + +def levenshtein(seq1, seq2): + """Compute the Levenshtein edit distance between two sequences""" + m = len(seq1) + n = len(seq2) + + D = levenshtein_matrix(seq1, seq2) + return D[m, n] + + +def levenshtein_matrix_cache_clear(): + """Clear internal Levenshtein matrix cache. 
+ + You want to do this between different input file pairs to decrease memory + usage by not caching results from prior input files. + """ + _levenshtein_matrix.cache_clear() + + +def distance(s1, s2): + """Compute the Levenshtein edit distance between two Unicode strings + + Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme + clusters. This should be the correct way to compare two Unicode strings. + """ + s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) + s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) + return levenshtein(s1, s2) + + +def seq_editops(seq1, seq2): + """ + Return sequence of edit operations transforming one sequence to another. + + This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary + sequences. + """ + seq1 = list(seq1) + seq2 = list(seq2) + m = len(seq1) + n = len(seq2) + D = levenshtein_matrix(seq1, seq2) + + def _tail_backtrace(i, j, accumulator): + if i > 0 and D[i - 1, j] + 1 == D[i, j]: + return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator) + if j > 0 and D[i, j - 1] + 1 == D[i, j]: + return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator) + if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: + return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator) + if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: + return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP + return accumulator + + def backtrace(i, j): + result = partial(_tail_backtrace, i, j, []) + while isinstance(result, partial): + result = result() + + return result + + b = backtrace(m, n) + return b + + +def editops(word1, word2): + # XXX Note that this returns indices to the _grapheme clusters_, not characters! 
+ word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1))) + word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2))) + return seq_editops(word1, word2) diff --git a/qurator/dinglehopper/notebooks/Levenshtein.ipynb b/qurator/dinglehopper/notebooks/Levenshtein.ipynb new file mode 100644 index 0000000..f56d0d7 --- /dev/null +++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb @@ -0,0 +1,1037 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import unicodedata\n", + "import inspect" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Levenshtein edit distance" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def levenshtein_matrix(seq1, seq2):\n", + " \"\"\"Compute the matrix commonly computed to produce the Levenshtein distance.\n", + "\n", + " This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired\n", + " edit distance.\n", + "\n", + " This algorithm is implemented here because we need an implementation that can work with sequences other than\n", + " strings, e.g. 
lists of grapheme clusters or lists of word strings.\n", + " \"\"\"\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + "\n", + " def from_to(start, stop):\n", + " return range(start, stop + 1, 1)\n", + "\n", + " D = np.zeros((m + 1, n + 1), np.int)\n", + " D[0, 0] = 0\n", + " for i in from_to(1, m):\n", + " D[i, 0] = i\n", + " for j in from_to(1, n):\n", + " D[0, j] = j\n", + " for i in from_to(1, m):\n", + " for j in from_to(1, n):\n", + " D[i, j] = min(\n", + " D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution\n", + " D[i, j - 1] + 1, # Insertion\n", + " D[i - 1, j] + 1 # Deletion\n", + " )\n", + "\n", + " return D\n", + "\n", + "def levenshtein(seq1, seq2):\n", + " \"\"\"Compute the Levenshtein edit distance between two sequences\"\"\"\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + "\n", + " D = levenshtein_matrix(seq1, seq2)\n", + " return D[m, n]\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import levenshtein_matrix, levenshtein\n", + "\n", + "print(inspect.getsource(levenshtein_matrix))\n", + "print(inspect.getsource(levenshtein))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "assert levenshtein('a', 'a') == 0\n", + "assert levenshtein('a', 'b') == 1\n", + "assert levenshtein('Foo', 'Bar') == 3\n", + "assert levenshtein('', '') == 0\n", + "assert levenshtein('Foo', '') == 3\n", + "assert levenshtein('', 'Foo') == 3\n", + "assert levenshtein('Fnord', 'Food') == 2\n", + "assert levenshtein('Müll', 'Mull') == 1\n", + "assert levenshtein('Abstand', 'Sand') == 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This fails for different representations of the \"same\" canonically equivalent string:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": 
[ + "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", + "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", + "levenshtein(word1, word2)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Same, but for grapheme clusters\n", + "from uniseg.graphemecluster import grapheme_clusters\n", + "\n", + "word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schlyñ')))\n", + "word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schlyñ')))\n", + "levenshtein(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Better." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define an edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def distance(s1, s2):\n", + " \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n", + "\n", + " Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme\n", + " clusters. 
This should be the correct way to compare two Unicode strings.\n", + " \"\"\"\n", + " s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))\n", + " s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))\n", + " return levenshtein(s1, s2)\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import distance\n", + "print(inspect.getsource(distance))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = unicodedata.normalize('NFC', 'Schlyñ')\n", + "word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n", + "\n", + "distance(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = 'Schlyñ'\n", + "word2 = 'Schlym̃'\n", + "#print('Lengths, as far as Python is concerned:', len(word1), len(word2)) # → gives 6 and 7!\n", + "distance(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Edit operations\n", + "\n", + "python-Levenshtein supports backtracing, i.e. 
giving a sequence of edit operations that transforms a word to another word:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('insert', 5, 5), ('replace', 5, 6)]\n" + ] + } + ], + "source": [ + "import Levenshtein\n", + "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", + "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", + "print(Levenshtein.editops(word1, word2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that it does not work with grapheme clusters, but \"characters\", so it gives 2 operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Defining our own `editops()`. (This looks a bit wild due to our own tail recursion handling.)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def seq_editops(seq1, seq2):\n", + " seq1 = list(seq1)\n", + " seq2 = list(seq2)\n", + " m = len(seq1)\n", + " n = len(seq2)\n", + " D = levenshtein_matrix(seq1, seq2)\n", + "\n", + " def _tail_backtrace(i, j, accumulator):\n", + " if i > 0 and D[i - 1, j] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)\n", + " if j > 0 and D[i, j - 1] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)\n", + " if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)\n", + " if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:\n", + " return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP\n", + " return accumulator\n", + "\n", + " def backtrace(i, j):\n", + " result = partial(_tail_backtrace, i, j, [])\n", + " while isinstance(result, partial):\n", + " result = result()\n", + "\n", 
+ " return result\n", + "\n", + " b = backtrace(m, n)\n", + " return b\n", + "\n", + "def editops(word1, word2):\n", + " # XXX Note that this returns indices to the _grapheme clusters_, not characters!\n", + " word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))\n", + " word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))\n", + " return seq_editops(word1, word2)\n", + "\n" + ] + } + ], + "source": [ + "from edit_distance import seq_editops, editops\n", + "print(inspect.getsource(seq_editops))\n", + "print(inspect.getsource(editops))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('replace', 2, 2)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "editops('Foo', 'Fon')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('insert', 4, 4)]\n", + "[('insert', 4, 4)]\n" + ] + } + ], + "source": [ + "print(editops('Käptn', 'Käpt\\'n'))\n", + "print(Levenshtein.editops('Käptn', 'Käpt\\'n'))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('delete', 6, 6)]\n", + "[('delete', 6, 6)]\n" + ] + } + ], + "source": [ + "print(editops('Delete something', 'Deletesomething'))\n", + "print(Levenshtein.editops('Delete something', 'Deletesomething'))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('delete', 1, 1), ('replace', 13, 12), ('insert', 17, 16), ('delete', 23, 23)]\n", + "[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n" + ] + } + ], + "source": [ + "print(editops('A more difficult example', 'Amore difficült exampl'))\n", + 
"print(Levenshtein.editops('A more difficult example', 'Amore difficült exampl'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XXX Note that our implementation returns different positions here for the 'insert'. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it with a difficult example that needs grapheme cluster handling:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('replace', 5, 5)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n", + "word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n", + "\n", + "editops(word1, word2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🎉" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Character error rate\n", + "\n", + "[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:\n", + "\n", + "$$\n", + "\\text{CER} = \\frac{i + s + d}{n}\n", + "$$\n", + "\n", + "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. 
(The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because our edit distance is equal to $i + s + d$, we can thus define:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def character_error_rate(reference, compared):\n", + " d = distance(reference, compared)\n", + " if d == 0:\n", + " return 0\n", + "\n", + " n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))\n", + " if n == 0:\n", + " return float('inf')\n", + "\n", + " return d/n\n", + "\n" + ] + } + ], + "source": [ + "from character_error_rate import character_error_rate\n", + "print(inspect.getsource(character_error_rate))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "assert character_error_rate('Foo', 'Bär') == 3/3\n", + "assert character_error_rate('Fnord', 'Food') == 2/5\n", + "assert character_error_rate('Food', 'Fnord') == 2/4\n", + "assert character_error_rate('Schlyñ', 'Schlym̃') == 1/6" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.\n", + "gt = \"\"\"115 über die vielen Sorgen wegen deſſelben vergaß Hartkopf, der Frau Amtmännin das ver⸗ ſprochene zu überliefern. — Ein Erpreſſer wurde an ihn abgeſchickt, um ihn ums Him⸗ melswillen zu ſagen, daß er das Verſprochene gleich den Augenblick überbringen möchte, die Frau Amtmännin hätte ſich auf ihn verlaſſen, und nun wüßte ſie nicht, was ſie anfangen ſollte. Den Augenblick ſollte er kommen, ſonſt vergieng ſie in ihrer Angſt. 
— Die Gäſte wären ſchon angekommen, und es fehlte ihr doch noch an allem. — Hartkopf mußte ſich erſt beſinnen, und endlich nach langem Nachdenken fiel es ihm erſt wieder ein. — Er langte den Zettel aus dem Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das, was da wäre, herbeyſchaffen möchte. Jndeß mangelten doch einige Generalia, die alſo wegfielen. — Hartkopf gieng ſelbſt mit und überbrachte es. — „Herr Jemine! er böſer Mann!“ — ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der voll gedrückt, gerüttelt und überflüſſig in ihren Schoos gegeben werden ſollte, mit Augen voller Freu⸗ H 2\"\"\"\n", + "tess = \"\"\"emm unmit; Lis Übey die vielen Sorgen wegen\" deſſelben vergaß Hartkopf, der Frau! Amimännin das- ver ſprochene zu überliefeen. ==\" Ein Epypreſſer- wurde an ihn abgeſchieet', um' ihn ums Hime melswillen zu ſagen, \"daß er das Verſyrochene leich den Augenblick \"überbringen möchte, die Frau Amtmännin hätte ſich auf ihn veriaſſen, und nun wüßte ſie- nicht, was ſie anfangen ſollte, =! 'Den Augenblick ſollte \"er kommen, ſonſt vergieng ſie in ihrer Angſt. == Die Säuaſie- wären. ſchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mußte ſich erſt TIM und endlich mach langem Rachdenken fiel es ihm erſt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das , was da wäre, herbeyſchaffen mschte. ZIudeß „mangelten doch einige Generalia, die alſo wegfielen. == ' Havrkopf gieng ſelbſt mit und überbrachte es == | „Herr Jemine! 
er böſer Mann 1-2 ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der - voll gedrückt, gerüttelt und überfirfſig in ihren Ss HEILE werden ſolite, mit Augen voller EE) Fron?\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1190\n" + ] + } + ], + "source": [ + "print('{:.4f}'.format(character_error_rate(gt, tess)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1190253045923149" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "levenshtein(gt, tess)/len(gt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "for w in gt, tess:\n", + " for g in grapheme_clusters(w):\n", + " assert len(g) == 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Maybe ocrevalUAtion doesn't count whitespace?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'115überdievielenSorgenwegendeſſelbenvergaßHartkopf,derFrauAmtmännindasver⸗ſprochenezuüberliefern.—EinErpreſſerwurdeanihnabgeſchickt,umihnumsHim⸗melswillenzuſagen,daßerdasVerſprochenegleichdenAugenblicküberbringenmöchte,dieFrauAmtmänninhätteſichaufihnverlaſſen,undnunwüßteſienicht,wasſieanfangenſollte.DenAugenblickſollteerkommen,ſonſtvergiengſieinihrerAngſt.—DieGäſtewärenſchonangekommen,undesfehlteihrdochnochanallem.—Hartkopfmußteſicherſtbeſinnen,undendlichnachlangemNachdenkenfielesihmerſtwiederein.—ErlangtedenZettelausdemAccisbucheheraus,undſagteſeinerFrau,daßſiedas,wasdawäre,herbeyſchaffenmöchte.JndeßmangeltendocheinigeGeneralia,diealſowegfielen.—Hartkopfgiengſelbſtmitundüberbrachtees.—„HerrJemine!erböſerMann!“—ſchrieihmdieFrauAmtmänninentgegen,undſchlugihnaufdieSchulterundblicktedenKorb,dervollgedrückt,gerütteltundüberflüſſiginihrenSchoosgegebenwerdenſollte,mitAugenvollerFreu⸗H2'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def remove_whitespace(s):\n", + " return s.replace(' ', '')\n", + "remove_whitespace(gt)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1324\n" + ] + } + ], + "source": [ + "print('{:.4f}'.format(character_error_rate(remove_whitespace(gt), remove_whitespace(tess))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it's larger than ocrevalUAtion 🤷‍♂️" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word error rate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word segmentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Naively split on spaces.\n", + "\n", + "(Note: ocrevalUAtion does confusing things here, like 
the Token splitting in a hash function, with an empty pattern?!)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def naive_word_split(s):\n", + " return s.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "example_text = \"The quick (“brown”) fox can't jump 32.3 feet, right?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['The',\n", + " 'quick',\n", + " '(“brown”)',\n", + " 'fox',\n", + " \"can't\",\n", + " 'jump',\n", + " '32.3',\n", + " 'feet,',\n", + " 'right?']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "naive_word_split(example_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's do it the Unicode way (Appendix UAX #29 on Unicode Text Segmentation): Split on word boundaries using the uniseg libraries and ignore words that contain only whitespace, punctuation \"and similar characters\":" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def words(s):\n", + " # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n", + " # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n", + " old_word_break = uniseg.wordbreak.word_break\n", + "\n", + " def new_word_break(c, index=0):\n", + " if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n", + " return 'ALetter'\n", + " else:\n", + " return old_word_break(c, index)\n", + " uniseg.wordbreak.word_break = new_word_break\n", + "\n", + " # Check if c is an unwanted character, i.e. 
whitespace, punctuation, or similar\n", + " def unwanted(c):\n", + "\n", + " # See https://www.fileformat.info/info/unicode/category/index.htm\n", + " # and https://unicodebook.readthedocs.io/unicode.html#categories\n", + " unwanted_categories = 'O', 'M', 'P', 'Z', 'S'\n", + " unwanted_subcategories = 'Cc', 'Cf'\n", + "\n", + " subcat = unicodedata.category(c)\n", + " cat = subcat[0]\n", + " return cat in unwanted_categories or subcat in unwanted_subcategories\n", + "\n", + " # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n", + " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n", + " for word in uniseg.wordbreak.words(s):\n", + " if all(unwanted(c) for c in word):\n", + " pass\n", + " else:\n", + " yield word\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['The', 'quick', 'brown', 'fox', \"can't\", 'jump', '32.3', 'feet', 'right']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from word_error_rate import words\n", + "print(inspect.getsource(words))\n", + "\n", + "list(words(example_text))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Der',\n", + " 'schnelle',\n", + " 'braune',\n", + " 'Fuchs',\n", + " 'kann',\n", + " 'keine',\n", + " '3,14',\n", + " 'Meter',\n", + " 'springen',\n", + " 'oder']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"list(words('Dies ist ein Beispielsatz. Oh, ja.'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['我', '很', '高', '興', '跟', '你', '見', '面']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('我很高興跟你見面')) # \"Pleased to meet you\" in Mandarin, Traditional writing" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['医', '者', 'を', '呼', 'ん', 'で', 'く', 'だ', 'さ', 'い']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(words('医者を呼んでください。'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word error rate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the word error rate, normalize again and compare sequences of words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def word_error_rate(reference, compared):\n", + " if isinstance(reference, str):\n", + " reference_seq = list(words_normalized(reference))\n", + " compared_seq = list(words_normalized(compared))\n", + " else:\n", + " reference_seq = list(reference)\n", + " compared_seq = list(compared)\n", + "\n", + " d = levenshtein(reference_seq, compared_seq)\n", + " if d == 0:\n", + " return 0\n", + "\n", + " n = len(reference_seq)\n", + " if n == 0:\n", + " return float('inf')\n", + "\n", + " return d / n\n", + "\n" + ] + } + ], + "source": [ + "from word_error_rate import word_error_rate\n", + "print(inspect.getsource(word_error_rate))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.25" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.75" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate('Fnord ist verdampfter Kräutertee!', 'Fnòrd ist verdmpfter Krautertee.')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.18823529411764706" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_error_rate(gt, tess)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a little larger than the ocrevalUAtion result!" 
+ ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb b/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb new file mode 100644 index 0000000..696fb4f --- /dev/null +++ b/qurator/dinglehopper/notebooks/Unicode normalization and Character segmentation.ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import unicodedata" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def list_characters(s):\n", + " \"\"\"List characters of string s, as seen by Python\"\"\"\n", + " for c in s:\n", + " print(c, end=' ')\n", + " if unicodedata.combining(c):\n", + " print(end=' ')\n", + " print(unicodedata.name(c))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing two Unicode strings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER 
Y\n", + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "n LATIN SMALL LETTER N\n", + "̃ COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "words = [unicodedata.normalize('NFC', 'Schlyñ'), unicodedata.normalize('NFD', 'Schlyñ')]\n", + "\n", + "for s in words:\n", + " list_characters(s)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These two strings are different:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "words[0] == words[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And yet they are canonically equivalent:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unicodedata.normalize('NFC', words[0]) == unicodedata.normalize('NFC', words[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "→ Normalize to NFC (Normalization Form Composed) to compare. NFC is also composed, which is what we want. But it doesn't matter because we're not interested in the characters as Python sees them, but in grapheme clusters (see below.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Grapheme clusters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For evaluation we're interested in what is perceived as \"characters\". 
But is \"ñ\" 1 character (LATIN SMALL LETTER N WITH TILDE) or 2 (LATIN SMALL LETTER N + COMBINING TILDE)?\n", + "\n", + "What we probably want are [grapheme clusters](https://uniseg-python.readthedocs.io/en/latest/graphemecluster.html):" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['S', 'c', 'h', 'l', 'y', 'ñ']\n", + "['S', 'c', 'h', 'l', 'y', 'ñ']\n" + ] + } + ], + "source": [ + "from uniseg.graphemecluster import grapheme_clusters\n", + "\n", + "for w in words:\n", + " print(list(grapheme_clusters(w)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just looking at the interesting character – the last one – from both words:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "n LATIN SMALL LETTER N\n", + "̃ COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "for w in words:\n", + " list_characters(list(grapheme_clusters(w))[-1])\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "→ Work with grapheme clusters, not \"characters as Python sees them\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def unicode_name(c):\n", + " if 0xE000 <= ord(c) <= 0xF8FF:\n", + " return 'private use character 0x{:04X}'.format(ord(c))\n", + " else:\n", + " return unicodedata.name(c)\n", + " \n", + "\n", + "def list_grapheme_clusters(s):\n", + " \"\"\"List grapheme clusters of string s\"\"\"\n", + " for g in grapheme_clusters(s):\n", + " print(g, end=' ')\n", + " if len(g) > 1:\n", + " print('(multiple)', end=' ')\n", + " try:\n", + " print(', '.join(unicode_name(c) for c in g))\n", + " except ValueError:\n", + " print('ValueError')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "ñ LATIN SMALL LETTER N WITH TILDE\n", + "\n", + "S LATIN CAPITAL LETTER S\n", + "c LATIN SMALL LETTER C\n", + "h LATIN SMALL LETTER H\n", + "l LATIN SMALL LETTER L\n", + "y LATIN SMALL LETTER Y\n", + "ñ (multiple) LATIN SMALL LETTER N, COMBINING TILDE\n", + "\n" + ] + } + ], + "source": [ + "for w in words:\n", + " list_grapheme_clusters(w)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "私 CJK UNIFIED IDEOGRAPH-79C1\n", + "は HIRAGANA LETTER HA\n", + "彼 CJK UNIFIED IDEOGRAPH-5F7C\n", + "女 CJK UNIFIED IDEOGRAPH-5973\n", + "が HIRAGANA LETTER GA\n", + "お HIRAGANA LETTER O\n", + "茶 CJK UNIFIED IDEOGRAPH-8336\n", + "を HIRAGANA LETTER WO\n", + "好 CJK UNIFIED IDEOGRAPH-597D\n", + "き HIRAGANA LETTER KI\n", + "な HIRAGANA LETTER NA\n", + "事 CJK UNIFIED IDEOGRAPH-4E8B\n", + "が HIRAGANA LETTER GA\n", + "分 CJK UNIFIED IDEOGRAPH-5206\n", + "か HIRAGANA LETTER KA\n", + "っ HIRAGANA LETTER SMALL TU\n", + "た HIRAGANA LETTER 
TA\n", + "。 IDEOGRAPHIC FULL STOP\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('私は彼女がお茶を好きな事が分かった。')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". FULL STOP\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "م ARABIC LETTER MEEM\n", + "ا ARABIC LETTER ALEF\n", + " SPACE\n", + "چ ARABIC LETTER TCHEH\n", + "ن ARABIC LETTER NOON\n", + "د ARABIC LETTER DAL\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "ا ARABIC LETTER ALEF\n", + " SPACE\n", + "ح ARABIC LETTER HAH\n", + "ر ARABIC LETTER REH\n", + "ف ARABIC LETTER FEH\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "و ARABIC LETTER WAW\n", + " SPACE\n", + "ف ARABIC LETTER FEH\n", + "ا ARABIC LETTER ALEF\n", + "ر ARABIC LETTER REH\n", + "س ARABIC LETTER SEEN\n", + "ی ARABIC LETTER FARSI YEH\n", + " SPACE\n", + "ه ARABIC LETTER HEH\n", + "س ARABIC LETTER SEEN\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ک ARABIC LETTER KEHEH\n", + "ه ARABIC LETTER HEH\n", + " SPACE\n", + "ت ARABIC LETTER TEH\n", + "و ARABIC LETTER WAW\n", + " SPACE\n", + "ع ARABIC LETTER AIN\n", + "ر ARABIC LETTER REH\n", + "ب ARABIC LETTER BEH\n", + "ی ARABIC LETTER FARSI YEH\n", + " SPACE\n", + "ن ARABIC LETTER NOON\n", + "ی ARABIC LETTER FARSI YEH\n", + "س ARABIC LETTER SEEN\n", + "ت ARABIC LETTER TEH\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('. اما چند تا حرف تو فارسی هست که تو عربی نیست')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ". 
FULL STOP\n", + " SPACE\n", + "ل ARABIC LETTER LAM\n", + "ك ARABIC LETTER KAF\n", + "ن ARABIC LETTER NOON\n", + " SPACE\n", + "ك ARABIC LETTER KAF\n", + "م ARABIC LETTER MEEM\n", + " SPACE\n", + "ع ARABIC LETTER AIN\n", + "د ARABIC LETTER DAL\n", + "د ARABIC LETTER DAL\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ك ARABIC LETTER KAF\n", + "ل ARABIC LETTER LAM\n", + "م ARABIC LETTER MEEM\n", + "ا ARABIC LETTER ALEF\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ب ARABIC LETTER BEH\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ف ARABIC LETTER FEH\n", + "ا ARABIC LETTER ALEF\n", + "ر ARABIC LETTER REH\n", + "س ARABIC LETTER SEEN\n", + "ي ARABIC LETTER YEH\n", + "ة ARABIC LETTER TEH MARBUTA\n", + " SPACE\n", + "ه ARABIC LETTER HEH\n", + "ل ARABIC LETTER LAM\n", + " SPACE\n", + "أ ARABIC LETTER ALEF WITH HAMZA ABOVE\n", + "ن ARABIC LETTER NOON\n", + "ت ARABIC LETTER TEH\n", + " SPACE\n", + "ب ARABIC LETTER BEH\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ل ARABIC LETTER LAM\n", + "غ ARABIC LETTER GHAIN\n", + "ة ARABIC LETTER TEH MARBUTA\n", + " SPACE\n", + "ا ARABIC LETTER ALEF\n", + "ل ARABIC LETTER LAM\n", + "ع ARABIC LETTER AIN\n", + "ر ARABIC LETTER REH\n", + "ب ARABIC LETTER BEH\n", + "ي ARABIC LETTER YEH\n", + "ة ARABIC LETTER TEH MARBUTA\n", + "؟ ARABIC QUESTION MARK\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('. لكن كم عدد الكلمات بالفارسية هل أنت باللغة العربية؟')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "H LATIN CAPITAL LETTER H\n", + "e LATIN SMALL LETTER E\n", + "l LATIN SMALL LETTER L\n", + "l LATIN SMALL LETTER L\n", + "😀 GRINNING FACE\n", + " SPACE\n", + "W LATIN CAPITAL LETTER W\n", + "😀 GRINNING FACE\n", + "r LATIN SMALL LETTER R\n", + "l LATIN SMALL LETTER L\n", + "d LATIN SMALL LETTER D\n", + "! 
EXCLAMATION MARK\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Hell😀 W😀rld!')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅ (multiple) LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING DOUBLE ACUTE ACCENT, COMBINING TILDE, COMBINING GRAVE ACCENT, COMBINING LEFT ANGLE ABOVE, COMBINING MACRON, COMBINING COMMA ABOVE, COMBINING DOUBLE OVERLINE, COMBINING NOT TILDE ABOVE, COMBINING DOUBLE MACRON BELOW, COMBINING GRAVE TONE MARK, COMBINING DOUBLE BREVE BELOW, COMBINING LONG STROKE OVERLAY, COMBINING DOUBLE MACRON BELOW, COMBINING LEFT HALF RING BELOW, COMBINING X BELOW, COMBINING CARON BELOW, COMBINING DOWN TACK BELOW, COMBINING DOUBLE RING BELOW, COMBINING ASTERISK BELOW, COMBINING BRIDGE BELOW, COMBINING TILDE BELOW, COMBINING X BELOW, COMBINING INVERTED BREVE BELOW, COMBINING LOW LINE, COMBINING UP TACK BELOW, COMBINING CARON BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE LOW LINE, COMBINING SEAGULL BELOW, COMBINING EQUALS SIGN BELOW, COMBINING GREEK YPOGEGRAMMENI\n", + "ņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚ (multiple) LATIN SMALL LETTER N, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING OVERLINE, COMBINING GREEK DIALYTIKA TONOS, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER V, COMBINING LATIN SMALL LETTER A, COMBINING REVERSED COMMA ABOVE, COMBINING GRAVE ACCENT, COMBINING CARON, COMBINING GREEK PERISPOMENI, COMBINING MACRON, COMBINING BRIDGE ABOVE, COMBINING LEFT HALF RING ABOVE, COMBINING SHORT SOLIDUS OVERLAY, COMBINING CEDILLA, COMBINING LEFT ARROWHEAD BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING ACUTE ACCENT BELOW, COMBINING LEFT TACK BELOW, COMBINING MINUS SIGN BELOW, COMBINING COMMA BELOW, COMBINING COMMA BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING PLUS 
SIGN BELOW, COMBINING LEFT ANGLE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD BELOW, COMBINING CARON BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING RIGHT TACK BELOW, COMBINING LOW LINE, COMBINING LOW LINE\n", + "i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟ (multiple) LATIN SMALL LETTER I, COMBINING VERTICAL LINE ABOVE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER U, COMBINING GRAVE ACCENT, COMBINING LATIN SMALL LETTER I, COMBINING LATIN SMALL LETTER T, COMBINING BREVE, COMBINING LATIN SMALL LETTER A, COMBINING HOOK ABOVE, COMBINING RIGHT ARROWHEAD ABOVE, COMBINING BRIDGE ABOVE, COMBINING RING ABOVE, COMBINING HOMOTHETIC ABOVE, COMBINING ZIGZAG ABOVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING FERMATA, COMBINING TILDE OVERLAY, COMBINING RETROFLEX HOOK BELOW, COMBINING DOUBLE MACRON BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING DOUBLE LOW LINE, COMBINING DOT BELOW, COMBINING RIGHT TACK BELOW, COMBINING RIGHT ARROWHEAD BELOW\n", + "c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘ (multiple) LATIN SMALL LETTER C, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER V, COMBINING VERTICAL LINE ABOVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER H, COMBINING COMMA ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING CANDRABINDU, COMBINING GREEK DIALYTIKA TONOS, COMBINING OVERLINE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER E, COMBINING DOT ABOVE RIGHT, COMBINING TILDE BELOW, COMBINING PLUS SIGN BELOW, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LOW LINE, COMBINING EQUALS SIGN BELOW, COMBINING INVERTED BRIDGE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING SEAGULL BELOW, COMBINING COMMA BELOW\n", + "o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ (multiple) LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER A, COMBINING INVERTED BREVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER M, COMBINING DIAERESIS, 
COMBINING MACRON, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER I, COMBINING GREEK KORONIS, COMBINING DOUBLE MACRON BELOW, COMBINING TILDE OVERLAY, COMBINING GRAPHEME JOINER, COMBINING DOT ABOVE RIGHT, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING MINUS SIGN BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING DIAERESIS BELOW, COMBINING RING BELOW\n", + "ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕ (multiple) LATIN SMALL LETTER D, COMBINING LATIN SMALL LETTER U, COMBINING DOUBLE OVERLINE, COMBINING LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING DIAERESIS, COMBINING LEFT HALF RING ABOVE, COMBINING DOT ABOVE RIGHT, COMBINING COMMA ABOVE RIGHT, COMBINING HORN, COMBINING DOT BELOW, COMBINING RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING X BELOW, COMBINING BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING INVERTED BREVE BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING MACRON BELOW, COMBINING LEFT TACK BELOW, COMBINING ASTERISK BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW\n", + "e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞ (multiple) LATIN SMALL LETTER E, COMBINING HOMOTHETIC ABOVE, COMBINING TURNED COMMA ABOVE, COMBINING BREVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING DOUBLE GRAVE ACCENT, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER R, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING INVERTED BREVE, COMBINING DOT ABOVE, COMBINING VERTICAL TILDE, COMBINING BREVE, COMBINING GREEK KORONIS, COMBINING LATIN SMALL LETTER R, COMBINING REVERSED COMMA ABOVE, COMBINING CANDRABINDU, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER T, COMBINING ACUTE TONE MARK, COMBINING HORN, COMBINING DOUBLE MACRON, COMBINING INVERTED BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING TILDE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING GRAVE 
ACCENT BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE RING BELOW, COMBINING DOUBLE VERTICAL LINE BELOW\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z LATIN CAPITAL LETTER Z\n", + "e LATIN SMALL LETTER E\n", + "u LATIN SMALL LETTER U\n", + "g LATIN SMALL LETTER G\n", + "n LATIN SMALL LETTER N\n", + "uͤ (multiple) LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E\n", + "ß LATIN SMALL LETTER SHARP S\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Zeugnuͤß')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z LATIN CAPITAL LETTER Z\n", + "e LATIN SMALL LETTER E\n", + "u LATIN SMALL LETTER U\n", + "g LATIN SMALL LETTER G\n", + "n LATIN SMALL LETTER N\n", + " private use character 0xE72B\n", + "ß LATIN SMALL LETTER SHARP S\n" + ] + } + ], + "source": [ + "list_grapheme_clusters('Zeugnß')" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": 
true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py new file mode 100644 index 0000000..b57a047 --- /dev/null +++ b/qurator/dinglehopper/ocr_files.py @@ -0,0 +1,107 @@ +from __future__ import division, print_function + +from warnings import warn + +from lxml import etree as ET +import sys + +from lxml.etree import XMLSyntaxError + + +def alto_namespace(tree): + """Return the ALTO namespace used in the given ElementTree. + + This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not + check if the files uses any valid ALTO namespace. + """ + root_name = ET.QName(tree.getroot().tag) + if root_name.localname == 'alto': + return root_name.namespace + else: + raise ValueError('Not an ALTO tree') + + +def alto_text(tree): + """Extract text from the given ALTO ElementTree.""" + + nsmap = {'alto': alto_namespace(tree)} + + lines = ( + ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap)) + for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap)) + text_ = '\n'.join(lines) + + return text_ + + +def page_namespace(tree): + """Return the PAGE content namespace used in the given ElementTree. + + This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We + do not check if the files uses any valid PAGE namespace. 
+ """ + root_name = ET.QName(tree.getroot().tag) + if root_name.localname == 'PcGts': + return root_name.namespace + else: + raise ValueError('Not a PAGE tree') + + +def page_text(tree): + """Extract text from the given PAGE content ElementTree.""" + + nsmap = {'page': page_namespace(tree)} + + def region_text(region): + try: + return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text + except AttributeError: + return None + + region_texts = [] + reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap) + if reading_order is not None: + for group in reading_order.iterfind('./*', namespaces=nsmap): + if ET.QName(group.tag).localname == 'OrderedGroup': + region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap) + for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])): + region_id = region_ref_indexed.attrib['regionRef'] + region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap) + if region is not None: + region_texts.append(region_text(region)) + else: + warn('Not a TextRegion: "%s"' % region_id) + else: + raise NotImplementedError + else: + for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap): + region_texts.append(region_text(region)) + + # XXX Does a file have to have regions etc.? region vs lines etc. + # Filter empty region texts + region_texts = (t for t in region_texts if t) + + text_ = '\n'.join(region_texts) + + return text_ + + +def text(filename): + """Read the text from the given file. + + Supports PAGE, ALTO and falls back to plain text. 
+ """ + + try: + tree = ET.parse(filename) + except XMLSyntaxError: + with open(filename, 'r') as f: + return f.read() + try: + return page_text(tree) + except ValueError: + return alto_text(tree) + + +if __name__ == '__main__': + print(text(sys.argv[1])) diff --git a/qurator/dinglehopper/ocrd-tool.json b/qurator/dinglehopper/ocrd-tool.json new file mode 100644 index 0000000..4710f35 --- /dev/null +++ b/qurator/dinglehopper/ocrd-tool.json @@ -0,0 +1,22 @@ +{ + "git_url": "https://github.com/qurator-spk/dinglehopper", + "tools": { + "ocrd-dinglehopper": { + "executable": "ocrd-dinglehopper", + "description": "Evaluate OCR text against ground truth with dinglehopper", + "input_file_grp": [ + "OCR-D-GT-PAGE", + "OCR-D-OCR" + ], + "output_file_grp": [ + "OCR-D-OCR-EVAL" + ], + "categories": [ + "Quality assurance" + ], + "steps": [ + "recognition/text-recognition" + ] + } + } +} diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py new file mode 100644 index 0000000..8ab5cf2 --- /dev/null +++ b/qurator/dinglehopper/ocrd_cli.py @@ -0,0 +1,71 @@ +import json +import os + +import click +from ocrd import Processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_utils import concat_padded, getLogger +from pkg_resources import resource_string + +from qurator.dinglehopper.cli import process as cli_process +from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear + +log = getLogger('processor.OcrdDinglehopperEvaluate') + +OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + + +@click.command() +@ocrd_cli_options +def ocrd_dinglehopper(*args, **kwargs): + return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) + + +class OcrdDinglehopperEvaluate(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] + super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) + + def 
_make_file_id(self, input_file, input_file_grp, n): + file_id = input_file.ID.replace(input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + return file_id + + def process(self): + gt_grp, ocr_grp = self.input_file_grp.split(',') + for n, page_id in enumerate(self.workspace.mets.physical_pages): + gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0] + ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0] + log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) + + file_id = self._make_file_id(ocr_file, ocr_grp, n) + report_prefix = os.path.join(self.output_file_grp, file_id) + + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix) + + # Add reports to the workspace + for report_suffix, mimetype in \ + [ + ['.html', 'text/html'], + ['.json', 'application/json'] + ]: + self.workspace.add_file( + ID=file_id + report_suffix, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix) + + # Clear cache between files + levenshtein_matrix_cache_clear() + + +if __name__ == '__main__': + ocrd_dinglehopper() diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py new file mode 100644 index 0000000..1b7e0cf --- /dev/null +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -0,0 +1,46 @@ +import unicodedata + + +def substitute_equivalences(s): + + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ſſ', + "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I + '': 'ä', + '': 'ch', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ck', + '': 'll', + '': 'ö', + '': 'ſi', + 
'': 'ſt', + 'fi': 'fi', + 'ff': 'ff', + 'fl': 'fl', + 'ffi': 'ffi', + '': 'ct', + '’': '\'', + '⸗': '-', + '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E + '\uf532': 'as', # eMOP: Latin small ligature as + '\uf533': 'is', # eMOP: Latin small ligature is + '\uf534': 'us', # eMOP: Latin small ligature us + '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u + 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ + '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? + '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P + 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT + } + + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 new file mode 100644 index 0000000..80ffae3 --- /dev/null +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -0,0 +1,60 @@ + + + + + + + + + + + + + +
+ +{{ gt }}
+{{ ocr }} + + +

Metrics

+

CER: {{ cer|round(4) }}

+

WER: {{ wer|round(4) }}

+ +

Character differences

+{{ char_diff_report }} + +

Word differences

+{{ word_diff_report }} + + +
+ + + + + + + + + + + + diff --git a/qurator/dinglehopper/templates/report.html.js b/qurator/dinglehopper/templates/report.html.js new file mode 100644 index 0000000..ac43676 --- /dev/null +++ b/qurator/dinglehopper/templates/report.html.js @@ -0,0 +1,14 @@ +function find_diff_class(classes) { + return classes.split(/\s+/).find(x => x.match(/.diff\d.*/)); +} + +$(document).ready(function() { + $('.diff').mouseover(function() { + let c = find_diff_class($(this).attr('class')) + $('.' + c).addClass('diff-highlight') + }); + $('.diff').mouseout(function() { + let c = find_diff_class($(this).attr('class')) + $('.' + c).removeClass('diff-highlight') + }); +}); diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 new file mode 100644 index 0000000..62d3f77 --- /dev/null +++ b/qurator/dinglehopper/templates/report.json.j2 @@ -0,0 +1,6 @@ +{ + "gt": "{{ gt }}", + "ocr": "{{ ocr }}", + "cer": {{ cer|json_float }}, + "wer": {{ wer|json_float }} +} diff --git a/.gitkeep b/qurator/dinglehopper/tests/__init__.py similarity index 100% rename from .gitkeep rename to qurator/dinglehopper/tests/__init__.py diff --git a/qurator/dinglehopper/tests/data/00000119.tif b/qurator/dinglehopper/tests/data/00000119.tif new file mode 100644 index 0000000..b831bd0 Binary files /dev/null and b/qurator/dinglehopper/tests/data/00000119.tif differ diff --git a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml new file mode 100644 index 0000000..062a0d2 --- /dev/null +++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml @@ -0,0 +1,19115 @@ + + + + doculibtopagexml + 2018-12-05T03:30:16 + 2019-04-25T08:13:47 + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + + + + 0 + + 20 + + 20 + + 20 + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + Die + + Die + + + + + + + + + + + [ + + + + 
2 + + + + 2 + + + + . + + + + ] + + [22.] + + [22.] + + [22.] + + + + + + + + + + [ + + + + 2 + + + + 2 + + + + . + + + + ] + + [22.] + + [22.] + + [22.] + + + + + + + + + + e + + + + i + + + + n + + ein + + + + + + + g + + + + l + + + + e + + + + i + + + + + + + + e + + + + s + + gleies + + + + + + + v + + + + o + + + + r + + + + g + + + + e + + + + g + + + + e + + + + b + + + + e + + + + n + + + + , + + vorgegeben, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + o + + ſo + + + + + + + g + + + + a + + + + r + + gar + + + + + + + ſ + + + + e + + + + h + + + + r + + ſehr + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + viele + + + + + + + m + + + + a + + + + h + + + + l + + + + e + + mahle + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + + + + + a + + + + + + + + e + + ae + + + + + + + m + + + + i + + + + t + + mit + + + + + + + G + + + + e + + + + w + + + + a + + + + l + + + + t + + Gewalt + + + + + + + f + + + + o + + + + r + + + + - + + for- + + + + + + + m + + + + e + + + + n + + + + ſ + + + + + + + + l + + + + i + + + + + + + + e + + menſlie + + + + + + + M + + + + + + + + g + + + + l + + + + i + + + + + + + + k + + + + e + + + + i + + + + t + + Mglikeit + + + ein gleies vorgegeben, und ſo gar ſehr viele mahle gegen ae menſlie Mglikeit mit Gewalt for- + + + + + + + + c + + + + i + + + + r + + + + e + + + + t + + ciret + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + worden + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + e + + + + y + + + + n + + + + , + + ſeyn, + + + + + + + b + + + + e + + + + h + + + + a + + + + u + + + + p + + + + t + + + + e + + + + n + + behaupten + + + + + + + w + + + + i + + + + + + + + , + + wi, + + + + + + + m + + + + i + + + + t + + + + h + + + + i + + + + n + + mithin + + + + + + + n + + + + e + + + + b + + + + + + neb + + + + + + + d + + + + e + + + + m + + dem + + + + + + + B + + + + r + + + + e + + + + d + + + + e + + + + k + + + + a 
+ + + + w + + + + , + + Bredekaw, + + + + + + + w + + + + e + + + + l + + + + + + + + e + + + + r + + weler + + + + + + + ( + + + + § + + + + . + + (§. + + + + + + + 2 + + + + 8 + + + + . + + 28. + + + + + + + 2 + + + + 9 + + + + . + + + + ) + + 29.) + + + + + + + + + + + + +  + + + + + + + i + + + + n + + in + + + + + + + a + + + + + + + + e + + + + n + + aen + + + + + + + ſ + + + + e + + + + e + + + + n + + + + i + + + + n + + ſeinen + + + ciret worden zu ſeyn, behaupten wi, mithin neb dem Bredekaw, weler (§. 28. 29.)  in aen ſeinen + + + + + + + + A + + + + u + + + + + + + + a + + + + g + + + + e + + + + n + + Auagen + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + , + + wiederſproen, + + + + + + + m + + + + i + + + + t + + mit + + + + + + + d + + + + e + + + + r + + der + + + + + + + P + + + + œ + + + + n + + + + a + + Pœna + + + + + + f + + + + a + + + + l + + + + + + fal + + + + + + g + + + + e + + + + w + + + + i + + + + + + + + e + + + + r + + gewier + + + + + + + z + + + + u + + zu + + + + + + + b + + + + e + + + + l + + + + e + + + + g + + + + e + + + + n + + belegen + + + + + + + i + + + + + + + + , + + i, + + + + + + + d + + + + a + + da + + + + + + + u + + + + m + + um + + + + + + + d + + + + o + + do + + + Auagen wiederſproen, mit der Pœna fal um do gewier zu belegen i, da + + + + + + + + ſ + + + + e + + + + c + + + + u + + + + n + + + + d + + + + . + + ſecund. + + + + + + + F + + + + a + + + + r + + + + i + + + + n + + + + . + + Farin. + + + + + + + T + + + + i + + + + t + + + + . + + Tit. + + + + + + + 9 + + + + . + + 9. + + + + + + + q + + + + u + + + + . + + qu. + + + + + + + 6 + + + + 6 + + + + . + + 66. + + + + + + + p + + + + . + + p. + + + + + + + m + + + + . + + m. + + + + + + + 3 + + + + 2 + + + + 0 + + + + . + + 320. + + + ſecund. Farin. Tit. 9. qu. 66. p. m. 320. 
+ + + + + + + + d + + + + i + + + + e + + die + + + + + + + K + + + + l + + + + a + + + + g + + + + e + + Klage + + + + + + + a + + + + l + + + + s + + als + + + + + + + d + + + + a + + + + s + + das + + + + + + + Z + + + + e + + + + u + + + + g + + + + n + + + + + + + + ß + + Zeugnß + + + + + + + v + + + + o + + + + r + + vor + + + + + + + f + + + + a + + + + l + + + + ſ + + + + + + falſ + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + r + + + + d + + + + i + + + + + + + + t + + + + e + + + + t + + erditet + + + + + + + m + + + + + + + + ß + + + + e + + + + n + + mßen + + + + + + + g + + + + e + + + + h + + + + a + + + + l + + + + t + + + + e + + + + n + + gehalten + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + . + + werden. + + + + + + + ſ + + + + o + + ſo + + + + + + + w + + + + o + + + + h + + + + l + + wohl + + + die Klage ſo wohl als das Zeugnß vor falſ und erditet mßen gehalten werden. + + + + + + + + S + + + + o + + So + + + + + + + v + + + + o + + + + n + + von + + + + + + + d + + + + e + + + + r + + der + + + + + + + I + + + + n + + + + q + + + + u + + + + i + + + + + + + + t + + + + i + + + + n + + Inquitin + + + + + + v + + + + i + + + + e + + + + l + + viel + + + + + + + d + + + + i + + + + e + + die + + + + + + + 5 + + + + ) + + + + 3 + + 35) + + + + + + § + + + + . + + §. + + §. 35) So viel die von der Inquitin + + ein gleies vorgegeben, und ſo gar ſehr viele mahle gegen ae menſlie Mglikeit mit Gewalt for- +ciret worden zu ſeyn, behaupten wi, mithin neb dem Bredekaw, weler (§. 28. 29.)  in aen ſeinen +Auagen wiederſproen, mit der Pœna fal um do gewier zu belegen i, da +ſecund. Farin. Tit. 9. qu. 66. p. m. 320. +die Klage ſo wohl als das Zeugnß vor falſ und erditet mßen gehalten werden. +§. 
35) So viel die von der Inquitin + + + + + + + + + + + r + + + + a + + + + t + + + + h + + rath + + + + + + + m + + + + i + + + + t + + mit + + + + + + + e + + + + i + + + + n + + + + e + + + + r + + einer + + + + + + + P + + + + œ + + + + n + + + + a + + Pœna + + + + + + + + + + + ſ + + + + c + + + + a + + + + l + + + + i + + fiſcali + + + + + + + a + + + + n + + + + g + + + + e + + + + ſ + + + + e + + + + h + + + + e + + + + n + + angeſehen + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + + + , + + worden, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + o + + + + l + + + + c + + + + h + + + + e + + ſolche + + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + + G + + + + r + + + + a + + + + + + + + e + + + + n + + Graffen + + + + + + + v + + + + o + + + + n + + von + + + + + + + K + + + + + + + + n + + + + i + + + + g + + + + s + + + + f + + + + e + + + + l + + + + d + + Knigsfeld + + + + + + + V + + + + o + + + + r + + + + + + Vor⸗ + + + + + + + d + + + + e + + + + s + + des + + + + + + + H + + + + r + + + + n + + + + . + + Hrn. + + + rath mit einer Pœna fiſcali angeſehen worden, und ſolche dur des Hrn. Graffen von Knigsfeld Vor⸗ + + + + + + + + ſ + + + + p + + + + r + + + + u + + + + + + + + , + + ſpru, + + + + + + + n + + + + u + + + + r + + nur + + + + + + + a + + + + u + + + + s + + aus + + + + + + + G + + + + n + + + + a + + + + d + + + + e + + + + n + + Gnaden + + + + + + + n + + + + a + + + + + + + + g + + + + e + + + + l + + + + a + + + + + + + + e + + + + n + + nagelaen + + + + + + + e + + + + r + + + + h + + + + a + + + + l + + + + t + + + + e + + + + n + + + + . + + erhalten. + + + ſpru, nur aus Gnaden nagelaen erhalten. + + + + + + + + S + + + + o + + + + n + + + + d + + + + e + + + + r + + + + n + + Sondern + + + + + + + m + + + + a + + + + n + + man + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + n + + dieſen + + + + + + + 4 + + + + . + + 4. 
+ + + + + + + W + + + + o + + + + + + + + e + + + + n + + Woen + + + + + + + l + + + + a + + + + n + + + + g + + lang + + + + + + + a + + + + + + + + e + + ae + + + + + + + A + + + + b + + + + e + + + + n + + + + d + + Abend + + + + + + + b + + + + e + + + + y + + bey + + + + + + + d + + + + e + + + + r + + der + + + + + + + I + + + + n + + + + q + + + + u + + + + i + + + + + + + + t + + + + i + + + + n + + Inquitin + + + + + + + g + + + + a + + + + n + + + + + + gan + + + + + + + a + + + + + + + + e + + + + i + + + + n + + aein + + + + + + + g + + + + e + + + + l + + + + a + + + + + + + + e + + + + n + + + + . + + gelaen. + + + + + + + h + + + + a + + + + t + + hat + + + + + + + a + + + + u + + + + + + au + + + Sondern man hat au dieſen 4. Woen lang ae Abend bey der Inquitin gan aein gelaen. + + + + + + + + B + + + + i + + + + n + + + + n + + + + e + + + + n + + Binnen + + + + + + + d + + + + e + + + + r + + der + + + + + + + S + + + + + + + + r + + + + e + + + + i + + + + b + + + + e + + + + r + + Sreiber + + + + + + + B + + + + r + + + + e + + + + d + + + + e + + + + k + + + + a + + + + w + + Bredekaw + + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + + b + + + + e + + + + y + + bey + + + + + + + I + + + + h + + + + m + + + + e + + Ihme + + + + + + + g + + + + e + + + + w + + + + e + + + + ſ + + + + e + + + + n + + + + , + + geweſen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + + +  + + + + + + + i + + + + n + + in + + + + + + + w + + + + e + + + + l + + + + + + + + e + + + + r + + weler + + + + + + + g + + + + a + + + + n + + + + + + + + e + + + + r + + ganer + + + + + + + Z + + + + e + + + + i + + + + t + + Zeit + + + Binnen weler ganer Zeit der Sreiber Bredekaw bendig bey Ihme geweſen, und  in + + + + + + + + d + + + + e + + + + r + + der + + + + + + + a + + + + m + + am + + + + + + + i + + + + n + + in + + + + + + + J + + + + u + + + + d + + + + i + + + + c 
+ + + + i + + + + o + + Judicio + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + n + + ſeinen + + + + + + + i + + + + n + + + + t + + + + r + + + + o + + + + d + + + + u + + + + c + + + + i + + + + r + + + + t + + + + e + + + + r + + introducirter + + + + + + + A + + + + p + + + + p + + + + e + + + + + + + + a + + + + t + + + + i + + + + o + + + + n + + Appeation + + + + + + + d + + + + e + + + + + + + + e + + + + n + + deen + + + + + + + B + + + + e + + + + y + + + + + + Bey⸗ + + + + + + + 1 + + + + 3 + + 13 + + + + + + + t + + + + e + + + + n + + ten + + + + + + + O + + + + + + + + o + + + + b + + + + r + + + + . + + Oobr. + + + + + + + a + + + + . + + + + c + + + + . + + a.c. + + + + + + + g + + + + e + + + + w + + + + e + + + + ſ + + + + e + + + + n + + + + e + + + + n + + geweſenen + + + + + + + H + + + + r + + + + n + + + + . + + Hrn. + + + der am 13 ten Oobr. a.c. in Judicio gegen ſeinen geweſenen Hrn. introducirter Appeation deen Bey⸗ + + + + + + + + r + + + + a + + + + t + + + + h + + + + s + + raths + + + + + + + b + + + + e + + + + d + + + + i + + + + e + + + + n + + + + e + + + + t + + bedienet + + + + + + + h + + + + a + + + + t + + + + ; + + hat; + + + raths bedienet hat; + + rath mit einer Pœna fiſcali angeſehen worden, und ſolche dur des Hrn. Graffen von Knigsfeld Vor⸗ +ſpru, nur aus Gnaden nagelaen erhalten. +Sondern man hat au dieſen 4. Woen lang ae Abend bey der Inquitin gan aein gelaen. +Binnen weler ganer Zeit der Sreiber Bredekaw bendig bey Ihme geweſen, und  in +der am 13 ten Oobr. a.c. in Judicio gegen ſeinen geweſenen Hrn. 
introducirter Appeation deen Bey⸗ +raths bedienet hat; + + + + + + + + + + + D + + + + a + + + + b + + + + e + + + + n + + + + e + + + + b + + + + e + + + + n + + + + + + Dabeneben + + + + + + + i + + + + + + i + + + + + + + d + + + + e + + + + r + + der + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + r + + dieſer + + + + + + + g + + + + a + + + + n + + + + + + + + e + + + + n + + ganen + + + + + + + f + + + + r + + + + e + + + + y + + + + e + + + + m + + freyem + + + + + + + F + + + + u + + + + ß + + Fuß + + + + + + + S + + + + + + + + r + + + + e + + + + i + + + + b + + + + e + + + + r + + Sreiber + + + + + + + b + + + + i + + + + n + + + + n + + + + e + + + + n + + binnen + + + + + + + g + + + + e + + + + b + + + + l + + + + i + + + + e + + + + b + + + + e + + + + n + + + + , + + geblieben, + + + + + + + u + + + + n + + + + d + + und + + + + + + + Z + + + + e + + + + i + + + + t + + Zeit + + + + + + + a + + + + u + + + + f + + auf + + + + + + + § + + + + . + + §. + + + + + + + 3 + + + + 3 + + + + ) + + 33) + + + §. 
33) Dabeneben i der Sreiber binnen dieſer ganen Zeit auf freyem Fuß geblieben, und + + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + n + + ſeinen + + + + + + + C + + + + o + + + + n + + + + ſ + + + + u + + + + l + + + + e + + + + n + + + + t + + + + e + + + + n + + + + , + + Conſulenten, + + + + + + ſ + + + + o + + + + n + + + + d + + + + e + + + + r + + + + n + + ſondern + + + + + + + a + + + + u + + + + + + + + , + + au, + + + + + + + w + + + + e + + + + i + + + + l + + + + e + + + + n + + weilen + + + + + + + d + + + + e + + + + r + + der + + + + + + + i + + + + n + + in + + + + + + + I + + + + h + + + + r + + + + e + + + + m + + Ihrem + + + + + + + G + + + + e + + + + f + + + + + + + + n + + + + g + + + + n + + + + + + + + ß + + Gefngnß + + + + + + + I + + + + n + + + + q + + + + u + + + + i + + + + + + + + t + + + + i + + + + n + + Inquitin + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + + + e + + + + n + + ſelben + + + + + + + a + + + + t + + + + h + + hat + + + + + + + n + + + + i + + + + + + + + t + + nit + + + + + + + n + + + + u + + + + r + + nur + + + + + + + d + + + + u + + + + r + + + + + + dur + + + hat nit nur dur ſeinen Conſulenten, ſondern au, weilen der Inquitin ſelben in Ihrem Gefngnß + + + + + + + + o + + + + ſ + + ſo + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + viele + + + + + + + F + + + + r + + + + e + + + + y + + + + h + + + + e + + + + i + + + + t + + Freyheit + + + + + + + g + + + + e + + + + l + + + + a + + + + + + + + e + + + + n + + gelaen + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + + + , + + worden, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + + + + + e + + e + + + + + + + f + + + + r + + + + e + + + + m + + + + b + + + + d + + + + e + + + + n + + frembden + + + + + + + B + + + + e + + + + ſ + + + + u + + + + + + Beſu + + + + + + + v + + + + o + + + + n + + von + + + + + + + I + + + + h + + + + r + + + + e + + + + n + + Ihren + + + + + + 
+ A + + + + n + + + + v + + + + e + + + + r + + + + w + + + + a + + + + n + + + + d + + + + t + + + + e + + + + n + + Anverwandten + + + + + + + o + + + + h + + + + n + + + + g + + + + e + + + + h + + + + i + + + + n + + + + d + + + + e + + + + r + + + + t + + ohngehindert + + + + + + + e + + + + m + + + + + + em⸗ + + + ſo viele Freyheit gelaen worden, daß e frembden Beſu von Ihren Anverwandten ohngehindert em⸗ + + + + + + + + p + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + pfangen + + + + + + + k + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + knnen, + + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + e + + andere + + + + + + + P + + + + e + + + + r + + + + ſ + + + + o + + + + n + + + + e + + + + n + + Perſonen + + + + + + + + + + + + +  + + + + + + + m + + + + i + + + + t + + mit + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + a + + + + + + + + e + + + + s + + + + , + + aes, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + d + + + + e + + + + r + + + + e + + + + i + + + + n + + + + + + + + e + + + + n + + dereinen + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + ſagen + + + + + + + h + + + + a + + + + t + + + + + + hat⸗ + + + + + + + E + + + + r + + Er + + + + + + + o + + + + d + + + + e + + + + r + + oder + + + pfangen knnen, dur andere Perſonen  mit ihr ber aes, was Er oder e dereinen zu ſagen hat⸗ + + + + + + + + t + + + + e + + + + n + + + + , + + ten, + + + + + + + i + + + + m + + + + m + + + + a + + + + + + + + e + + + + n + + immaen + + + + + + + S + + + + e + + + + n + + + + + + + + e + + + + n + + + + b + + + + e + + + + r + + + + g + + + + , + + Senenberg, + + + + + + + d + + + + a + + + + s + + das + + + + + + + O + + + + + + + + u + + + + m + + + + c + + + + i + + Officium + + + + + + + d + + + 
+ i + + + + - + + + + J + + + + u + + Judi- + + + + + + + a + + + + l + + + + s + + als + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + r + + dieſer + + + + + + + a + + + + m + + am + + + + + + + 1 + + + + . + + 1. + + + + + + + O + + + + + + + + o + + + + b + + + + . + + Oob. + + + + + + + d + + + + e + + + + r + + der + + + + + + + H + + + + o + + + + f + + + + t + + + + h + + + + r + + + + a + + Hofrath + + + + + + + v + + + + e + + + + r + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + + + n + + vereinigen + + + + + + + k + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + knnen, + + + ten, vereinigen knnen, immaen der Hofrath Senenberg, als dieſer am 1. Oob. das Officium Judi- + + + + + + + + c + + + + i + + + + s + + cis + + + + + + + p + + + + u + + + + b + + + + l + + + + i + + + + c + + + + a + + publica + + + + + + + e + + + + x + + + + c + + + + i + + + + t + + + + i + + + + r + + + + e + + + + t + + + + e + + + + , + + excitirete, + + + + + + + v + + + + o + + + + r + + vor + + + + + + + d + + + + e + + + + m + + + + ſ + + + + e + + + + l + + + + b + + + + e + + + + n + + demſelben + + + + + + + p + + + + r + + + + e + + + + d + + + + . + + + + c + + + + æ + + præced. 
+ + + + + + + r + + + + a + + + + t + + + + i + + + + o + + + + n + + + + e + + ratione + + + + + + + i + + + + n + + + + j + + + + u + + + + r + + + + i + + + + a + + + + r + + + + u + + + + m + + injuriarum + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + § + + § + + + + + + + ( + + + + e + + + + b + + + + e + + + + n + + (eben + + + + + + + + + + + + +  + + + + + + + a + + + + b + + + + e + + + + r + + aber + + + + + + + z + + + + u + + + + r + + zur + + + + + + + ſ + + + + a + + + + t + + + + i + + + + s + + + + f + + + + a + + + + + + + + i + + + + o + + + + n + + + + e + + ſatisfaione + + + cis gegen ihn zur ſatisfaione publica excitirete, vor  aber ratione injuriarum demſelben (eben § præced. + + + + + + + + g + + + + e + + + + ſ + + + + a + + + + g + + + + t + + + + e + + + + r + + geſagter + + + + + + + m + + + + a + + + + + + + + e + + + + n + + + + ) + + maen) + + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + + L + + + + e + + + + i + + + + b + + + + e + + + + s + + + + + + + + S + + + + t + + + + r + + + + a + + + + + + + + e + + Leibes⸗Straffe + + + + + + + v + + + + o + + + + r + + vor + + + + + + + a + + + + + + + + e + + + + n + + aen + + + + + + + D + + + + i + + + + n + + + + g + + + + e + + + + n + + + + , + + Dingen, + + + + + + + o + + + + b + + ob + + + + + + + p + + + + e + + + + i + + + + n + + + + l + + + + i + + + + + + + + e + + + + r + + peinlier + + + + + + + g + + + + l + + + + e + + + + i + + + + + + glei + + + + + + + a + + + + l + + + + s + + als + + + + + + + E + + + + r + + Er + + + + + + + e + + + + i + + + + n + + ein + + + + + + + a + + + + u + + + + f + + + + z + + + + u + + + + l + + + + e + + + + g + + + + e + + + + n + + aufzulegen + + + + + + + b + + + + a + + + + t + + + + e + + + + , + + bate, + + + geſagter maen) eine Leibes⸗Straffe aufzulegen bate, vor aen Dingen, glei als ob Er ein peinlier + + + + + + + + u + + + 
+ n + + + + d + + und + + + + + + + i + + + + n + + + + d + + + + i + + + + c + + + + i + + + + i + + + + s + + indiciis + + + + + + + A + + + + n + + + + k + + + + l + + + + + + + + g + + + + e + + + + r + + Anklger + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + d + + + + e + + + + n + + + + u + + + + n + + + + c + + + + i + + + + i + + + + r + + + + e + + + + t + + denunciiret + + + + + + + h + + + + + + + + t + + + + t + + + + e + + + + , + + htte, + + + + + + + e + + + + n + + + + o + + + + h + + ohne + + + Anklger wre, und ohne indiciis denunciiret htte, + + + + + + + + a + + + + u + + + + f + + auf + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + m + + dieſem + + + + + + + F + + + + a + + + + + + Fa + + + + + + + a + + + + r + + + + t + + + + . + + art. + + + + + + + K + + + + l + + + + + + + + g + + + + e + + + + r + + Klger + + + + + + + e + + + + r + + + + f + + + + o + + + + r + + + + d + + + + e + + + + r + + + + t + + + + e + + erforderte + + + + + + + C + + + + r + + + + . + + Cr. + + + + + + + 1 + + + + 2 + + + + . + + 12. + + + + + + + p + + + + e + + + + i + + + + n + + + + l + + + + i + + + + + + + + e + + + + n + + peinlien + + + + + + + v + + + + o + + + + m + + vom + + + + + + + d + + + + i + + + + e + + die + + + + + + + i + + + + n + + in + + + + + + + o + + + + r + + + + d + + + + . + + ord. + + + die auf dieſem Fa in ord. Cr. art. 12. 
vom peinlien Klger erforderte + + + + + + + + C + + + + a + + + + u + + + + t + + + + i + + + + o + + + + n + + Caution + + + + + + + z + + + + u + + zu + + + + + + + l + + + + e + + + + i + + + + + + + + e + + + + n + + + + , + + leien, + + + + + + + a + + + + u + + + + f + + + + e + + + + r + + + + l + + + + e + + + + g + + + + e + + + + t + + auferleget + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + + + , + + worden, + + + + + + + d + + + + a + + da + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + x + + ex + + + + + + + § + + + + . + + §. + + + + + + + 3 + + + + 1 + + + + . + + 31. + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + e + + + + r + + + + ſ + + + + e + + + + h + + + + e + + + + n + + erſehen + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + A + + + + + + + + i + + + + s + + Ais + + + + + + + ( + + + + v + + + + i + + + + d + + + + . + + (vid. + + + + + + + m + + + + a + + + + n + + man + + + + + + + + + + + + +  + + + + + + + k + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + knnen, + + + Caution zu leien, auferleget worden, da man  do ex Ais (vid. §. 31. 
htte erſehen knnen, daß + + + + + + + + h + + + + i + + + + e + + + + r + + hier + + + + + + + v + + + + o + + + + n + + von + + + + + + + e + + + + i + + + + n + + + + e + + + + r + + einer + + + + + + + o + + + + h + + + + n + + + + z + + + + w + + + + e + + + + i + + + + + + + + e + + + + n + + + + t + + + + l + + + + i + + + + + + + + e + + + + n + + ohnzweiffentlien + + + + + + + u + + + + n + + + + d + + und + + + + + + + o + + + + + + + + e + + + + n + + + + t + + + + l + + + + i + + + + + + + + e + + + + n + + offentlien + + + + + + + o + + + + b + + + + w + + + + a + + + + l + + + + t + + + + e + + + + , + + obwalte, + + + + + + + w + + + + o + + + + b + + + + e + + + + y + + wobey + + + + + + + d + + + + e + + + + m + + dem + + + + + + + R + + + + i + + + + + + + + t + + + + e + + + + r + + Riter + + + + + + + M + + + + i + + + + + + + + e + + + + t + + + + h + + + + a + + + + t + + Miethat + + + + + + + d + + + + i + + + + e + + die + + + + + + + F + + + + r + + + + a + + + + g + + + + e + + Frage + + + hier von einer ohnzweiffentlien und offentlien Miethat die Frage obwalte, wobey dem Riter + + + + + + + + i + + + + n + + in + + + + + + + O + + + + . + + O. + + + + + + + C + + + + r + + + + . + + Cr. + + + + + + + a + + + + r + + + + t + + + + . + + art. + + + + + + + 1 + + + + 6 + + + + . + + 16. + + + in O. Cr. art. 16. 
+ + + + + + + + e + + + + i + + + + n + + ein + + + + + + + g + + + + a + + + + n + + + + + + gan + + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + e + + + + r + + anderer + + + + + + + e + + + + x + + ex + + + + + + + P + + + + r + + + + o + + + + c + + + + e + + + + ß + + Proceß + + + + + + + v + + + + o + + + + r + + + + g + + + + e + + + + s + + + + + + + + r + + + + i + + + + e + + + + b + + + + e + + + + n + + vorgesrieben + + + + + + + w + + + + i + + + + r + + + + d + + + + , + + wird, + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + O + + + + + + + + c + + + + i + + + + o + + Officio + + + + + + + a + + + + n + + + + z + + + + u + + + + + + + + e + + + + + + + + e + + + + n + + + + d + + + + e + + + + r + + anzueender + + + + + + + a + + + + + + + + e + + + + n + + + + f + + + + a + + + + + + + + s + + + + , + + aenfas, + + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + ein gan anderer ex Officio anzueender Proceß vorgesrieben wird, und aenfas, wenn ber die + + + + + + + + e + + + + i + + + + n + + ein + + + + + + + Z + + + + w + + + + e + + + + i + + + + + + + + e + + + + l + + Zweiffel + + + + + + + o + + + + b + + + + g + + + + e + + + + w + + + + a + + + + l + + + + t + + + + e + + + + t + + obgewaltet + + + + + + + h + + + + + + + + t + + + + t + + + + e + + + + , + + htte, + + + + + + + i + + + + n + + + + ſ + + + + u + + + + + + + + c + + + + i + + + + e + + + + n + + + + t + + + + i + + + + a + + + + m + + inſufficientiam + + + + + + + I + + + + n + + + + d + + + + i + + + + c + + + + i + + + + o + + + + r + + + + u + + + + m + + Indiciorum + + + inſufficientiam Indiciorum ein Zweiffel obgewaltet htte, + + + + + + + + C + + + + r + + + + . + + Cr. + + + + + + + a + + + + r + + + + t + + + + . + + art. + + + + + + + 7 + + + + . + + 7. 
+ + + + + + + ſ + + + + e + + + + c + + + + u + + + + n + + + + d + + + + . + + ſecund. + + + + + + + O + + O + + + ſecund. O Cr. art. 7. + + + + + + + + h + + + + + + + + t + + + + t + + + + e + + + + n + + htten + + + + + + + m + + + + + + + + + + + + e + + + + n + + men + + + + + + + a + + + + n + + + + ſ + + + + o + + + + n + + + + + + + + e + + + + n + + anſonen + + + + + + + a + + + + b + + + + e + + + + r + + aber + + + + + + + b + + + + e + + + + y + + bey + + + + + + + d + + + + e + + + + r + + der + + + + + + + a + + + + u + + + + s + + + + w + + + + + + + + r + + + + t + + + + i + + + + g + + + + e + + auswrtige + + + + + + + R + + + + e + + + + + + + + t + + + + s + + + + g + + + + e + + + + l + + + + + + + + h + + + + r + + + + t + + + + e + + Retsgelhrte + + + + + + + b + + + + e + + + + f + + + + r + + + + a + + + + g + + + + e + + + + t + + befraget + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + , + + werden, + + + + + + + b + + + + l + + + + o + + + + ß + + + + e + + + + n + + bloßen + + + + + + + a + + + + + + + + i + + + + o + + + + n + + + + e + + aione + + + + + + + I + + + + n + + + + j + + + + u + + + + r + + + + i + + + + a + + + + - + + Injuria- + + + auswrtige Retsgelhrte htten men befraget werden, anſonen aber bey der bloßen aione Injuria- + + + + + + + + r + + + + u + + + + m + + rum + + + + + + + d + + + + e + + + + m + + dem + + + + + + + d + + + + i + + + + e + + die + + + + + + + C + + + + a + + + + u + + + + t + + + + i + + + + o + + + + n + + + + s + + Cautions + + + + + + + L + + + + e + + + + i + + + + + + + + u + + + + n + + + + g + + Leiung + + + + + + + u + + + + m + + um + + + + + + + d + + + + o + + do + + + + + + + w + + + + e + + + + n + + + + i + + + + g + + + + e + + + + r + + weniger + + + + + + + k + + + + o + + + + n + + + + n + + + + t + + + + e + + konnte + + + + + + + a + + + + u + + + + f + + + + e + + + + r + + + + l + + + + e + + + + g + + + + e + + + + t + + 
auferleget + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + , + + werden, + + + + + + + d + + + + a + + da + + + + + + + ſ + + + + o + + + + l + + + + + + + + e + + ſole + + + + + + + H + + + + o + + + + f + + + + r + + + + a + + + + t + + + + h + + Hofrath + + + + + + + S + + + + e + + + + n + + + + + + + + e + + + + n + + + + b + + + + e + + + + r + + + + g + + Senenberg + + + rum dem Hofrath Senenberg die Cautions Leiung um do weniger konnte auferleget werden, da ſole + + + + + + + + a + + + + u + + + + + + au + + + + + + + d + + + + e + + + + r + + der + + + + + + + A + + + + g + + + + r + + + + i + + + + c + + + + o + + + + l + + + + a + + Agricola + + + + + + + v + + + + o + + + + n + + von + + + + + + + w + + + + a + + + + r + + + + e + + ware + + + + + + + e + + + + r + + + + f + + + + o + + + + r + + + + d + + + + e + + + + r + + + + t + + erfordert + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + + + . + + worden. + + + + + + + b + + + + e + + + + y + + bey + + + + + + + d + + + + e + + + + r + + der + + + + + + + I + + + + n + + + + h + + + + a + + + + + + + + t + + + + i + + + + r + + + + u + + + + n + + + + g + + Inhafftirung + + + + + + I + + + + h + + + + m + + Ihm + + + + + + + k + + + + e + + + + i + + + + n + + + + e + + + + s + + + + w + + + + e + + + + g + + + + e + + + + s + + keinesweges + + + au bey der Inhafftirung der Agricola von Ihm keinesweges ware erfordert worden. + + §. 33) Dabeneben i der Sreiber binnen dieſer ganen Zeit auf freyem Fuß geblieben, und +hat nit nur dur ſeinen Conſulenten, ſondern au, weilen der Inquitin ſelben in Ihrem Gefngnß +ſo viele Freyheit gelaen worden, daß e frembden Beſu von Ihren Anverwandten ohngehindert em⸗ +pfangen knnen, dur andere Perſonen  mit ihr ber aes, was Er oder e dereinen zu ſagen hat⸗ +ten, vereinigen knnen, immaen der Hofrath Senenberg, als dieſer am 1. Oob. 
das Officium Judi- +cis gegen ihn zur ſatisfaione publica excitirete, vor  aber ratione injuriarum demſelben (eben § præced. +geſagter maen) eine Leibes⸗Straffe aufzulegen bate, vor aen Dingen, glei als ob Er ein peinlier +Anklger wre, und ohne indiciis denunciiret htte, +die auf dieſem Fa in ord. Cr. art. 12. vom peinlien Klger erforderte +Caution zu leien, auferleget worden, da man  do ex Ais (vid. §. 31. htte erſehen knnen, daß +hier von einer ohnzweiffentlien und offentlien Miethat die Frage obwalte, wobey dem Riter +in O. Cr. art. 16. +ein gan anderer ex Officio anzueender Proceß vorgesrieben wird, und aenfas, wenn ber die +inſufficientiam Indiciorum ein Zweiffel obgewaltet htte, +ſecund. O Cr. art. 7. +auswrtige Retsgelhrte htten men befraget werden, anſonen aber bey der bloßen aione Injuria- +rum dem Hofrath Senenberg die Cautions Leiung um do weniger konnte auferleget werden, da ſole +au bey der Inhafftirung der Agricola von Ihm keinesweges ware erfordert worden. + + + + + + + + + + + § + + § + + + + + + + 3 + + + + 4 + + + + ) + + 34) + + + + + + + Z + + + + w + + + + i + + + + ſ + + + + + + + + e + + + + n + + Zwiſen + + + + + + + d + + + + e + + + + m + + dem + + + + + + + C + + + + r + + + + i + + + + m + + + + i + + + + n + + + + e + + Crimine + + + + + + + f + + + + a + + + + l + + + + + + fal + + + + + + + u + + + + n + + + + d + + und + + + + + + + c + + + + o + + + + n + + + + c + + + + u + + + + + + + + o + + + + n + + + + i + + + + s + + concuonis + + + + + + + i + + + + + + i + + + § 34) Zwiſen dem Crimine fal und concuonis i + + + + + + + + ſ + + + + e + + + + c + + + + . + + ſec. + + + + + + + L + + + + A + + + + U + + + + T + + + + E + + + + R + + + + B + + + + . + + LAUTERB. + + + + + + + P + + + + + + + + . + + + + r + + + + a + + Pra. + + + + + + + L + + + + i + + + + b + + + + . + + Lib. + + + + + + + 1 + + + + 0 + + + + . + + 10. + + + + + + + § + + + + . + + §. 
+ + + + + + + 1 + + + + 6 + + + + . + + 16. + + + + + + + C + + + + o + + + + l + + + + l + + + + . + + Coll. + + + + + + + T + + + + h + + + + e + + + + o + + + + r + + + + . + + Theor. + + + + + + + 4 + + + + 8 + + + + . + + 48. + + + + + + + T + + + + i + + + + t + + + + . + + Tit. + + + ſec. LAUTERB. Coll. Theor. Pra. Lib. 48. Tit. 10. §. 16. + + + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + + ſ + + + + o + + ſo + + + + + + + g + + + + r + + + + o + + + + ß + + + + e + + große + + + + + + + e + + + + i + + + + n + + + + i + + + + e + + + + n + + + + g + + einigen + + + + + + + d + + + + e + + + + r + + der + + + + + + + C + + + + o + + + + n + + + + ſ + + + + p + + + + i + + + + r + + + + a + + + + t + + + + i + + + + o + + + + n + + + + i + + Conſpirationi + + + + + + + & + + & + + + + + + + w + + + + i + + + + e + + wie + + + + + + + i + + + + n + + in + + + + + + + l + + + + e + + + + g + + + + i + + + + b + + + + u + + + + s + + legibus + + + + + + + e + + + + i + + + + n + + + + e + + + + m + + einem + + + + + + + V + + + + e + + + + r + + + + w + + + + a + + + + n + + + + d + + + + ſ + + + + + + + + a + + + + + + + + t + + + + , + + Verwandſafft, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + ſ + + + + o + + ſo + + + + + + + g + + + + a + + + + r + + gar + + + + + + + V + + + + r + + + + r + + + + + + + + n + + + + e + + + + b + + + + e + + + + e + + + + + + + + eine ſo große Verwandſafft, daß ſo gar in legibus einem einigen Verbreen⸗ wie der Conſpirationi & + + + + + + + + ſ + + + + u + + + + b + + + + o + + + + a + + + + t + + + + i + + + + o + + + + n + + + + i + + + + r + + + + n + + ſubornationi + + + + + + + T + + + + e + + + + + + + + i + + + + u + + + + m + + Teium + + + + + + + b + + + + a + + + + l + + + + d + + bald + + + + + + + b + + + + e + + + + y + + + + g + + + + e + + + + l + + + + e + + + + g + + + + e + + + + t + + beygeleget + + + + + + + w + + + + i + + + + r + + + + d + + + + . + + wird. 
+ + + + + + + j + + + + e + + + + n + + + + e + + + + r + + jener + + + + + + + N + + + + a + + + + h + + + + m + + + + e + + Nahme + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + r + + dieſer + + + + + + + b + + + + a + + + + l + + + + d + + bald + + + ſubornationi Teium bald dieſer bald jener Nahme beygeleget wird. + + + + + + + + f + + + + a + + + + l + + + + ſ + + + + . + + falſ. + + + + + + + d + + + + e + + de + + + + + + + c + + + + o + + + + n + + + + c + + + + u + + + + + + concu + + + + + + + l + + + + . + + l. + + + + + + + 1 + + + + . + + 1. + + + + + + + L + + + + . + + L. + + + + + + + 2 + + + + . + + 2. + + + + + + + d + + + + e + + de + + + + + + + . + + + + L + + L. + + + + + + + C + + + + o + + + + r + + + + n + + + + e + + + + l + + + + . + + Cornel. + + + + + + + d + + + + e + + de + + + L. 2. de concu l. 1. de L. Cornel. de falſ. + + + + + + + + D + + + + a + + Da + + + + + + + n + + + + u + + + + n + + nun + + + + + + + e + + + + r + + + + w + + + + i + + + + e + + + + ſ + + + + e + + + + n + + erwieſen + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + worden + + + + + + + ( + + + + § + + + + . + + (§. + + + + + + + 2 + + + + 2 + + + + . + + + + ) + + 22.) 
+ + + + + + + u + + + + n + + + + d + + und + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + l + + + + b + + + + e + + + + , + + dieſelbe, + + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + + + + + + + + + e + + e + + + + + + + a + + + + u + + + + + + au + + + + + + + a + + + + + + + + ſ + + + + + + + + o + + + + n + + aſon + + + + + + + v + + + + + + + + + + + + i + + + + g + + vig + + + + + + + d + + + + e + + + + r + + der + + + + + + + I + + + + u + + + + i + + + + + + + + t + + + + n + + + + q + + + + i + + + + n + + Inquitin + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + s + + dieſes + + + + + + + C + + + + r + + + + i + + + + m + + + + e + + + + n + + Crimen + + + Da nun der Inquitin dieſes Crimen aſon vig erwieſen worden (§. 22.) und dieſelbe, wenn e au + + + + + + + + z + + + + u + + zu + + + + + + + e + + + + i + + + + n + + + + e + + + + m + + einem + + + + + + + w + + + + a + + + + h + + + + r + + + + e + + + + n + + wahren + + + + + + + h + + + + + + + + t + + + + t + + + + e + + + + , + + htte, + + + + + + + o + + + + h + + + + n + + + + g + + + + e + + + + + + + + a + + + + n + + + + d + + + + e + + + + n + + + + e + + + + n + + ohngeandenen + + + + + + + f + + + + a + + + + + + + + s + + fas + + + + + + + Z + + + + e + + + + u + + + + g + + + + n + + + + + + + + ß + + Zeugnß + + + + + + + b + + + + o + + + + i + + + + r + + + + e + + + + t + + + + r + + + + n + + + + ſ + + + + u + + ſuborniret + + + ohngeandenen fas zu einem wahren Zeugnß ſuborniret htte, + + + + + + + + 8 + + + + . + + 8. + + + + + + + ſ + + + + e + + + + c + + + + . + + ſec. + + + + + + + L + + + + A + + + + U + + + + T + + + + E + + + + R + + + + B + + + + . + + LAUTERB. + + + + + + + C + + + + o + + + + l + + + + l + + + + . + + Coll. + + + + + + + T + + + + h + + + + e + + + + o + + + + r + + + + . + + Theor. + + + + + + + P + + + + r + + + + a + + + + + + + + . + + Pra. + + + + + + + L + + + + . + + L. 
+ + + + + + + T + + + + . + + T. + + + + + + + 4 + + + + 8 + + + + . + + 48. + + + + + + + 1 + + + + 0 + + + + . + + 10. + + + + + + + § + + + + . + + §. + + + ſec. LAUTERB. Coll. Theor. Pra. L. 48. T. 10. §. 8. + + + + + + + + d + + + + e + + + + n + + + + n + + + + o + + + + + + denno + + + + + + + m + + + + i + + + + t + + mit + + + + + + + d + + + + e + + + + r + + der + + + + + + + p + + + + œ + + + + n + + + + a + + pœna + + + + + + + f + + + + a + + + + l + + + + + + + + , + + fal, + + + + + + + a + + + + l + + + + s + + als + + + + + + + f + + + + a + + + + l + + + + ſ + + + + u + + + + m + + falſum + + + + + + + + + + + e + + + + r + + + + i + + fieri + + + + + + + c + + + + u + + + + r + + + + a + + + + n + + + + s + + + + , + + curans, + + + denno mit der pœna fal, als falſum fieri curans, + + + + + + + + 3 + + + + . + + 3. + + + + + + + a + + + + d + + ad + + + + + + + f + + + + a + + + + l + + + + ſ + + + + . + + falſ. + + + + + + + ſ + + + + e + + + + c + + + + . + + ſec. + + + + + + + l + + + + . + + l. + + + + + + + § + + + + . + + §. + + + + + + + 9 + + + + . + + 9. + + + + + + + L + + + + . + + L. + + + + + + + d + + + + e + + de + + + + + + + C + + + + o + + + + r + + + + n + + + + . + + Corn. + + + ſec. l. 9. §. 3. ad L. Corn. de falſ. + + + + + + + + X + + + + . + + X. + + + + + + + . + + + + L + + L. + + + + + + + 4 + + + + . + + 4. + + + + + + + 8 + + + + . + + 8. + + + + + + + C + + + + . + + C. + + + + + + + 7 + + 7 + + + + + + + c + + + + . + + c. + + + + + + + d + + + + e + + de + + + + + + + f + + + + a + + + + l + + + + ſ + + + + . + + falſ. + + + L. 4. 8. C. c. 7 X. de falſ. 
+ + + + + + + + m + + + + e + + + + h + + + + r + + mehr + + + + + + + w + + + + i + + + + r + + + + d + + wird + + + + + + + d + + + + e + + + + r + + + + ſ + + + + e + + + + l + + + + b + + + + e + + + + n + + derſelben + + + + + + + u + + + + n + + + + d + + und + + + + + + + I + + + + h + + + + r + + + + e + + + + m + + Ihrem + + + + + + + C + + + + o + + + + m + + + + p + + + + l + + + + i + + + + c + + + + i + + Complici + + + + + + + B + + + + r + + + + e + + + + d + + + + e + + + + k + + + + a + + + + w + + Bredekaw + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + dieſe + + + + + + + S + + + + t + + + + r + + + + a + + + + + + + + e + + Straffe + + + + + + + a + + + + n + + + + g + + + + e + + + + d + + + + e + + + + y + + + + h + + + + e + + + + n + + angedeyhen + + + + + + + m + + + + + + + + + + + + e + + + + n + + + + , + + men, + + + + + + + d + + + + a + + da + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + l + + + + b + + + + e + + dieſelbe + + + + + + + e + + + + x + + + + t + + + + r + + + + a + + extra + + + mehr wird derſelben und Ihrem Complici Bredekaw dieſe Straffe angedeyhen men, da dieſelbe extra + + + + + + + + J + + + + u + + + + d + + + + i + + + + c + + + + i + + + + u + + + + m + + Judicium + + + + + + + + + + + e + + e + + + + + + + S + + + + e + + + + n + + + + + + + + e + + + + n + + + + b + + + + e + + + + r + + + + g + + Senenberg + + + + + + + m + + + + i + + + + t + + mit + + + + + + + ſ + + + + o + + ſo + + + + + + + g + + + + a + + + + r + + gar + + + + + + + m + + + + i + + + + t + + mit + + + + + + + P + + + + i + + + + + + + + o + + + + l + + + + e + + + + n + + Piolen + + + + + + + z + + + + u + + zu + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + G + + + + e + + + + w + + + + a + + + + l + + + + t + + + + + + Gewalt⸗ + + + + + + + u + + + + n + + + + d + + und + + + + + + + d + + + + e + + + + r + + der + + + + + + + H + + + + o + + + + f + + + + r + + + + a + + + + t + + + + h + 
+ Hofrath + + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + + b + + + + e + + + + h + + + + a + + + + u + + + + p + + + + t + + + + e + + + + t + + + + , + + behauptet, + + + Judicium bendig behauptet, daß e der Hofrath Senenberg mit Gewalt⸗ und ſo gar mit Piolen zu + + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + m + + ſeinem + + + + + + + W + + + + i + + + + + + + + e + + + + n + + Wien + + + + + + + g + + + + e + + + + z + + + + w + + + + u + + + + n + + + + g + + + + e + + + + n + + + + , + + gezwungen, + + + ſeinem Wien gezwungen, + + + + + + + + P + + + + r + + + + o + + + + t + + + + o + + + + c + + + + . + + Protoc. + + + + + + + I + + + + n + + + + q + + + + u + + + + i + + + + ſ + + + + . + + Inquiſ. + + + + + + + f + + + + o + + + + l + + + + . + + fol. + + + + + + + 7 + + + + 1 + + + + . + + 71. + + + + + + + b + + + + . + + b. + + + + + + + b + + + + . + + b. + + + + + + + 8 + + + + 2 + + + + . + + 82. + + + + + + + f + + + + o + + + + l + + + + . + + fol. + + + + + + + 2 + + + + 3 + + + + . + + 23. + + + + + + + a + + + + . + + a. + + + + + + + f + + + + o + + + + l + + + + . + + fol. + + + + + + 7 + + + + 3 + + + + . + + 73. + + + + + + b + + + + . + + b. + + + + + + a + + + + . + + a. + + Protoc. Inquiſ. fol. 71. b. fol. 73. b. 82. a. b. fol. 23. a. + + + + + + + + J + + + + u + + + + d + + + + i + + + + c + + + + i + + + + o + + + + , + + Judicio, + + + + + + + a + + + + u + + + + + + au + + + + + + + i + + + + n + + in + + + au in Judicio, + + + + + + + + m + + + + + + + + ß + + + + t + + + + e + + mßte + + + + + + + b + + + + e + + + + l + + + + e + + + + g + + + + e + + + + t + + beleget + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + , + + werden, + + + + + + + w + + + + e + + + + l + + + + + + + + e + + wele + + + + + + + d + + + + a + + + + n + + + + n + + dann + + + + + + + o + + + + b + + + + e + + + + n + + oben + + + + + + + . 
+ + + + ( + + + + § + + (§. + + + + + + + m + + + + a + + + + + + + + e + + + + n + + maen + + + + + + + d + + + + i + + + + e + + die + + + + + + + S + + + + t + + + + r + + + + a + + + + + + + + e + + Straffe + + + + + + + d + + + + e + + + + r + + der + + + + + + + E + + + + n + + + + t + + + + h + + + + a + + + + u + + + + p + + + + t + + + + g + + + + u + + + + n + + Enthauptung + + + + + + + v + + + + i + + + + e + + + + l + + + + + + viel⸗ + + + + + + + i + + + + + + + + , + + i, + + + + + + + w + + + + i + + + + e + + wie + + + + + + + g + + + + e + + + + ſ + + + + a + + + + g + + + + t + + + + e + + + + r + + geſagter + + + + + + + 3 + + + + 1 + + + + ) + + 31) + + mßte beleget werden, wele dann oben (§. 31) geſagter maen die Straffe der Enthauptung i, wie viel⸗ + + § 34) Zwiſen dem Crimine fal und concuonis i +ſec. LAUTERB. Coll. Theor. Pra. Lib. 48. Tit. 10. §. 16. +eine ſo große Verwandſafft, daß ſo gar in legibus einem einigen Verbreen⸗ wie der Conſpirationi & +ſubornationi Teium bald dieſer bald jener Nahme beygeleget wird. +L. 2. de concu l. 1. de L. Cornel. de falſ. +Da nun der Inquitin dieſes Crimen aſon vig erwieſen worden (§. 22.) und dieſelbe, wenn e au +ohngeandenen fas zu einem wahren Zeugnß ſuborniret htte, +ſec. LAUTERB. Coll. Theor. Pra. L. 48. T. 10. §. 8. +denno mit der pœna fal, als falſum fieri curans, +ſec. l. 9. §. 3. ad L. Corn. de falſ. +L. 4. 8. C. c. 7 X. de falſ. +mßte beleget werden, wele dann oben (§. 31) geſagter maen die Straffe der Enthauptung i, wie viel⸗ +mehr wird derſelben und Ihrem Complici Bredekaw dieſe Straffe angedeyhen men, da dieſelbe extra +Judicium bendig behauptet, daß e der Hofrath Senenberg mit Gewalt⸗ und ſo gar mit Piolen zu +ſeinem Wien gezwungen, +Protoc. Inquiſ. fol. 71. b. fol. 73. b. 82. a. b. fol. 23. a. 
+au in Judicio, + + + + + + + + + + + a + + + + n + + + + g + + + + e + + + + g + + + + e + + + + b + + + + e + + + + n + + + + e + + angegebene + + + + + + + b + + + + e + + + + l + + + + a + + + + n + + + + g + + + + e + + + + t + + + + , + + belanget, + + + + + + + ſ + + + + o + + ſo + + + + + + + m + + + + u + + + + ß + + muß + + + + + + + z + + + + w + + + + a + + + + r + + + + , + + zwar, + + + + + + + ſ + + + + o + + ſo + + + + + + + v + + + + i + + + + e + + + + l + + viel + + + + + + + T + + + + e + + + + + + + + . + + Te. + + + + + + + 1 + + + + . + + 1. + + + + + + + n + + + + e + + + + m + + + + l + + + + . + + neml. + + + + + + + d + + + + e + + + + s + + des + + + + + + + B + + + + u + + + + r + + + + g + + + + e + + + + e + + + + i + + + + + + + + e + + + + r + + + + s + + + + r + + + + m + + Burgermeiers + + + + + + + h + + + + o + + + + + + + + + + ho⸗ + + + + + + + Z + + + + e + + + + u + + + + g + + + + i + + + + n + + Zeugin + + + + + + + + + + + e + + + + r + + + + l + + + + t + + + + n + + ltern + + + + + + + H + + + + r + + + + . + + Hr. + + + angegebene Zeugin belanget, ſo muß zwar, ſo viel Te. 1. neml. des ltern Hr. 
Burgermeiers ho⸗ + + + + + + + + d + + + + e + + + + r + + der + + + + + + + H + + + + o + + + + f + + + + r + + + + a + + + + t + + + + h + + Hofrath + + + + + + + S + + + + e + + + + n + + + + + + + + e + + + + n + + + + b + + + + e + + + + r + + + + g + + Senenberg + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + m + + ſeinem + + + + + + + g + + + + r + + + + + + + + ß + + + + t + + + + e + + + + n + + grßten + + + + + + + d + + + + w + + + + e + + + + e + + + + ſ + + + + e + + + + n + + + + L + + + + i + + + + e + + Leidweeſen + + + + + + + b + + + + e + + + + k + + + + e + + + + n + + + + n + + + + e + + + + n + + + + , + + bekennen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + E + + + + r + + Er + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + l + + + + b + + + + e + + + + , + + dieſelbe, + + + + + + + w + + + + e + + + + b + + + + l + + + + o + + + + h + + + + g + + + + l + + + + . + + wohlgebl. + + + + + + + a + + + + e + + + + t + + + + r + + + + i + + + + + + + + b + + + + n + + + + t + + + + , + + anbetrifft, + + + wohlgebl. anbetrifft, der Hofrath Senenberg zu ſeinem grßten Leidweeſen bekennen, daß Er dieſelbe, + + + + + + + + a + + + + + + + + ( + + + + n + + (na + + + + + + + 1 + + + + 6 + + + + . + + 16. + + + + + + + v + + + + o + + + + r + + vor + + + + + + + e + + + + i + + + + n + + + + e + + + + n + + einen + + + + + + + m + + + + + + + + ß + + + + e + + + + , + + mße, + + + + + + + w + + + + o + + + + f + + + + e + + + + r + + + + n + + + + e + + woferne + + + + + + + j + + + + e + + + + d + + + + o + + + + + + jedo + + + + + + + a + + + + n + + + + n + + + + o + + + + + + anno + + + + + + + e + + + + i + + + + n + + ein + + + + + + + P + + + + r + + + + o + + + + + + Pro⸗ + + + + + + + 1 + + + + 7 + + + + . + + 17. + + + + + + + 1 + + + + 8 + + + + . + + 18. 
+ + + + + + + I + + + + n + + + + i + + + + m + + + + i + + + + c + + + + u + + + + m + + Inimicum + + + + + + a + + + + n + + + + g + + + + e + + + + b + + + + e + + + + n + + angeben + + + + + + + V + + + + e + + + + r + + + + a + + + + n + + + + l + + + + a + + + + + + + + u + + + + n + + + + g + + Veranlaung + + + + + + + § + + + + . + + §. + + + + + + + 1 + + + + 9 + + + + . + + + + ) + + 19.) + + (na Veranlaung §. 16. 17. 18. 19.) vor einen Inimicum angeben mße, woferne jedo anno ein Pro⸗ + + + + + + + + d + + + + e + + + + n + + den + + + + + + + H + + + + o + + + + f + + + + r + + + + a + + + + t + + + + h + + Hofrath + + + + + + + S + + + + e + + + + n + + + + + + + + e + + + + n + + + + b + + + + e + + + + r + + + + g + + Senenberg + + + + + + + + + + + a + + + + t + + + + t + + att + + + + + + + h + + + + a + + + + b + + + + e + + + + n + + haben + + + + + + + k + + + + + + + + n + + + + n + + + + t + + + + e + + + + , + + knnte, + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + t + + + + n + + + + i + + nit + + + + + + + c + + + + e + + + + ß + + ceß + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + ceß gegen den Hofrath Senenberg att haben knnte, und nit + + + + + + + + c + + + + o + + + + n + + + + t + + + + r + + + + a + + contra + + + + + + + O + + + + . + + O. + + + + + + + C + + + + r + + + + . + + Cr. + + + + + + + 1 + + + + 0 + + + + 0 + + + + . + + 100. + + + + + + + a + + + + r + + + + t + + + + . + + art. + + + contra O. Cr. art. 100. 
+ + + + + + + + e + + + + w + + + + i + + wie + + + + + + + ſ + + + + o + + + + n + + + + + + + + e + + + + n + + ſonen + + + + + + + a + + + + r + + + + t + + + + i + + + + c + + + + u + + + + l + + + + i + + articuli + + + + + + + i + + + + m + + + + p + + + + e + + + + r + + + + t + + + + i + + + + n + + + + e + + + + n + + + + t + + + + e + + + + s + + impertinentes + + + + + + + o + + + + d + + + + e + + + + r + + oder + + + + + + + d + + + + e + + + + r + + + + g + + + + l + + + + e + + + + i + + + + + + + + e + + + + n + + dergleien + + + + + + + a + + + + u + + + + + + au + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + o + + + + n + + von + + + + + + + h + + + + i + + + + e + + + + r + + hier + + + + + + + g + + + + e + + + + w + + + + + + + + h + + + + n + + + + l + + + + i + + + + + + + + , + + gewhnli, + + + + + + + I + + + + n + + + + t + + + + e + + + + r + + + + r + + + + o + + + + g + + + + a + + + + r + + + + i + + + + a + + + + t + + + + o + + Interrogatoria + + + + + + + z + + + + u + + + + g + + + + e + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + zugelaen, + + + wie ſonen hier gewhnli, articuli impertinentes oder dergleien Interrogatoria zugelaen, au die von + + + + + + + + d + + + + e + + + + r + + der + + + + + + + E + + + + x + + + + c + + + + e + + + + p + + + + t + + + + i + + + + o + + + + n + + + + i + + + + s + + Exceptionis + + + + + + + a + + + + n + + + + g + + + + e + + + + d + + + + r + + + + o + + + + h + + + + e + + + + t + + + + e + + angedrohete + + + + + + + o + + + + b + + + + j + + + + e + + + + + + + + i + + + + o + + objeio + + + + + + + a + + + + l + + + + t + + + + e + + + + r + + + + i + + + + u + + + + s + + alterius + + + + + + + c + + + + m + + + + i + + + + s + + + + r + + + + i + + + + i + + + + n + + criminis + + + + + + + Ä + + + + g + + + + r + + + + i + + + + c + + + + o + + + + l + + + + a + + Ägricola + + + + + + + m + + + + o + + + + d + + + + u + + + + m + + 
modum + + + + + + + p + + + + e + + + + r + + per + + + der Ägricola per modum Exceptionis angedrohete objeio alterius criminis + + + + + + + + ſ + + + + e + + + + c + + + + . + + ſec. + + + + + + + c + + + + a + + + + p + + + + . + + cap. + + + + + + + a + + + + c + + + + c + + + + e + + + + d + + + + e + + + + n + + + + s + + accedens + + + + + + + 2 + + + + 3 + + + + . + + 23. + + + + + + + X + + + + . + + X. + + + + + + + d + + + + e + + de + + + + + + + a + + + + c + + + + c + + + + u + + + + s + + + + . + + accus. + + + ſec. cap. accedens 23. X. de accus. + + + + + + + + d + + + + + + + + r + + + + + + + + t + + + + e + + + + n + + drfften + + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + l + + + + b + + + + e + + dieſelbe + + + + + + + v + + + + i + + + + e + + + + + + + + e + + + + i + + + + + + + + t + + vieeit + + + + + + + u + + + + m + + um + + + + + + + v + + + + e + + + + r + + + + n + + + + o + + + + m + + + + m + + + + e + + + + n + + vernommen + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + , + + werden, + + + + + + + w + + + + e + + + + i + + + + l + + + + e + + + + n + + weilen + + + + + + + a + + + + + + + + e + + + + s + + aes + + + + + + + o + + + + h + + + + n + + + + e + + + + + + ohne⸗ + + + + + + + n + + + + i + + + + + + + + t + + nit + + + + + + + z + + + + u + + + + g + + + + e + + + + l + + + + a + + + + + + + + e + + + + n + + zugelaen + + + + + + + w + + + + i + + + + r + + + + d + + + + , + + wird, + + + + + + + d + + + + o + + do + + + + + + + e + + + + h + + + + e + + + + n + + + + d + + + + e + + + + r + + ehender + + + nit zugelaen wird, drfften dieſelbe vieeit um do ehender vernommen werden, weilen aes ohne⸗ + + + + + + + + h + + + + i + + + + n + + hin + + + + + + + e + + + + x + + ex + + + + + + + O + + + + r + + + + i + + + + g + + + + i + + + + n + + + + a + + + + l + + + + i + + + + b + + + + u + + + + s + + Originalibus + + + + + + + z + + + + u + + zu + + + + + + + e + 
+ + + r + + + + w + + + + e + + + + i + + + + ſ + + + + e + + + + n + + erweiſen + + + + + + + + + + + e + + + + h + + + + e + + + + t + + + + . + + ehet. + + + hin ex Originalibus zu erweiſen ehet. + + angegebene Zeugin belanget, ſo muß zwar, ſo viel Te. 1. neml. des ltern Hr. Burgermeiers ho⸗ +wohlgebl. anbetrifft, der Hofrath Senenberg zu ſeinem grßten Leidweeſen bekennen, daß Er dieſelbe, +(na Veranlaung §. 16. 17. 18. 19.) vor einen Inimicum angeben mße, woferne jedo anno ein Pro⸗ +ceß gegen den Hofrath Senenberg att haben knnte, und nit +contra O. Cr. art. 100. +wie ſonen hier gewhnli, articuli impertinentes oder dergleien Interrogatoria zugelaen, au die von +der Ägricola per modum Exceptionis angedrohete objeio alterius criminis +ſec. cap. accedens 23. X. de accus. +nit zugelaen wird, drfften dieſelbe vieeit um do ehender vernommen werden, weilen aes ohne⸗ +hin ex Originalibus zu erweiſen ehet. + + + + + + + + + + + § + + + + . + + §. + + + + + + + 3 + + + + 6 + + + + ) + + 36) + + + + + + + W + + + + a + + + + s + + Was + + + + + + + v + + + + o + + + + n + + von + + + + + + + d + + + + e + + + + m + + dem + + + + + + + B + + + + r + + + + e + + + + d + + + + e + + + + k + + + + a + + + + w + + + + , + + Bredekaw, + + + + + + + d + + + + e + + + + r + + der + + + + + + + S + + + + e + + + + i + + + + + + + + i + + + + n + + Seiin + + + + + + + u + + + + n + + + + d + + und + + + + + + + d + + + + e + + + + r + + + + e + + + + n + + deren + + + + + + + S + + + + o + + + + h + + + + n + + Sohn + + + + + + + o + + + + b + + + + e + + + + n + + oben + + + + + + + ( + + + + § + + + + . + + (§. + + + + + + + 2 + + + + 5 + + + + . + + 25. + + + + + + + 2 + + + + 6 + + + + . + + 26. + + + + + + + 2 + + + + 7 + + + + . + + 27. + + + + + + + i + + + + + + i + + + + + + + h + + + + a + + + + l + + + + t + + + + e + + + + n + + + + , + + halten, + + + + + + + z + + + + u + + zu + + + §. 
36) Was von dem Bredekaw, der Seiin und deren Sohn zu halten, i oben (§. 25. 26. 27. + + + + + + + + 2 + + + + 8 + + + + . + + + + ) + + 28.) + + + + + + + e + + + + r + + + + i + + + + n + + + + n + + + + e + + + + r + + + + t + + erinnert + + + + + + + w + + + + o + + + + r + + + + d + + + + e + + + + n + + + + . + + worden. + + + 28.) erinnert worden. + + + + + + + + M + + + + e + + + + i + + + + n + + Mein + + + + + + + L + + + + a + + + + q + + + + u + + + + a + + + + y + + + + s + + Laquays + + + + + + + f + + + + G + + + + æ + + + + r + + Græf + + + + + + + d + + + + a + + + + r + + + + + + + + , + + darff, + + + + + + + w + + + + a + + + + n + + + + n + + wann + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + + + + + m + + + + i + + + + + + mi + + + + + + + a + + + + n + + + + n + + + + o + + + + + + anno + + + + + + + e + + + + i + + + + n + + ein + + + + + + + P + + + + r + + + + o + + + + c + + + + e + + + + ß + + Proceß + + + + + + + + + + + a + + + + t + + + + t + + att + + + + + + + h + + + + + + + + t + + + + t + + + + e + + + + , + + htte, + + + + + + + i + + + + m + + + + m + + + + e + + + + r + + + + h + + + + i + + + + n + + immerhin + + + + + + + v + + + + e + + + + r + + + + h + + + + + + + + h + + + + r + + + + e + + + + t + + verhhret + + + Mein Laquays Græf darff, wann gegen mi anno ein Proceß att htte, immerhin verhhret + + + + + + + + w + + + + e + + + + r + + + + d + + + + e + + + + n + + + + . + + werden. + + + werden. + + + + + + + + u + + + + n + + + + d + + und + + + + + + + d + + + + e + + + + r + + + + e + + + + n + + deren + + + + + + + M + + + + a + + + + n + + + + n + + Mann + + + + + + + h + + + + a + + + + b + + + + e + + + + n + + haben + + + + + + + a + + + + + + + + ſ + + + + + + + + o + + + + n + + aſon + + + + + + + g + + + + e + + + + g + + + + e + + + + n + + gegen + + + + + + + a + + + + u + + + + s + + + + g + + + + e + + + + ſ + + + + a + + + + g + + + + t + + + + . + + ausgeſagt. 
+ + + + + + + D + + + + i + + + + e + + Die + + + + + + + W + + + + a + + + + g + + + + n + + + + e + + + + r + + + + i + + + + n + + Wagnerin + + + + + + + d + + + + i + + + + e + + die + + + + + + + I + + + + n + + + + q + + + + + + + + t + + + + i + + + + n + + + + u + + + + i + + Inquitin + + + Die Wagnerin und deren Mann haben aſon gegen die Inquitin ausgeſagt. + + + + + + + + s + + + + b + + + + e + + + + r + + + + e + + + + i + + + + t + + bereits + + + + + + + D + + + + e + + + + r + + Der + + + + + + + S + + + + + + + + n + + + + i + + + + t + + + + h + + + + e + + + + i + + + + ß + + Snitheiß + + + + + + + O + + + + b + + + + e + + + + r + + + + r + + + + o + + + + d + + + + , + + Oberrod, + + + + + + + z + + + + u + + zu + + + + + + + d + + + + e + + + + r + + der + + + + + + + i + + + + r + + + + t + + + + h + + + + W + + Wirth + + + + + + + K + + + + r + + + + e + + + + b + + + + s + + Krebs + + + + + + + u + + + + n + + + + d + + und + + + + + + + H + + + + r + + + + . + + Hr. + + + + + + + N + + + + o + + + + t + + + + a + + + + u + + + + s + + + + r + + + + i + + Notarius + + + + + + + T + + + + b + + + + e + + + + r + + + + t + + + + r + + + + i + + Tribert + + + + + + + + + + + n + + + + d + + nd + + + Der Snitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert nd bereits + + + + + + + + a + + + + b + + + + g + + + + e + + + + h + + + + + + + + r + + + + e + + + + t + + + + . + + abgehret. + + + abgehret. + + §. 36) Was von dem Bredekaw, der Seiin und deren Sohn zu halten, i oben (§. 25. 26. 27. +28.) erinnert worden. +Mein Laquays Græf darff, wann gegen mi anno ein Proceß att htte, immerhin verhhret +werden. +Die Wagnerin und deren Mann haben aſon gegen die Inquitin ausgeſagt. +Der Snitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert nd bereits +abgehret. 
+ diff --git a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml new file mode 100644 index 0000000..3e3489d --- /dev/null +++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml @@ -0,0 +1,581 @@ + + + + Vahid + 2019-06-17T18:15:12 + 2019-06-17T18:15:12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Die + + + + Die + + + + + + + + + + + + + + + + + + + + + + + + + + 122.1 + + + + + + ein gleiches vorgegeben, und ſo gar ſehr viele mahle gegen alle menſchliche Moͤglichkeit mit Gewalt lor- + + + + + + ciret worden zu ſeyn, behaupten will, mithin nebſt dem Bredekaw, welcher §. 28. 29.) ſich in allen ſeinen + + + + + + Auſſagen wiederſprochen, mit der lœna falſi um do gewiſſer zu belegen iſt, da + + + + + + ſecund. Fatin. Iit. 9. qu. 66. p. m. 320. + + + + + + die Klage ſo wohl als das Zeugnuͤß vos falſch und erdichtet muͤßen gehalten werden. + + + + + + .35) So viel die von der Inquiſitin + + + + + + 22. + + + + + + angegebene Zeugin belanget, ſo muß zwar, ſo viel Telt. 1. neml. des aͤltern Hr. Burgermeiſters hoch⸗ + + + + + + wohltebl. anbetrifft, der Hofraͤth Senckenberg zu ſeinem groͤßten Leidweeſen bekennen, daß Er dieſelbe, + + + + + + (nach Veranlaſſungs. 16. 17.18. 9.) vor einen lnimicum angeben muͤße, woferne jedoch annoch ein Pro⸗ + + + + + + ceß gegen den Hofrath Senckenberg ſtatt haben koͤnnte, und nicht + + + + + + contra O. Cr. art. 100. + + + + + + wie ſonſten hier gewoͤhnlich, arriculi impertinentes oder dergleichen Intertogatoria zugelaſſen, auch die von + + + + + + der Agricolu ptr modum Exceptionis antedrohete objectio alterius Criminis + + + + + + ſec. cap. accedens 23. X. de accus. 
+ + + + + + nicht zugelaſſen wird, duͤrfften dieſelbe vielleicht um do ehender vernommen werden, weilen alles ohne⸗ + + + + + + hin ex Originaiibus zu erweiſen ſtehet. + + + + + + §. 36) Was von dem Bredekaw, der Seitzin und deren Sohn zu halten, iſt oben (. 25. 26. 27. + + + + + + 28.) erinnert. worden. + + + + + + Mein Laquays Græf darff, wann gegen mich annoch ein Proceß ſtatt haͤtte, immerhin verhoͤhret + + + + + + werden. + + + + + + Die Wagnerin und deren Mann haben allſchon gegen die Inquiſitin ausgeſagt + + + + + + Der Schnitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert ſind bereits + + + + + + abgehoͤret. + + + + +122.1 +ein gleiches vorgegeben, und ſo gar ſehr viele mahle gegen alle menſchliche Moͤglichkeit mit Gewalt lor- +ciret worden zu ſeyn, behaupten will, mithin nebſt dem Bredekaw, welcher §. 28. 29.) ſich in allen ſeinen +Auſſagen wiederſprochen, mit der lœna falſi um do gewiſſer zu belegen iſt, da +ſecund. Fatin. Iit. 9. qu. 66. p. m. 320. +die Klage ſo wohl als das Zeugnuͤß vos falſch und erdichtet muͤßen gehalten werden. +.35) So viel die von der Inquiſitin +22. +angegebene Zeugin belanget, ſo muß zwar, ſo viel Telt. 1. neml. des aͤltern Hr. Burgermeiſters hoch⸗ +wohltebl. anbetrifft, der Hofraͤth Senckenberg zu ſeinem groͤßten Leidweeſen bekennen, daß Er dieſelbe, +(nach Veranlaſſungs. 16. 17.18. 9.) vor einen lnimicum angeben muͤße, woferne jedoch annoch ein Pro⸗ +ceß gegen den Hofrath Senckenberg ſtatt haben koͤnnte, und nicht +contra O. Cr. art. 100. +wie ſonſten hier gewoͤhnlich, arriculi impertinentes oder dergleichen Intertogatoria zugelaſſen, auch die von +der Agricolu ptr modum Exceptionis antedrohete objectio alterius Criminis +ſec. cap. accedens 23. X. de accus. +nicht zugelaſſen wird, duͤrfften dieſelbe vielleicht um do ehender vernommen werden, weilen alles ohne⸗ +hin ex Originaiibus zu erweiſen ſtehet. +§. 36) Was von dem Bredekaw, der Seitzin und deren Sohn zu halten, iſt oben (. 25. 26. 27. +28.) erinnert. worden. 
+Mein Laquays Græf darff, wann gegen mich annoch ein Proceß ſtatt haͤtte, immerhin verhoͤhret +werden. +Die Wagnerin und deren Mann haben allſchon gegen die Inquiſitin ausgeſagt +Der Schnitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert ſind bereits +abgehoͤret. + + + + + + + + auch in Judicio, + + + + auch in Judicio, + + + + + + + + § 34) Zwiſchen dem Crimine falſi und concuſſionis iſt + + + + + + iec. LATERB. Coll. Theot. Pract. Lib. 48. Tit. 10. §. 16 + + + + + + eine ſo große Verwandſchafft, daß ſo gar in legibus einem einigen Verdrechen⸗wie der Conſpirationi & + + + + + + ſubornationi Teſtium bald dieſer bald jenet Nahme beygeleget wird. + + + + + + L. 2. de concuſl. I. ornel. de fall. + + + + + + Da nun der Inquiſitin dieſes Crimen allſchon voͤllig erwieſen worden (. 22.) und dieſelbe, wenn fie auch + + + + + + ohngeſtandenen falls zu einem wahren Zeugnuͤß luborniret haͤtte, + + + + + + ſee. LAETERs. Coli. Theor. Pract. L. 48. T. 10. §. 8. + + + + + + dennoch mit der pœna falſi, als fallum fieri curans, + + + + + + ſec. l. 0. 6. 3. ad L. Corn. de fali. + + + + + + L. 4. 8. C. e. 7 X. de fali. + + + + + + muͤßte beleget werden, welche dann oben (. 3i) geſagter maſſen die Straffe der Enthauptung iſt/ wie viel⸗ + + + + + + mehr wird derſelben und Jhrem Complici Bredeka dieſe Straffe angedeyhen muſſen, da dieſelbe extra + + + + + + Judicium beſtaͤndig behauptet, daß ſie der Hofrath Senckenberg mit Gewalt⸗und ſo gar mit Piſtolen zu + + + + + + ſeinem Willen gezwungen, + + + + + + Protoc. Inquiſ. fol. 71. b. fol.73. b. 82. a. b. fol. 23. a + + + + + + + + + + § 34) Zwiſchen dem Crimine falſi und concuſſionis iſt +iec. LATERB. Coll. Theot. Pract. Lib. 48. Tit. 10. §. 16 +eine ſo große Verwandſchafft, daß ſo gar in legibus einem einigen Verdrechen⸗wie der Conſpirationi & +ſubornationi Teſtium bald dieſer bald jenet Nahme beygeleget wird. +L. 2. de concuſl. I. ornel. de fall. +Da nun der Inquiſitin dieſes Crimen allſchon voͤllig erwieſen worden (. 22.) 
und dieſelbe, wenn fie auch +ohngeſtandenen falls zu einem wahren Zeugnuͤß luborniret haͤtte, +ſee. LAETERs. Coli. Theor. Pract. L. 48. T. 10. §. 8. +dennoch mit der pœna falſi, als fallum fieri curans, +ſec. l. 0. 6. 3. ad L. Corn. de fali. +L. 4. 8. C. e. 7 X. de fali. +muͤßte beleget werden, welche dann oben (. 3i) geſagter maſſen die Straffe der Enthauptung iſt/ wie viel⸗ +mehr wird derſelben und Jhrem Complici Bredeka dieſe Straffe angedeyhen muſſen, da dieſelbe extra +Judicium beſtaͤndig behauptet, daß ſie der Hofrath Senckenberg mit Gewalt⸗und ſo gar mit Piſtolen zu +ſeinem Willen gezwungen, +Protoc. Inquiſ. fol. 71. b. fol.73. b. 82. a. b. fol. 23. a + + + + + + + + + rath mit einer Pena fiſcali angeſehen worden, und ſolche durch des Hrn. Graffen von Koͤnigsfeld Vor⸗ + + + + + + ſpruch, nur aus Gnaden nachgelaſſen erhalten. + + + + + + Sondern man hat auch dieſen 4. Wochen lang alle Abend bey der Inquiſitin gantz allein gelaſſen + + + + + + Binnen welcher gantzer Zeit der Schreiber Bredeka beſtaͤndig bey Jhme geweſen, und ſich in + + + + + + der am 1ten ctohr. a. c. in fudicio gegen ſeinen geweſenen Hrn. intröducirter Appellation deſſen Bey⸗ + + + + + + raths bedienet hat; + + + + + + .33) Dabenebenſt iſt der Schreiber binnen dieſer gantzen Zeit auf freyem Fuß geblieben, und + + + + + + hat nicht nur durch ſeinen Conluletnten, ſondern auch, weilen der Inquilitih ſelbſten in Jhrem Gefaͤngnuͤß + + + + + + ſo viele Freyheit gelaſſen worden, daß ſie frembden Beſuch von Jhren Anverwandten ohngehindert em⸗ + + + + + + pfangen koͤnnen, durch andere Perſonen ſich mit ihr uͤber alles, was Er oder ſie dereinſten zu ſagen hat⸗ + + + + + + ten, vereinigen koͤnnen, immaſſen der Hofrath Senckenberg, als dieſer am 1. Octob. das Officiam Jèi. + + + + + + cis gegen ihn zur ſatisfactione publica excitirete, vor ſich aber ratione injuriarum demſelben (eben § præced. 
+ + + + + + geſagter maſſen) eine Leibes⸗Straffe aufzulegen bate, vor allen Dingen, gleich als ob Er ein peinlicher + + + + + + Anklaͤger waͤre, und ohne indiciis denuneiiret haͤtte, + + + + + + ie dauf dieſem Fall inioid. Cr. art. 12. vom peinlichen Klaͤger erforderte + + + + + + Caution zu leiſten, auferleget worden, da man ſich doch ex Actis (vid. §. 31. haͤtte erſehen koͤnnen, daß + + + + + + hier von einer ohnzweiffentlichen und offentlichen Miſſethat die Frage obwalte, wobey den Richter + + + + + + in O. Cr. art. 16. + + + + + + in gantz anderer e Oficio anzuſtellender Proceß vorgeſchrieben wird und allenfalls, wenn uͤber die + + + + + + inſufficientia Iidiciorum ein Zweiffel obgewaltet haͤtte, + + + + + + ſeeund. O Cr. art. 7. + + + + + + auswaͤrtige Rechtsgelaͤhrte haͤtten muͤſſen befraget werden, anſonſten aber bey der bloßen actione Injuria- + + + + + + rum dem Hofrath Senckenberg die Cautions Leiſtung um do weniger konnte auferleget werden, da ſolche + + + + + + auch bey der Inhafftirung der Agricola von Jhm keinesweges ware erfordert worden. + + + + rath mit einer Pena fiſcali angeſehen worden, und ſolche durch des Hrn. Graffen von Koͤnigsfeld Vor⸗ +ſpruch, nur aus Gnaden nachgelaſſen erhalten. +Sondern man hat auch dieſen 4. Wochen lang alle Abend bey der Inquiſitin gantz allein gelaſſen +Binnen welcher gantzer Zeit der Schreiber Bredeka beſtaͤndig bey Jhme geweſen, und ſich in +der am 1ten ctohr. a. c. in fudicio gegen ſeinen geweſenen Hrn. 
intröducirter Appellation deſſen Bey⸗ +raths bedienet hat; +.33) Dabenebenſt iſt der Schreiber binnen dieſer gantzen Zeit auf freyem Fuß geblieben, und +hat nicht nur durch ſeinen Conluletnten, ſondern auch, weilen der Inquilitih ſelbſten in Jhrem Gefaͤngnuͤß +ſo viele Freyheit gelaſſen worden, daß ſie frembden Beſuch von Jhren Anverwandten ohngehindert em⸗ +pfangen koͤnnen, durch andere Perſonen ſich mit ihr uͤber alles, was Er oder ſie dereinſten zu ſagen hat⸗ +ten, vereinigen koͤnnen, immaſſen der Hofrath Senckenberg, als dieſer am 1. Octob. das Officiam Jèi. +cis gegen ihn zur ſatisfactione publica excitirete, vor ſich aber ratione injuriarum demſelben (eben § præced. +geſagter maſſen) eine Leibes⸗Straffe aufzulegen bate, vor allen Dingen, gleich als ob Er ein peinlicher +Anklaͤger waͤre, und ohne indiciis denuneiiret haͤtte, +ie dauf dieſem Fall inioid. Cr. art. 12. vom peinlichen Klaͤger erforderte +Caution zu leiſten, auferleget worden, da man ſich doch ex Actis (vid. §. 31. haͤtte erſehen koͤnnen, daß +hier von einer ohnzweiffentlichen und offentlichen Miſſethat die Frage obwalte, wobey den Richter +in O. Cr. art. 16. +in gantz anderer e Oficio anzuſtellender Proceß vorgeſchrieben wird und allenfalls, wenn uͤber die +inſufficientia Iidiciorum ein Zweiffel obgewaltet haͤtte, +ſeeund. O Cr. art. 7. +auswaͤrtige Rechtsgelaͤhrte haͤtten muͤſſen befraget werden, anſonſten aber bey der bloßen actione Injuria- +rum dem Hofrath Senckenberg die Cautions Leiſtung um do weniger konnte auferleget werden, da ſolche +auch bey der Inhafftirung der Agricola von Jhm keinesweges ware erfordert worden. 
+ + + + + + + + 20 + + + + 20 + + + + + + + + ss ( 0)8 + + + + ss ( 0)8 + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml new file mode 100644 index 0000000..ec51b6f --- /dev/null +++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml @@ -0,0 +1,588 @@ + + + + Vahid + 2019-06-17T18:15:12 + 2019-06-17T18:15:12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Die + + + + Die + + + + + + + + + + + + + + + + + + + + + + + + + + 122.) + + + + + + ein gleiches vorgegeben, und ſo gar ſehr viele mahſ Fgen alle menſchliche. Moͤglichkeit mit (Gewalt for + + + + + + ciret worden zu ſeyn, behaupten will, mithin nebſt dem Bredeka, welcher (8. 28. 294) ſich in allen ſeinen + + + + + + Auſſagen wiederſprochen, mit der bæna falli um do gewiſſer zu belegen iſt, da + + + + + + lecund. Patim. Lit. 9. qu. 66. p. m. 320. + + + + + + die Klage ſo wohl als das Zeugnuͤß vos faͤlſch uͤnd erdichtet muͤßen gehalten werden. + + + + + + 35) So viel die von der Inquiſitin, + + + + + + + + + + + + angegebene Zeugin belanget, ſo muß zwar, ſo viel Teſt. .. neml. des aͤltern Hr. Burgermeiſters boch⸗ + + + + + + wohlaebl. anbetrifft, der Hofrath Senckenberg zu ſeinem groͤßten Leidweeſen bekennen, daß Er dieſelbe, + + + + + + (nach Veranlaſſung 8. 16. 17. 18. 9.) vor einen laimicum angeben muͤße, woferne jedoch annoch ein Pro⸗ + + + + + + ceß getzen den Hofrath Senckenberg ſtatt haben koͤnnte, und nicht + + + + + + contta GO. Cr. art. 100. + + + + + + wie ſonſten hier gewoͤhnlich, articuli impertinentes oder dergleichen Intertogaroria zugelaſſen, auch die von + + + + + + der Agricola per modum Exceptionis angedrohete objectio alterius Criminis + + + + + + ſec. cap. accedens 23. X. de accus. 
+ + + + + + nicht zugelaſſen wird, duͤrfften dieſelbe vielleicht um do ehender vernommen werden, weilen alles ohne⸗ + + + + + + Hin ex Originaiibus zu erweifen Rehet. + + + + + + F. 36) Was von dem hredekaw, der Seitzin und deren Sohn zu halten, iſt oben (8. 25. 26. 27 + + + + + + 28. ) erinnert worden. 4322 + + + + + + M̃ein baquays Græl darffwann gegen mich annoch ein Proceß ſtatt haͤtte, immerhin verhoͤhre + + + + + + vwerden. + + + + + + Die Wagnerin und deren Mann haben allſchon gegen die Inquiſitu ausgeſagt + + + + + + Der Schnitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert ſind bereits + + + + + + abgehoͤret. — + + + + +122.) +ein gleiches vorgegeben, und ſo gar ſehr viele mahſ Fgen alle menſchliche. Moͤglichkeit mit (Gewalt for +ciret worden zu ſeyn, behaupten will, mithin nebſt dem Bredeka, welcher (8. 28. 294) ſich in allen ſeinen +Auſſagen wiederſprochen, mit der bæna falli um do gewiſſer zu belegen iſt, da +lecund. Patim. Lit. 9. qu. 66. p. m. 320. +die Klage ſo wohl als das Zeugnuͤß vos faͤlſch uͤnd erdichtet muͤßen gehalten werden. +35) So viel die von der Inquiſitin, + +angegebene Zeugin belanget, ſo muß zwar, ſo viel Teſt. .. neml. des aͤltern Hr. Burgermeiſters boch⸗ +wohlaebl. anbetrifft, der Hofrath Senckenberg zu ſeinem groͤßten Leidweeſen bekennen, daß Er dieſelbe, +(nach Veranlaſſung 8. 16. 17. 18. 9.) vor einen laimicum angeben muͤße, woferne jedoch annoch ein Pro⸗ +ceß getzen den Hofrath Senckenberg ſtatt haben koͤnnte, und nicht +contta GO. Cr. art. 100. +wie ſonſten hier gewoͤhnlich, articuli impertinentes oder dergleichen Intertogaroria zugelaſſen, auch die von +der Agricola per modum Exceptionis angedrohete objectio alterius Criminis +ſec. cap. accedens 23. X. de accus. +nicht zugelaſſen wird, duͤrfften dieſelbe vielleicht um do ehender vernommen werden, weilen alles ohne⸗ +Hin ex Originaiibus zu erweifen Rehet. +F. 36) Was von dem hredekaw, der Seitzin und deren Sohn zu halten, iſt oben (8. 25. 26. 27 +28. ) erinnert worden. 
4322 +M̃ein baquays Græl darffwann gegen mich annoch ein Proceß ſtatt haͤtte, immerhin verhoͤhre +vwerden. +Die Wagnerin und deren Mann haben allſchon gegen die Inquiſitu ausgeſagt +Der Schnitheiß zu Oberrod, der Wirth Krebs und Hr. Notarius Tribert ſind bereits +abgehoͤret. — + + + + + + + + auch in Judicio, + + + + auch in Judicio, + + + + + + + + § 34) Zwiſchen dem Crimine falſi und concuſſionis iſt + + + + + + ſec. LAUTERB. Coll. Theor. Pract. Lib. 48. Tit. 10. . 16 + + + + + + eine ſo große Verwandſchafft, daß ſo gar in legibus einem einigen Verbrechen⸗wie der Conſpirationi & + + + + + + ſubornationi Teſtium bald dieferbald jener Nahme beygelegetwird. + + + + + + L. 2. de concuiſſ- i. r. del. Cornel. ie Eli. + + + + + + Da nun der Inquiſitin dieſes Crimen allſchon vollig erwieſen worden (S. 224 und dieſelbe, wenn fie auch + + + + + + ohngeſtandenen falls zu einem wahren Zeugnuͤß luborniret haͤtte, + + + + + + ſee. LATERR. Coll. Theor. Pract. L. 48. T. 10. S. 8 + + + + + + dennoch mit der pœna falſi, als falium fieri curans, + + + + + + ſec. l. a. G. 3. ad L. Corn. de fali. + + + + + + L. 4. S. C. C. 7 X. de fallilil + + + + + + muͤßte beleget werden, welche dann oben (8. 31) geſagter maſſen die Straffe der Enthauptung iſt/ wie viel⸗ + + + + + + mehr wird derſelben und Jhrem Complici Bredekae dieſe Straffe angedeyhen muͤſſen, da dieſelbe excra + + + + + + Judicium beſtaͤndig behauptet daß ſie der Hofrath Senckenberg mit Gewalt⸗ und fo gar mit Piſtolen zu + + + + + + ſeinem Willen gezwungen + + + + + + Protoc. Inquiſ. fol. 71. b. fo3. b. 82. à. B. fol. 23. à. + + + + + + —? — + + + + § 34) Zwiſchen dem Crimine falſi und concuſſionis iſt +ſec. LAUTERB. Coll. Theor. Pract. Lib. 48. Tit. 10. . 16 +eine ſo große Verwandſchafft, daß ſo gar in legibus einem einigen Verbrechen⸗wie der Conſpirationi & +ſubornationi Teſtium bald dieferbald jener Nahme beygelegetwird. +L. 2. de concuiſſ- i. r. del. Cornel. ie Eli. 
+Da nun der Inquiſitin dieſes Crimen allſchon vollig erwieſen worden (S. 224 und dieſelbe, wenn fie auch +ohngeſtandenen falls zu einem wahren Zeugnuͤß luborniret haͤtte, +ſee. LATERR. Coll. Theor. Pract. L. 48. T. 10. S. 8 +dennoch mit der pœna falſi, als falium fieri curans, +ſec. l. a. G. 3. ad L. Corn. de fali. +L. 4. S. C. C. 7 X. de fallilil +muͤßte beleget werden, welche dann oben (8. 31) geſagter maſſen die Straffe der Enthauptung iſt/ wie viel⸗ +mehr wird derſelben und Jhrem Complici Bredekae dieſe Straffe angedeyhen muͤſſen, da dieſelbe excra +Judicium beſtaͤndig behauptet daß ſie der Hofrath Senckenberg mit Gewalt⸗ und fo gar mit Piſtolen zu +ſeinem Willen gezwungen +Protoc. Inquiſ. fol. 71. b. fo3. b. 82. à. B. fol. 23. à. +—? — + + + + + + + + rath mit einer Pœna fiſcali angeſehen worden, und ſolche durch des Hun. Graffen von Koͤnigsfeld Vor⸗ + + + + + + ſpruch, nur aus Gnaden nachgelaſſen erhalten. + + + + + + Sondern man hat auch dieſen 4. Wochen lang alle Abend bey der Inquifitin gantz allein gelaſſen + + + + + + Binnen welcher gantzer Zeit der Schreiber Bredekaw beſtaͤndig bey Jhme geweſen, und ſich in + + + + + + der am 13ten Octobr. a.c. in ſudicio gegen ſeinen geweſenen Hrn. intröducirter Appellation deſſen Bey⸗ + + + + + + raths bedienel hat; + + + + + + 9.35) Dabenebenſt iſt der Schreiber binnen dieſer gantzen Zeit auf freyem Fuß geblieben, und + + + + + + hat nicht nur durch ſeinen Conulenten, ſondern auch, weilen der Inquilitih ſelbſten in Jhrem Gefaͤngnuͤß + + + + + + ſo viele Freyheit gelaſſen worden, daß ſie frembden Beſuch von Jhren Anverwandten ohngehindert en— + + + + + + pfangen koͤnnen, durch andere Perſonen ſich mit ihr uͤber alles, was Er oder ſie dereinſten zu ſagen hat— + + + + + + ten, vereinigen koͤnnen, immaſſen der Hofrath Senckenberg, als dieſer am 1. Octob. das Officium: Judi + + + + + + cis gegen ihn zur ſatisfactione publica excitirete, vor ſich aber ratione injuriarum demſelben (eben 8 præced. 
+ + + + + + geſagter maſfen) eine Leibes⸗Straͤffe aufzulegen bate, vor allen Dingen, gleich als ob Errein peinlicher + + + + + + Anklaͤger waͤre, und ohne indiciis denuneiiret hatte, + + + + + + it Wauf dieſem Fall inord. Cr. art. 12. vom peinlichen Klaͤger erforderte + + + + + + Caution zu leiſten, auferleget worden, da man ſich doch ex Actis (id. S. 31. haͤtte erſehen koͤnnen, daß + + + + + + hier von einer ohnzweiffentlichen und offentlichen Miſſethat die Frage obwalte, wobey dem Richter + + + + + + in O. Cr. art. 16. + + + + + + ein gantz anderer ex Oflcio anzuſtellendet Proceß vorgeſchrieben wird, und allenfalls/ wenn uͤber die + + + + + + inſuſticientiam Iidiciorum ein Zweiffel obgewaltet haͤtte, + + + + + + ſeeund. O Cr. art. 7. + + + + + + auswaͤrtige Rechtsgelaͤhrte haͤtten muͤſſen beftaget werden, anſonſten aber bey der bloßen actione Jnjuria. + + + + + + rum dem Hofrath Senckenberg die Cautions Leiſtung um do weniger konnte auferleget werden, da ſolche + + + + + + auch bey der Inhafftirung der Agricola von Jhm keinesweges ware erfordert worden. + + + + rath mit einer Pœna fiſcali angeſehen worden, und ſolche durch des Hun. Graffen von Koͤnigsfeld Vor⸗ +ſpruch, nur aus Gnaden nachgelaſſen erhalten. +Sondern man hat auch dieſen 4. Wochen lang alle Abend bey der Inquifitin gantz allein gelaſſen +Binnen welcher gantzer Zeit der Schreiber Bredekaw beſtaͤndig bey Jhme geweſen, und ſich in +der am 13ten Octobr. a.c. in ſudicio gegen ſeinen geweſenen Hrn. 
intröducirter Appellation deſſen Bey⸗ +raths bedienel hat; +9.35) Dabenebenſt iſt der Schreiber binnen dieſer gantzen Zeit auf freyem Fuß geblieben, und +hat nicht nur durch ſeinen Conulenten, ſondern auch, weilen der Inquilitih ſelbſten in Jhrem Gefaͤngnuͤß +ſo viele Freyheit gelaſſen worden, daß ſie frembden Beſuch von Jhren Anverwandten ohngehindert en— +pfangen koͤnnen, durch andere Perſonen ſich mit ihr uͤber alles, was Er oder ſie dereinſten zu ſagen hat— +ten, vereinigen koͤnnen, immaſſen der Hofrath Senckenberg, als dieſer am 1. Octob. das Officium: Judi +cis gegen ihn zur ſatisfactione publica excitirete, vor ſich aber ratione injuriarum demſelben (eben 8 præced. +geſagter maſfen) eine Leibes⸗Straͤffe aufzulegen bate, vor allen Dingen, gleich als ob Errein peinlicher +Anklaͤger waͤre, und ohne indiciis denuneiiret hatte, +it Wauf dieſem Fall inord. Cr. art. 12. vom peinlichen Klaͤger erforderte +Caution zu leiſten, auferleget worden, da man ſich doch ex Actis (id. S. 31. haͤtte erſehen koͤnnen, daß +hier von einer ohnzweiffentlichen und offentlichen Miſſethat die Frage obwalte, wobey dem Richter +in O. Cr. art. 16. +ein gantz anderer ex Oflcio anzuſtellendet Proceß vorgeſchrieben wird, und allenfalls/ wenn uͤber die +inſuſticientiam Iidiciorum ein Zweiffel obgewaltet haͤtte, +ſeeund. O Cr. art. 7. +auswaͤrtige Rechtsgelaͤhrte haͤtten muͤſſen beftaget werden, anſonſten aber bey der bloßen actione Jnjuria. +rum dem Hofrath Senckenberg die Cautions Leiſtung um do weniger konnte auferleget werden, da ſolche +auch bey der Inhafftirung der Agricola von Jhm keinesweges ware erfordert worden. 
+ + + + + + + + 20 + + + + 20 + + + + + + + + e CG 54 + + + + e CG 54 + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml new file mode 100644 index 0000000..a6804ca --- /dev/null +++ b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml @@ -0,0 +1,287 @@ + + + + + Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015 + Goobi + + + + + + + + DE-1 + 4" Fy 11178 + + + + Hanau + + 1749 + + + + Berlin + + 2012 + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany + [Electronic ed.] + + Historische Drucke + Rechtswissenschaft + VD18 digital + + PPN718448162 + + http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000 + 11750219 + PPN370506340 + + Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti + Worinnen allen unpartheyischen Iustitiariis diese unverantwortliche Procedur und dabey gespielte listige Touren klärlich vor Augen gestellet werden + + P_Drucke_VD18 + VD18 11750219 + + ger + + + + VD18 digital + + + + + asn + + Senckenberg + Eraßmus + Senckenberg, Eraßmus + + + + asn + + Agricola + Catharina + Agricola, Catharina + + + + fnd + + Deutsche Forschungsgemeinschaft + + + reformatted digital + 44 S. + + + + + Aktenmäßiger Verlauf famosen Prozesses Hofrat Erasmus Rats Frankfurt Justitiariis + + + CC BY-NC-SA 4.0 International + text + + + + + + + + + + Ursachen so diesen Druck veranlasset + + + + + + + + + + + Endlich Abgetrungene Rechtliche Interims-Defensions-Schrifft ... 
+ + + + + + + + + + + Staatsbibliothek zu Berlin - Preußischer Kulturbesitz + http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000 + http://www.staatsbibliothek-berlin.de + mailto:info@sbb.spk-berlin.de + + + + + + + + + http://www.stabikat.de/DB=1/PPN?PPN=718448162 + http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN718448162 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml new file mode 100644 index 0000000..2e57619 --- /dev/null +++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml @@ -0,0 +1,5610 @@ + + + + doculibtopagexml + 2019-01-08T01:56:06 + 2019-04-11T08:41:58 + + + + + + + + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + ſ + + + + + + + + e + + + + i + + + + n + + + + e + + + + n + + ſeinen + + + + + + u + + + + n + + + + s + + uns + + + + + + b + + + + a + + + + l + + + + d + + bald + + + + + + k + + + + l + + + + e + + + + i + + + + n + + + + e + + kleine + + + + + + H + + + + + + + + g + + + + e + + + + l + + + + , + + Hgel, + + + + + + b + + + + a + + + + l + + + + d + + bald + + + + + + H + + + + + + + + t + + + + t + + + + e + + + + n + + + + , + + Htten, + + + + + + Z + + + + e + + + + l + + + + t + + + + e + + + + n + + Zelten + + + + + + u + + + + n + + + + d + + und + + + + + + b + + + + a + + + + l + + + + d + + bald + + Die ſeinen uns bald kleine Hgel, bald Htten, Zelten und bald + + + + + + + + W + + + + e + + + + + + + + e + + + + n + + Ween + + Ween + + + + + + + + D + + + + e + + + + n + + Den + + + + + + B + + + + l + + + + i + + + + + + + + e 
+ + + + n + + + + , + + Blien, + + + + + + w + + + + e + + + + l + + + + + + + + e + + wele + + + + + + + + + + e + + e + + + + + + d + + + + u + + + + r + + + + + + + + l + + + + a + + + + u + + + + f + + + + e + + + + n + + + + , + + durlaufen, + + + + + + v + + + + o + + + + n + + von + + + + + + w + + + + t + + + + e + + + + n + + + + e + + + + i + + weiten + + + + + + + + + + f + + + + t + + + + e + + + + r + + + + s + + fters + + + + + + v + + + + o + + + + r + + + + z + + + + u + + + + + + + + e + + + + + + + + e + + + + n + + + + . + + vorzueen. + + Den Blien, wele e durlaufen, von weiten fters vorzueen. + + + + + + + + S + + + + i + + + + e + + + + h + + + + t + + Sieht + + + + + + m + + + + a + + + + n + + man + + + + + + e + + + + i + + + + n + + ein + + + + + + ſ + + + + o + + + + l + + + + + + ſol + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + s + + gemhtes + + + + + + F + + + + e + + + + l + + + + d + + + + , + + Feld, + + + + + + v + + + + o + + + + n + + von + + + + + + o + + + + b + + + + e + + + + n + + + + , + + oben, + + Sieht man ein ſol gemhtes Feld, von oben, + + + + + + + + S + + + + o + + So + + + + + + g + + + + l + + + + e + + + + i + + + + + + + + t + + gleit + + + + + + e + + + + s + + es + + + + + + e + + + + e + + + + m + + + + i + + + + n + + einem + + + + + + w + + + + e + + + + i + + + + t + + + + e + + + + n + + weiten + + + + + + M + + + + e + + + + e + + + + r + + + + , + + Meer, + + + + + + w + + + + o + + + + r + + + + a + + + + u + + + + f + + worauf + + + + + + e + + + + r + + + + h + + + + a + + + + b + + + + n + + + + e + + erhabne + + + + + + W + + + + e + + + + + + + + e + + + + n + + Ween + + + + + + t + + + + o + + + + b + + + + e + + + + n + + + + , + + toben, + + So gleit es einem weiten Meer, worauf erhabne Ween toben, + + + + + + + + J + + + + e + + + + d + + + + o + + + + + + Jedo + + + + + + m + + + + i + + + + t + + mit + + + + + + d + + + + i + + + + e + + 
+ + ſ + + + + e + + + + m + + dieſem + + + + + + U + + + + n + + + + t + + + + e + + + + r + + + + ſ + + + + + + + + e + + + + i + + + + d + + + + , + + Unterſeid, + + + + + + d + + + + a + + + + ß + + + + , + + daß, + + + + + + d + + + + a + + da + + + + + + + + + + + +  + + + + + + d + + + + i + + + + e + + die + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + r + + + + + + + + h + + + + r + + + + e + + + + n + + + + : + + rhren: + + Jedo mit dieſem Unterſeid, daß, da  die bendig rhren: + + + + + + + + V + + + + o + + + + n + + Von + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + + + r + + einiger + + + + + + B + + + + e + + + + w + + + + e + + + + g + + + + u + + + + n + + + + g + + Bewegung + + + + + + h + + + + i + + + + e + + + + r + + + + , + + hier, + + + + + + i + + + + n + + in + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + n + + dieſen + + + + + + W + + + + e + + + + + + + + e + + + + n + + + + , + + Ween, + + + + + + n + + + + i + + + + + + + + t + + + + s + + nits + + + + + + z + + + + u + + zu + + + + + + ſ + + + + p + + + + + + + + h + + + + r + + + + e + + + + n + + + + . + + ſphren. + + Von einiger Bewegung hier, in dieſen Ween, nits zu ſphren. + + Die ſeinen uns bald kleine Hgel, bald Htten, Zelten und bald +Ween +Den Blien, wele e durlaufen, von weiten fters vorzueen. +Sieht man ein ſol gemhtes Feld, von oben, +So gleit es einem weiten Meer, worauf erhabne Ween toben, +Jedo mit dieſem Unterſeid, daß, da  die bendig rhren: +Von einiger Bewegung hier, in dieſen Ween, nits zu ſphren. + + + + + + + + + + + D + + + + a + + Da + + + + + + C + + + + a + + + + p + + + + o + + + + . + + Capo. + + Da Capo. + + Da Capo. 
+ + + + + + + + + + + G + + + + e + + + + h + + + + t + + Geht + + + + + + m + + + + a + + + + n + + man + + + + + + a + + + + u + + + + f + + auf + + + + + + e + + + + i + + + + n + + + + e + + + + n + + einen + + + + + + ſ + + + + o + + + + l + + + + + + + + e + + + + n + + ſolen + + + + + + F + + + + e + + + + l + + + + d + + + + e + + + + , + + Felde, + + + + + + ſ + + + + o + + ſo + + + + + + e + + + + b + + + + e + + + + n + + eben + + + + + + e + + + + r + + + + + + er + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + , + + gemht, + + + + + + ſ + + + + p + + + + a + + + + + + + + i + + + + e + + + + r + + + + e + + + + n + + + + , + + ſpaieren, + + Geht man auf einen ſolen Felde, ſo eben er gemht, ſpaieren, + + + + + + + + D + + + + a + + + + s + + Das + + + + + + m + + + + a + + + + n + + man + + + + + + g + + + + e + + + + w + + + + o + + + + h + + + + n + + + + t + + gewohnt + + + + + + v + + + + o + + + + + + vo + + + + + + K + + + + o + + + + r + + + + n + + Korn + + + + + + z + + + + u + + zu + + + + + + ſ + + + + e + + + + h + + + + n + + + + ; + + ſehn; + + + + + + ſ + + + + o + + ſo + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + kommen + + + + + + w + + + + i + + + + r + + wir + + + + + + u + + + + n + + + + s + + uns + + + + + + g + + + + r + + + + + + + + + + + + e + + + + r + + grer + + + + + + f + + + + + + + + r + + + + , + + fr, + + Das man gewohnt vo Korn zu ſehn; ſo kommen wir uns grer fr, + + + + + + + + D + + + + a + + + + s + + Das + + + + + + F + + + + e + + + + l + + + + d + + Feld + + + + + + h + + + + i + + + + n + + + + g + + + + e + + + + g + + + + e + + + + n + + hingegen + + + + + + n + + + + i + + + + e + + + + d + + + + r + + + + i + + + + g + + + + e + + + + r + + + + . + + niedriger. 
+ + + + + + A + + + + u + + + + + + Au + + + + + + n + + + + i + + + + m + + + + m + + + + t + + nimmt + + + + + + ſ + + + + o + + + + d + + + + e + + + + n + + + + n + + ſodenn + + + + + + e + + + + i + + + + n + + ein + + + + + + n + + + + e + + + + u + + + + e + + + + r + + neuer + + + + + + S + + + + + + + + e + + + + i + + + + n + + + + , + + Sein, + + Das Feld hingegen niedriger. Au nimmt ſodenn ein neuer Sein, + + + + + + + + U + + + + n + + + + d + + Und + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + n + + + + e + + + + u + + + + e + + neue + + + + + + F + + + + a + + + + r + + + + b + + + + e + + + + n + + Farben + + + + + + Z + + + + i + + + + e + + + + r + + Zier + + Und eine neue Farben Zier + + + + + + + + D + + + + e + + + + n + + Den + + + + + + e + + + + r + + + + + + er + + + + + + g + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + n + + gemhten + + + + + + A + + + + + + + + e + + + + r + + Aer + + + + + + e + + + + i + + + + n + + + + . + + ein. + + Den er gemhten Aer ein. 
+ + + + + + + + D + + + + e + + + + r + + Der + + + + + + G + + + + r + + + + u + + + + n + + + + d + + Grund + + + + + + i + + + + + + i + + + + + + g + + + + r + + + + + + + + n + + + + , + + grn, + + + + + + d + + + + i + + + + e + + die + + + + + + S + + + + t + + + + o + + + + p + + + + p + + + + e + + + + l + + + + n + + Stoppeln + + + + + + g + + + + e + + + + l + + + + b + + + + , + + gelb, + + + + + + u + + + + n + + + + d + + und + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + + + + + + + + + +  + + + + + + u + + + + n + + + + ſ + + + + r + + + + e + + + + r + + unſrer + + + + + + S + + + + o + + + + n + + + + + + Son⸗ + + Der Grund i grn, die Stoppeln gelb, und wenn  unſrer Son⸗ + + + + + + + + n + + + + e + + + + n + + nen + + + + + + L + + + + i + + + + + + + + t + + Lit + + nen Lit + + + + + + + + A + + + + n + + An + + + + + + i + + + + h + + + + r + + + + e + + ihre + + + + + + r + + + + u + + + + n + + + + d + + + + e + + runde + + + + + + g + + + + l + + + + a + + + + t + + + + t + + + + e + + glatte + + + + + + R + + + + + + + + h + + + + r + + + + e + + + + n + + + + , + + Rhren, + + + + + + z + + + + u + + + + m + + + + a + + + + h + + + + l + + + + e + + + + n + + zumahlen + + + + + + f + + + + r + + + + + + + + h + + frh + + + + + + u + + + + n + + + + d + + und + + + + + + A + + + + b + + + + e + + + + n + + + + d + + + + s + + + + , + + Abends, + + + + + + b + + + + r + + + + i + + + + + + + + t + + + + : + + brit: + + An ihre runde glatte Rhren, zumahlen frh und Abends, brit: + + + + + + + + S + + + + o + + So + + + + + + k + + + + a + + + + n + + + + n + + kann + + + + + + e + + + + i + + + + n + + ein + + + + + + G + + + + o + + + + l + + + + d + + Gold + + + + + + k + + + + a + + + + u + + + + m + + kaum + + + + + + + + + + + + + + r + + + + + + + + e + + + + r + + rer + + + + + + g + + + + l + + + + + + + + n + + + + + + + + e + + + + n + + + + . + + glnen. 
+ + + + + + D + + + + i + + + + e + + + + s + + Dies + + + + + + m + + + + a + + + + + + + + t + + mat + + + + + + e + + + + i + + + + n + + ein + + + + + + l + + + + i + + + + e + + + + b + + + + l + + + + i + + + + + + + + e + + + + s + + lieblies + + So kann ein Gold kaum rer glnen. Dies mat ein lieblies + + + + + + + + G + + + + e + + + + m + + + + i + + + + ſ + + + + + + + + e + + + + , + + Gemiſe, + + Gemiſe, + + + + + + + + Z + + + + u + + + + m + + + + a + + + + h + + + + l + + Zumahl + + + + + + w + + + + e + + + + n + + + + n + + + + , + + wenn, + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + r + + der + + + + + + N + + + + a + + + + + + + + b + + + + a + + + + r + + + + ſ + + + + + + + + a + + + + f + + + + t + + + + , + + Nabarſaft, + + + + + + e + + + + i + + + + n + + ein + + + + + + d + + + + u + + + + n + + + + + + + + e + + + + l + + + + + + + + g + + + + r + + + + + + + + n + + + + e + + + + n + + + + d + + + + e + + + + s + + dunel⸗grnendes + + + + + + G + + + + e + + + + b + + + + + + + + e + + + + , + + + + ſ + + + + + + Gebſe, + + Zumahl wenn, in der Nabarſaft, ein dunel⸗grnendes Gebſe, + + + + + + + + D + + + + e + + + + n + + Den + + + + + + g + + + + e + + + + l + + + + b + + + + e + + + + n + + gelben + + + + + + S + + + + + + + + i + + + + m + + + + m + + + + e + + + + r + + Simmer + + + + + + n + + + + o + + + + + + no + + + + + + e + + + + r + + + + h + + + + + + + + h + + + + t + + + + . + + erhht. + + + + + + W + + + + i + + + + e + + Wie + + + + + + i + + + + + + i + + + + + + n + + + + u + + + + n + + nun + + + + + + j + + + + + + + + n + + + + g + + + + + + + + , + + jng, + + + + + + z + + + + u + + + + r + + zur + + + + + + A + + + + b + + + + e + + + + n + + + + d + + Abend + + + + + + Z + + + + e + + + + i + + + + t + + + + , + + Zeit, + + Den gelben Simmer no erhht. 
Wie i nun jng, zur Abend Zeit, + + + + + + + + D + + + + u + + + + r + + + + + + Dur + + + + + + ſ + + + + o + + ſo + + + + + + v + + + + i + + + + e + + + + l + + viel + + + + + + ſ + + + + + + + + w + + + + e + + + + r + + + + e + + ſwere + + + + + + S + + + + e + + + + e + + + + g + + + + e + + + + n + + + + s + + + + + + + + B + + + + e + + + + r + + + + g + + + + e + + + + , + + Seegens⸗Berge, + + + + + + m + + + + i + + + + t + + mit + + + + + + ſ + + + + a + + + + n + + + + f + + + + t + + + + e + + + + n + + ſanften + + + + + + S + + + + + + + + r + + + + i + + + + t + + + + t + + + + e + + + + n + + + + , + + Sritten, + + + + + + h + + + + i + + + + n + + hin + + + + + + u + + + + n + + + + d + + und + + Dur ſo viel ſwere Seegens⸗Berge, mit ſanften Sritten, hin und + + + + + + + + w + + + + d + + + + , + + + + i + + + + e + + + + e + + + + r + + wieder, + + wieder, + + + + + + + + G + + + + e + + + + r + + + + + + + + h + + + + r + + + + e + + + + t + + Gerhret + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + d + + + + e + + + + s + + des + + + + + + F + + + + e + + + + l + + + + d + + + + e + + + + s + + Feldes + + + + + + S + + + + + + + + m + + + + u + + + + + + + + , + + Smu, + + + + + + g + + + + e + + + + r + + + + + + + + h + + + + r + + + + e + + + + t + + gerhret + + + + + + d + + + + u + + + + r + + + + + + dur + + + + + + d + + + + i + + + + e + + die + + + + + + F + + + + r + + + + u + + + + + + + + t + + + + b + + + + a + + + + r + + + + k + + + + e + + + + i + + + + t + + + + , + + Frutbarkeit, + + Gerhret dur des Feldes Smu, gerhret dur die Frutbarkeit, + + + + + + + + V + + + + e + + + + r + + + + g + + + + n + + + + + + + + g + + + + t + + Vergngt + + + + + + a + + + + u + + + + f + + auf + + + + + + m + + + + e + + + + i + + + + n + + + + e + + + + m + + meinem + + + + + + A + + + + + + + + e + + + + r + + Aer + + + + + + g + + + + i + + + + e + + + + n + + + + g + + + + , + + gieng, + + + + + + e + + 
+ + r + + + + t + + + + + + + + n + + + + t + + + + e + + + + n + + ertnten + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + dieſe + + + + + + m + + + + e + + + + e + + + + i + + + + n + + meine + + + + + + L + + + + i + + + + e + + + + d + + + + e + + + + r + + + + : + + Lieder: + + Vergngt auf meinem Aer gieng, ertnten dieſe meine Lieder: + + Geht man auf einen ſolen Felde, ſo eben er gemht, ſpaieren, +Das man gewohnt vo Korn zu ſehn; ſo kommen wir uns grer fr, +Das Feld hingegen niedriger. Au nimmt ſodenn ein neuer Sein, +Und eine neue Farben Zier +Den er gemhten Aer ein. +Der Grund i grn, die Stoppeln gelb, und wenn  unſrer Son⸗ +nen Lit +An ihre runde glatte Rhren, zumahlen frh und Abends, brit: +So kann ein Gold kaum rer glnen. Dies mat ein lieblies +Gemiſe, +Zumahl wenn, in der Nabarſaft, ein dunel⸗grnendes Gebſe, +Den gelben Simmer no erhht. Wie i nun jng, zur Abend Zeit, +Dur ſo viel ſwere Seegens⸗Berge, mit ſanften Sritten, hin und +wieder, +Gerhret dur des Feldes Smu, gerhret dur die Frutbarkeit, +Vergngt auf meinem Aer gieng, ertnten dieſe meine Lieder: + + + + + + + + + + + + + 1 + + + + 1 + + + + 5 + + 115 + + 115 + + 115 + + + + + + + + + + H + + H + + + + + + 2 + + 2 + + H 2 + + H 2 + + + + + + + + + + + . + + + + A + + + + R + + + + A + + + + I + + ARIA. + + ARIA. + + ARIA. + + + + + + + + + + + A + + + + R + + + + I + + + + A + + + + . + + ARIA. + + ARIA. + + ARIA. 
+ + + + + + + + + + + W + + + + a + + + + s + + Was + + + + + + e + + + + r + + + + h + + + + e + + + + b + + + + t + + erhebt + + + + + + d + + + + e + + + + s + + des + + + + + + S + + + + + + + + + + + + p + + + + f + + + + e + + + + r + + + + s + + Spfers + + + + + + G + + + + + + + + t + + + + e + + Gte + + Was erhebt des Spfers Gte + + + + + + + + M + + + + e + + + + h + + + + r + + + + , + + Mehr, + + + + + + a + + + + l + + + + s + + als + + + + + + d + + + + i + + + + e + + + + ſ + + + + e + + + + s + + dieſes + + + + + + S + + + + e + + + + e + + + + g + + + + e + + + + n + + + + s + + Seegens + + + + + + M + + + + e + + + + e + + + + r + + + + ? + + Meer? + + Mehr, als dieſes Seegens Meer? + + + + + + + + K + + + + o + + + + m + + + + m + + + + t + + Kommt + + + + + + d + + + + i + + + + e + + + + s + + dies + + + + + + w + + + + o + + + + h + + + + l + + wohl + + + + + + v + + + + o + + + + n + + von + + + + + + u + + + + n + + + + g + + + + e + + + + f + + + + e + + + + h + + + + r + + + + ? + + ungefehr? + + Kommt dies wohl von ungefehr? 
+ + + + + + + + N + + + + e + + + + i + + + + n + + + + , + + Nein, + + + + + + r + + + + u + + + + f + + + + t + + ruft + + + + + + m + + + + e + + + + i + + + + n + + mein + + + + + + e + + + + r + + + + f + + + + r + + + + e + + + + u + + + + t + + erfreut + + + + + + G + + + + e + + + + m + + + + + + + + h + + + + t + + + + e + + + + : + + Gemhte: + + Nein, ruft mein erfreut Gemhte: + + + + + + + + N + + + + u + + + + r + + Nur + + + + + + v + + + + o + + + + n + + von + + + + + + G + + + + O + + + + T + + + + T + + GOTT + + + + + + k + + + + o + + + + m + + + + m + + + + t + + kommt + + + + + + a + + + + + + + + e + + + + s + + aes + + + + + + h + + + + e + + + + r + + + + ; + + her; + + Nur von GOTT kommt aes her; + + + + + + + + I + + + + h + + + + m + + Ihm + + + + + + ſ + + + + e + + + + y + + ſey + + + + + + e + + + + i + + + + ß + + + + P + + + + r + + Preiß + + + + + + u + + + + n + + + + d + + und + + + + + + D + + + + a + + + + n + + + + + + Dan + + + + + + u + + + + n + + + + d + + und + + + + + + E + + + + h + + + + r + + + + ! + + Ehr! + + Ihm ſey Preiß und Dan und Ehr! + + Was erhebt des Spfers Gte +Mehr, als dieſes Seegens Meer? +Kommt dies wohl von ungefehr? +Nein, ruft mein erfreut Gemhte: +Nur von GOTT kommt aes her; +Ihm ſey Preiß und Dan und Ehr! 
+ diff --git a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml new file mode 100644 index 0000000..b60d0f7 --- /dev/null +++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml @@ -0,0 +1,289 @@ + + + + OCR-D/core 1.0.0b11 + 2019-08-01T15:03:17.741679 + 2019-08-01T15:03:17.741679 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald + + + + + + „Bellen + + + + + + Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen, + + + + + + Sieht man ein ſolch gemähtes Feld - von oben, + + + + + + Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny + + + + + + Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren: + + + + + + Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren, + + + + Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald +„Bellen +Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen, +Sieht man ein ſolch gemähtes Feld - von oben, +Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny +Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren: +Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren, + + + + + + + + + + + + + + Was erhebt des Schöpfers Güte + + + + + + Mehr , als dieſes Seegens Meer? + + + + + + Kommt dies wohl von ungefehv? + + + + + + Nein , rüſt mein erfreut Gemühte + + + + + + Nur von GOTT komint alles hers + + + + + + Ihm ſey Preiß und Dan und Ehr! + + + + Was erhebt des Schöpfers Güte +Mehr , als dieſes Seegens Meer? +Kommt dies wohl von ungefehv? +Nein , rüſt mein erfreut Gemühte +Nur von GOTT komint alles hers +Ihm ſey Preiß und Dan und Ehr! 
+ + + + + + + + Da Capo, + + + + Da Capo, + + + + + + + + Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny + + + + + + Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für, + + + + + + Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz + + + + + + Und eine neue Farben Zier + + + + + + Den erſt gemähten Aker ein, + + + + + + Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son- + + + + + + nen B;Of + + + + + + Un ihre runde glatte Röhren , zumahlen früh und Abends bricht; + + + + + + So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches + + + + + + Gemiſche, | + + + + + + Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche + + + + + + Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif, + + + + + + Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und + + + + + + Wieder; + + + + + + Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz + + + + + + Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder: + + + + Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny +Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für, +Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz +Und eine neue Farben Zier +Den erſt gemähten Aker ein, +Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son- +nen B;Of +Un ihre runde glatte Röhren , zumahlen früh und Abends bricht; +So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches +Gemiſche, | +Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche +Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif, +Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und +Wieder; +Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz +Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder: + + + + + + + + 5) 2 + + + + + + ARIA. + + + + 5) 2 +ARIA. 
+ + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml new file mode 100644 index 0000000..c28161b --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml @@ -0,0 +1,47 @@ + + + + + 2019-07-26T13:59:00 + 2019-07-26T14:00:29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt +ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo +dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit +amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor +invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et +justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum +dolor sit amet. 
diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml new file mode 100644 index 0000000..1fd8377 --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml @@ -0,0 +1,139 @@ + + + + pixel + + + + + + + tesseract 4.1.0-rc4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf new file mode 100644 index 0000000..da97e0e Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif new file mode 100644 index 0000000..42b3d23 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml new file mode 100644 index 0000000..c28161b --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml @@ -0,0 +1,47 @@ + + + + + 2019-07-26T13:59:00 + 2019-07-26T14:00:29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt +ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo +dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit +amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor +invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et +justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum +dolor sit amet. diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml new file mode 100644 index 0000000..d4a79a0 --- /dev/null +++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml @@ -0,0 +1,138 @@ + + + + pixel + + + + + + + tesseract 4.1.0-rc4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf new file mode 100644 index 0000000..38564d7 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif new file mode 100644 index 0000000..39f11d6 Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif differ diff --git a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt new file mode 100644 index 0000000..ce93bfd Binary files /dev/null and b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt differ diff --git a/qurator/dinglehopper/tests/data/mixed-regions.page.xml 
b/qurator/dinglehopper/tests/data/mixed-regions.page.xml new file mode 100644 index 0000000..0e2a117 --- /dev/null +++ b/qurator/dinglehopper/tests/data/mixed-regions.page.xml @@ -0,0 +1,290 @@ + + + + OCR-D/core 1.0.0b19 + 2019-09-26T11:59:19.519140 + 2019-09-26T11:59:19.519140 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + phariſei hypocritæ, qui comeditis domos uiduarã ſub + + + + + + prætextu longarum precationum, propterea maiorẽ + + + + + + accipieris condemnationem. Ideo enim ꝙ non oratis + + + + + + ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu + + + + + + & ueritate ſed iuxta ueſtram propriam conſtitutionẽ, + + + + + + orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗ + + + + + + audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras + + + + + + non exaudiam uos. Chriſtiani uero quia orant iuxta + + + + + + tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗ + + + + + + rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗ + + + + + + mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos + + + + + + autem hoc tenore orandi contempto, obmur muratis + + + + + + ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗ + + + + + + chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗ + + + + + + cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗ + + + + + + ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗ + + + + + + tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet + + + + + + in uobis, eas ſine dubio omitteretis. + + + + + + De inuocatione diuorum ne apiculus quidem ha + + + + + + betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗ + + + + + + catis ſanctos, cum ex præce pto Dei ne mo inuocandus + + + + + + ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗ + + + + + + nis. & eruam te, & honorificabis me. Et omnis qui⸗ + + + + + + cumq; inuocauerit nomen domini, ſaluus erit Sed + + + + + + quomodo inuocabitis, in quem non credidiſtis? Quo + + + + + + modo credetis ſine uerbo ? 
Inuocationẽ ergo in ſcrip⸗ + + + + + + turis non legitis cõmemorationem uero ſæpe, non ut + + + + + + intercedant pro uobis ſancti, ſed nt meminerit Deus + + + + + + Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗ + + + + + + biſcum agat per miſericordiam, quemadmodum cum + + + + + + ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum + + + + + + ſuæ miſericordiæ & promiſsionis admonere Sic pſal + + + + + + mographus dicit, Qui paſcis Iſrael attende, qui de⸗ + + + + + + ducis uelut ouem Iacob Sic & Moſes orat, Memento + + + + + + B 3 domi⸗ + + + + phariſei hypocritæ, qui comeditis domos uiduarã ſub +prætextu longarum precationum, propterea maiorẽ +accipieris condemnationem. Ideo enim ꝙ non oratis +ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu +& ueritate ſed iuxta ueſtram propriam conſtitutionẽ, +orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗ +audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras +non exaudiam uos. Chriſtiani uero quia orant iuxta +tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗ +rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗ +mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos +autem hoc tenore orandi contempto, obmur muratis +ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗ +chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗ +cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗ +ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗ +tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet +in uobis, eas ſine dubio omitteretis. +De inuocatione diuorum ne apiculus quidem ha +betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗ +catis ſanctos, cum ex præce pto Dei ne mo inuocandus +ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗ +nis. & eruam te, & honorificabis me. Et omnis qui⸗ +cumq; inuocauerit nomen domini, ſaluus erit Sed +quomodo inuocabitis, in quem non credidiſtis? Quo +modo credetis ſine uerbo ? 
Inuocationẽ ergo in ſcrip⸗ +turis non legitis cõmemorationem uero ſæpe, non ut +intercedant pro uobis ſancti, ſed nt meminerit Deus +Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗ +biſcum agat per miſericordiam, quemadmodum cum +ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum +ſuæ miſericordiæ & promiſsionis admonere Sic pſal +mographus dicit, Qui paſcis Iſrael attende, qui de⸗ +ducis uelut ouem Iacob Sic & Moſes orat, Memento +B 3 domi⸗ + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/order.page.xml b/qurator/dinglehopper/tests/data/order.page.xml new file mode 100644 index 0000000..a1e058f --- /dev/null +++ b/qurator/dinglehopper/tests/data/order.page.xml @@ -0,0 +1,4204 @@ + + + + doculibtopagexml + 2018-11-20T05:00:14 + 2019-04-17T10:47:36 + + + + + + + + + + + + + + + + + + + + 7 + + + + 5 + + + + . + + 75. + + 75. + + + + + + + + E + + + + t + + + + w + + + + a + + + + s + + Etwas + + + + + + f + + + + r + + + + + + fr + + + + + + W + + + + i + + + + t + + + + t + + + + w + + + + e + + + + n + + + + . + + Wittwen. + + Etwas fr Wittwen. + + 75. +Etwas fr Wittwen. + + + + + + + + + + + 7 + + + + 6 + + + + . + + 76. + + + + + + D + + + + i + + + + e + + Die + + 76. Die + + 76. Die + + + + + + + + + + + m + + + + e + + + + n + + + + . + + men. + + + + + + D + + + + e + + + + n + + Den + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + n + + andern + + + + + + T + + + + a + + + + g + + Tag + + + + + + s + + + + + + + + i + + + + e + + + + n + + + + e + + + + r + + ersien + + + + + + d + + + + e + + + + r + + der + + + + + + e + + + + i + + + + n + + + + g + + + + e + + + + l + + + + a + + + + d + + + + e + + + + n + + + + e + + eingeladene + + men. 
Den andern Tag ersien der eingeladene + + + + + + + + b + + + + i + + + + s + + bis + + + + + + d + + + + r + + + + e + + + + y + + drey + + + + + + T + + + + a + + + + g + + + + e + + Tage + + + + + + h + + + + i + + + + n + + + + t + + + + e + + + + r + + + + e + + + + i + + + + n + + + + a + + + + n + + + + d + + + + e + + + + r + + hintereinander + + + + + + j + + + + e + + + + d + + + + e + + + + s + + + + m + + + + a + + + + l + + jedesmal + + + + + + z + + + + u + + + + m + + zum + + + + + + M + + + + i + + + + t + + + + + + Mit⸗ + + bis drey Tage hintereinander jedesmal zum Mit⸗ + + + + + + + + G + + + + a + + + + + + Ga + + + + + + m + + + + i + + + + t + + mit + + + + + + d + + + + e + + + + n + + den + + + + + + S + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + + + n + + + + , + + Seinigen, + + + + + + u + + + + n + + + + d + + und + + + + + + k + + + + a + + + + m + + kam + + + + + + n + + + + a + + + + + + + + h + + + + e + + + + r + + naher + + + + + + z + + + + w + + + + e + + + + y + + zwey + + Ga mit den Seinigen, und kam naher zwey + + + + + + + + t + + + + a + + + + g + + + + s + + + + e + + + + + + + + e + + + + n + + + + . + + tagseen. + + + + + + D + + + + e + + + + r + + Der + + + + + + a + + + + n + + + + d + + + + e + + + + r + + + + e + + andere + + + + + + h + + + + i + + + + e + + + + r + + + + + + + + b + + + + e + + + + r + + + + , + + hierber, + + + + + + u + + + + n + + + + d + + und + + + + + + w + + + + u + + + + n + + + + d + + + + e + + + + r + + + + t + + + + e + + wunderte + + + + + + + + + + + +  + + tagseen. 
Der andere wunderte  hierber, und + + + + + + + + H + + + + e + + + + r + + + + r + + Herr + + + + + + K + + + + o + + + + n + + + + f + + + + r + + + + a + + + + t + + + + e + + + + r + + Konfrater + + + + + + w + + + + i + + + + r + + + + d + + + + , + + wird, + + + + + + n + + + + e + + + + b + + + + + + neb + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + l + + + + i + + + + e + + + + b + + + + e + + + + n + + lieben + + + + + + F + + + + a + + + + m + + + + i + + + + l + + + + i + + + + e + + + + , + + Familie, + + Herr Konfrater wird, neb ſeiner lieben Familie, + + + + + + + + H + + + + e + + + + r + + + + r + + + + n + + Herrn + + + + + + K + + + + o + + + + n + + + + f + + + + r + + + + a + + + + t + + + + e + + + + r + + + + , + + Konfrater, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + s + + es + + + + + + i + + + + h + + + + m + + ihm + + + + + + z + + + + w + + + + a + + + + r + + zwar + + + + + + ſ + + + + a + + + + g + + + + e + + + + t + + + + e + + ſagete + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + m + + ſeinem + + ſagete ſeinem Herrn Konfrater, daß es ihm zwar + + + + + + + + v + + + + o + + + + m + + vom + + + + + + H + + + + e + + + + r + + + + z + + + + e + + + + n + + Herzen + + + + + + w + + + + e + + + + n + + + + n + + wenn + + + + + + e + + + + r + + er + + + + + + i + + + + h + + + + n + + ihn + + + + + + b + + + + e + + + + y + + bey + + + + + + + + + + + +  + + + + + + a + + + + n + + + + g + + + + e + + + + n + + + + e + + + + h + + + + m + + angenehm + + + + + + ſ + + + + y + + + + e + + + + , + + ſey, + + vom Herzen angenehm ſey, wenn er ihn bey  + + + + + + + + ſ + + + + o + + ſo + + + + + + o + + + + f + + + + t + + + + e + + ofte + + + + + + h + + + + a + + + + b + + + + e + + + + . + + habe. 
+ + + + + + D + + + + e + + + + r + + Der + + + + + + G + + + + a + + + + + + Ga + + + + + + a + + + + n + + + + t + + + + w + + + + o + + + + r + + + + t + + + + e + + + + t + + + + e + + + + , + + antwortete, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + ſo ofte habe. Der Ga antwortete, daß er + + + + + + + + e + + + + s + + es + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + , + + komme, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + + + + + j + + + + e + + + + + + + + t + + jet + + + + + + d + + + + i + + + + e + + die + + + + + + E + + + + h + + + + r + + + + e + + Ehre + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + s + + ſeines + + + + + + Z + + + + u + + + + ſ + + + + p + + + + r + + + + u + + + + + + + + s + + Zuſprus + + es komme, daß er jet die Ehre ſeines Zuſprus + + + + + + + + b + + + + e + + + + i + + + + r + + + + t + + + + h + + + + e + + + + n + + + + w + + bewirthen + + + + + + e + + + + r + + er + + + + + + i + + + + n + + + + z + + + + w + + + + i + + + + ſ + + + + + + + + e + + + + n + + inzwiſen + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + o + + + + h + + + + e + + + + r + + + + w + + woher + + + + + + w + + + + i + + + + + + + + e + + wie + + + + + + k + + + + + + + + n + + + + n + + + + e + + + + ; + + knne; + + bewirthen knne; er wie inzwiſen nit, woher + + + + + + + + t + + + + + + + + g + + + + l + + + + i + + + + + + tgli + + + + + + a + + + + n + + an + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + T + + + + h + + + + + + + + r + + + + e + + Thre + + + + + + a + + + + n + + + + g + + + + e + + + + ſ + + + + + + + + r + + + + i + + + + e + + + + b + + + + e + + + + n + + angeſrieben + + + + + + + + + + n + + + + d + + + + e + + + + , + + finde, + + + + + + m + + + + o + + + + r + + + + + + mor⸗ + + tgli an ſeiner Thre angeſrieben finde, mor⸗ + + + + + + + + g + + 
+ + e + + + + n + + gen + + + + + + z + + + + u + + zu + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + . + + kommen. + + gen zu kommen. + + + + + + + + g + + + + e + + + + b + + + + e + + + + t + + + + e + + + + n + + + + , + + gebeten, + + + + + + m + + + + o + + + + r + + + + g + + + + e + + + + n + + morgen + + + + + + z + + + + u + + zu + + + + + + k + + + + o + + + + + + + + m + + kom⸗ + + + + + + z + + + + u + + zu + + + + + + m + + + + i + + + + r + + mir + + + + + + z + + + + u + + + + m + + zum + + + + + + M + + + + i + + + + a + + + + g + + + + s + + + + e + + + + + + + + e + + + + n + + + + t + + + + t + + Mittagseen + + gebeten, morgen zum Mittagseen zu mir zu kom⸗ + + Herr Konfrater wird, neb ſeiner lieben Familie, +gebeten, morgen zum Mittagseen zu mir zu kom⸗ +men. Den andern Tag ersien der eingeladene +Ga mit den Seinigen, und kam naher zwey +bis drey Tage hintereinander jedesmal zum Mit⸗ +tagseen. Der andere wunderte  hierber, und +ſagete ſeinem Herrn Konfrater, daß es ihm zwar +vom Herzen angenehm ſey, wenn er ihn bey  +bewirthen knne; er wie inzwiſen nit, woher +es komme, daß er jet die Ehre ſeines Zuſprus +ſo ofte habe. Der Ga antwortete, daß er +tgli an ſeiner Thre angeſrieben finde, mor⸗ +gen zu kommen. + + + + + + + + + + + + + 7 + + + + 9 + + 79 + + 79 + + 79 + + + + + + + + + + + H + + + + a + + + + n + + + + d + + + + , + + Hand, + + + + + + M + + + + y + + + + l + + + + o + + + + r + + + + d + + + + ? + + Mylord? + + + + + + f + + + + r + + + + a + + + + g + + + + t + + + + e + + fragte + + + + + + d + + + + e + + + + r + + der + + + + + + G + + + + r + + + + a + + + + f + + Graf + + + + + + v + + + + o + + + + n + + von + + + + + + R + + + + o + + + + + + + + e + + + + + + + + e + + + + r + + + + . + + Roeer. + + Hand, Mylord? fragte der Graf von Roeer. 
+ + + + + + + + A + + + + l + + + + s + + Als + + + + + + e + + + + r + + er + + + + + + e + + + + i + + + + n + + + + m + + + + a + + + + l + + + + s + + + + s + + einsmals + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + m + + dem + + + + + + O + + + + b + + + + e + + + + r + + + + h + + + + a + + + + u + + + + ſ + + + + e + + Oberhauſe + + + + + + w + + + + e + + + + + + we⸗ + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + B + + + + i + + + + + + Bi + + Als er einsmals in dem Oberhauſe eine Bi we⸗ + + + + + + + + b + + + + e + + + + + + + + + + + + n + + + + d + + + + i + + + + g + + bendig + + + + + + d + + + + i + + + + e + + die + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + H + + + + a + + + + n + + + + d + + Hand + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + n + + den + + + + + + H + + + + o + + + + ſ + + + + e + + + + n + + Hoſen + + + + + + h + + + + a + + + + t + + + + t + + + + e + + + + . + + hatte. + + bendig die eine Hand in den Hoſen hatte. 
+ + + + + + + + i + + + + n + + + + E + + Ein + + + + + + g + + + + e + + + + w + + + + i + + + + + + + + e + + + + r + + gewier + + + + + + L + + + + o + + + + r + + + + d + + Lord + + + + + + h + + + + a + + + + t + + + + t + + + + e + + hatte + + + + + + d + + + + i + + + + e + + die + + + + + + G + + + + e + + + + w + + + + o + + + + h + + + + n + + + + h + + + + e + + + + i + + + + t + + + + , + + Gewohnheit, + + + + + + d + + + + a + + + + ß + + daß + + + + + + e + + + + r + + er + + Ein gewier Lord hatte die Gewohnheit, daß er + + + + + + + + g + + + + e + + + + n + + gen + + + + + + V + + + + e + + + + r + + + + ſ + + + + o + + + + r + + + + g + + + + u + + + + n + + + + g + + Verſorgung + + + + + + a + + + + r + + + + m + + + + e + + + + r + + armer + + + + + + O + + + + + + + + i + + + + c + + + + i + + + + e + + + + r + + + + w + + + + i + + + + w + + + + e + + + + n + + + + t + + + + t + + Officierwittwen + + + + + + e + + + + i + + + + n + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + einbringen + + gen Verſorgung armer Officierwittwen einbringen + + + + + + + + w + + + + o + + + + + + + + t + + + + e + + + + , + + wote, + + + + + + ſ + + + + o + + ſo + + + + + + + + + + b + + + + e + + + + r + + + + r + + + + e + + + + i + + + + + + + + t + + + + e + + berreite + + + + + + e + + + + r + + er + + + + + + + + + + e + + + + , + + e, + + + + + + i + + + + n + + + + d + + + + e + + + + m + + indem + + + + + + e + + + + r + + er + + + + + + d + + + + i + + + + e + + die + + + + + + e + + + + i + + + + n + + + + e + + eine + + + + + + H + + + + a + + + + n + + + + d + + Hand + + wote, ſo berreite er e, indem er die eine Hand + + + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + n + + den + + + + + + H + + + + o + + + + ſ + + + + e + + + + n + + + + , + + Hoſen, + + + + + + u + + + + n + + + + d + + und + + + + + + i + + + + n + + in + + + + + + d + + + + e + + + + r + + der + + + + + + a + + + + n + + + + d + 
+ + + e + + + + r + + + + n + + andern + + + + + + H + + + + a + + + + n + + + + d + + Hand + + + + + + d + + + + i + + + + e + + die + + + + + + B + + + + i + + + + + + Bi + + in den Hoſen, und in der andern Hand die Bi + + + + + + + + + + + + i + + i + + + + + + e + + + + t + + + + w + + + + a + + + + s + + etwas + + + + + + f + + + + + + + + r + + fr + + + + + + a + + + + r + + + + m + + + + e + + arme + + + + + + O + + + + + + + + i + + + + c + + + + i + + + + e + + + + r + + + + w + + + + i + + + + w + + + + e + + + + n + + + + t + + + + t + + + + . + + Officierwittwen. + + + + + + J + + + + n + + Jn + + + + + + w + + + + e + + + + l + + + + + + + + e + + + + r + + weler + + i etwas fr arme Officierwittwen. Jn weler + + + + + + + + h + + + + a + + + + t + + + + t + + + + e + + + + . + + hatte. + + + + + + e + + + + r + + + + : + + er: + + + + + + h + + + + a + + + + b + + + + e + + habe + + + + + + H + + + + i + + + + e + + + + r + + + + b + + + + e + + + + y + + Hierbey + + + + + + ſ + + + + a + + + + g + + + + e + + + + t + + + + e + + ſagete + + + + + + H + + + + i + + + + e + + + + r + + + + , + + Hier, + + + + + + M + + + + l + + + + o + + + + r + + + + d + + + + s + + + + , + + + + y + + Mylords, + + hatte. Hierbey ſagete er: Hier, Mylords, habe + + Ein gewier Lord hatte die Gewohnheit, daß er +bendig die eine Hand in den Hoſen hatte. +Als er einsmals in dem Oberhauſe eine Bi we⸗ +gen Verſorgung armer Officierwittwen einbringen +wote, ſo berreite er e, indem er die eine Hand +in den Hoſen, und in der andern Hand die Bi +hatte. Hierbey ſagete er: Hier, Mylords, habe +i etwas fr arme Officierwittwen. Jn weler +Hand, Mylord? fragte der Graf von Roeer. 
+ diff --git a/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml new file mode 100644 index 0000000..0e62647 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. — + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. 
+ + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + + glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + 
+ + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. 
— Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verfproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augembli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. — + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. 
— + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test-gt.page2018.xml b/qurator/dinglehopper/tests/data/test-gt.page2018.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test-gt.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test.alto1.xml b/qurator/dinglehopper/tests/data/test.alto1.xml new file mode 100644 index 0000000..ac2a50b --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto1.xml @@ -0,0 +1,20186 @@ + + + + inch1200 + + + \\libdpsrv1\storage\root\projects\NDNP\win_19101017-19101231\win_19101017-19101231-scan-1-\ocr\0070.tif + + + + + useAbbyy4:0 + dictionaryFlag.D:American-English + ImgPrep Component Count High Estimate:31557 + useCaere:0 + Abbyy6OCREngine Character Error Ratio:0.1398 + splitwords:0 + ScansoftOCREngine Character Count:21156 + conjoinWords:1 + dictionaryOn:1 + IrisOCREngine Character Count:24661 + Abbyy6OCREngine Character Count:24501 + Abbyy6OCREngine STAT BLOCK:STATBLOCK_Abbyy6OCREngine;24501;3425;43.9106;93.2252 + language:en + ScansoftOCREngine Predicted Accuracy:65.12% + page-reoriented:UP + loadfromfile:true + suppressPunctuation:false + multipleEngineWeight:0 + suggestionCount:1 + Node Count:8562 + ScansoftOCREngine Character Error Ratio:0.4152 + spawned:1 + IrisOCREngine Character Error Ratio:0 + IrisOCREngine STAT BLOCK:STATBLOCK_IrisOCREngine;24661;0;0;0 + cachePath:/jobq/caches/newpah_legacy + monkeyTimeout:1800 + Predicted Word Accuracy:93.23% + lexicondirectory:\jobq\caches\lex\wintertree\ + text-orientation:UP + IrisOCREngine Predicted Accuracy:0% + verboseOutput:false + 
trimwords:1 + Abbyy6OCREngine Predicted Accuracy:93.23% + ImgPrep Component Range:31451,31557 + version:Newpah v2.07 Apocalypse in 7/4 + ScansoftOCREngine STAT BLOCK:STATBLOCK_ScansoftOCREngine;21156;8785;0.770562;65.1163 + noPunctuation:1 + ImgPrep Component Count Low Estimate:31451 + DictionaryFlagsUsed:D + configFile:/jobq/caches/newpah_legacy/config/newpah_acwi_x.xml + + + iArchives + Newpah + v2.07 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Jails + + + + + + TIIE + TIE + + + + + + + + + + + + + + + inlurie + inure + + + + W1S + ams + sis + aims + + + + mncle + manacle + + + + Ik + Ir + + + + Reynold + + + + + + + + + pliitlist + littlest + + + + aftei + + + + tlle + elite + tulle + + + + exalnina + examine + exanimate + + + + + + + + ti1n + ion + + + + al11lC1l11wcd + auuoluleed + + + + Hie + tile + Hide + + + + + + lCllI1aJwl1tly + pern1nuentl + + + + + + + + 11IlJII1cd + imijIlVCl + + + + jid + id + + + + + + + + + + goneh + + + + + + + hilt + bat + + + + + + lie + + + + + + lint + + + + + + + + + + + + + + + + + + + hilds + child + holds + + + + appearal1ec + appeuuuc + + + + + + + + + XIIIis + Morris + + + + entile + entitle + + + + hucl + huyer + hull + + + + tints + + + + + + + + + weekend + + + + + + m1ny + + + + + + + + + anwnA + auung + annA + faun + + + + + + l5n + 150 + + + + + + or + + + + JlIlI1eS + Jaines + Gaines + + + + + + + Bigstaf + Briggs + + + + avelage + + + + + + + + pOllnd + pollen + + + + + + + + + G + + + + + + 10 + 150 + lobo + + + + or + + + + + + 1 + + + + lIighltnel + Ilighhutd + lighten + + + + + + + iverlge + diverge + + + + weigh + + + + + + pou1d + pottn1 + + + + + + + + + + + ancl + 8111 + encl + + + + GII + + + + + + Hcl1IY + Ilenly + Silently + + + + + + lIall + hail + leally + + + + 1el + + + + + + agc + ac + + + + + + 14111 + + + + + + bounds + + + + + + 6 + G + + + + ccnts + accents + + + + + 1lwfl + + + + cantle + + + + Wlrc + Welch + + + + honght + Hong + + + + 
rm + + + + flu + + + + Iltl + Ill + + + + + It'll + + + + + timole + tlnldre + timor + insole + landed + + + + + + + + verc + veer + + + + + + tll + Cu + tall + + + + + + + 1olris + Mnrizs + Minis + + + + purchasecl + purchase + + + + + + OB + + + + + + + purebased + pUlcohased + purebred + + + + + + + + + + + + + + cattlc + + + + + ncmge + avemage + income + avenge + + + + 1100 + + + + pound + ponnds + ponds + + + + att + + + + lhe + he + + + + lrcail + lyrical + + + + + ing + King + + + + pries + + + + lllCrc + + + + arc + + + + + + dICHlt + snout + ditch + + + + IOOn + 1001 + IOn + + + + + + + 10 + + + + rtlw + Elbe + rattle + + + + txPwt + + + + nutrkct + Utrecht + + + + unolcl + Unocal + + + + + + + lltis + Allis + + + + eOllnty + felinity + + + + + countywide + + + + + + + + + + + + + + 01 + tkL + TLC + + + + 3lTlle + nTh + + + + + + + husincss + huskiness + + + + + + + + + + + + + Lexiugion + + + + + + ninny + + + + + + + + + + + omplettl + coIUIfeted + completely + cloudiest + + + + M1on1UV + + + + + + + + finn + lna + ulna + + + + + paicr + pair + + + + + + + + + + + + 01 + + + + time + + + + Centra + + + + + lientueky + alienate + + + + rohareo + threw + O'Hare + + + + + + Con + + + + + pamiy + panky + amity + + + + n1ll + + + + ue + + + + signel + signal + + + + arfolcling + aceo11II + refilling + + + + + + + + + welJnnthcntientlcl + ellauthenticated + + + + + + cullnt + enrrent + Cullen + errant + + + + Snt + S8t + Sent + + + + + nrday + hurdy + nerdy + + + + + + Trite + + + + derl + ler1 + del + + + + inYohci + ii1VOlcs + Kiyoshi + + + + + + + cxhange + exehange + change + + + + oi + + + + + + 9S000 + + + + + + + + + + + + + + l1Icl + 8111 + + + + + + bfcn + bf + + + + mulcr + mulct + + + + consielert + consider + consulter + + + + + + + + + + ion + + + + + + eCnl + eNcl + + + + + + past + + + + + fhc + hoc + + + + pl1IC1tnSCIB + pnre1Iase + + + + + + + + ullling + + + + + + + + + + + largcst + + + + + + walc + wail + + + + + + + + + + + + + + + + + 
Loniile + Lenities + + + + + + + WHrchons + Horehounds + + + + Compan + + + + of + + + + + + + whih + + + + OWI1S + onus + + + + cight + + + + + + tho + thou + + + + + + + + twe1c + + + + similnl + swimmingly + + + + mouses + mousers + + + + + + busincs + busniess + buskins + + + + + + + thlt + halt + + + + eity + deity + + + + nnd + nd + + + + whicht + + + + + + + + ns + + + + + + + + + ltle4 + + + + warehol1se + tvarehmise + starchier + + + + enmpuy + empty + + + + + + + + + + + t11aeo + + + + + + + + + + + + + + + + + 1ollals + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Steck + Stcck + Stack + + + + + + Gcod + Cod + + + + + + + + + + + + + Kenry + Kerry + + + + + + + + + YtItCl + 11tlter + Wailer + Title + + + + Benn + Beata + Benin + Bata + + + + + + pnrchlsed + purchased + + + + + + + + + Hcnry + 1Ienry + + + + Phillip + + + + + + + + 11lc1 + + + + + + + i7ine + + + + statuI + st8nl + statue + + + + non + + + + + + + + Phimp + 1hilipS + Pimp + + + + + + + Irng + Airing + + + + storl + stol + + + + + + + + + + + + + sion + scion + + + + + + morninA + + + + rte + + + + + + + + + + + tho + thou + + + + + + paper + + + + + + 11l1gazincs + niagazinis + + + + + + + Mimi + + + + Pltillilis + Policlinics + + + + hnl + hl + + + + condncted + emuluCte1 + + + + + + + + + + + 1nuy + + + + year + + + + + + tins + + + + + + + + + + + jood + Jody + + + + pattonagl + atonal + + + + + + + + Bcnl + BBC + + + + + + + + + + + + + kClP + kcal + + + + + + 10 + + + + inercase + interclass + + + + lime + + + + + + + + + + + itid + pitied + + + + + + priyaic + prismatic + + + + + + + NEW + NI + + + + SECRHARY + SCRIARY + SECTARY + CRAY + + + + + + + r + + + + + TIlE + + + + + + + + + + + + + + + + + + + + SIJhar + Spar + Sitar + + + + + + + + Madcl + Madly + + + + + + + anil + anile + + + + + + + + + + I + + + + + + + Churched + + + + + + + morninA + + + + ml + + + + + + H + + + + SplInt + Spar + + + + + + + vas + + + + elccted + + + + sCllctary + 
oscillatory + + + + + + tlC1FIUll + + + + + + + 0f + + + + thc + + + + Firsl + Firs + + + + Baptisl + + + + + + + + itl + + + + + + + + + + + + + + + + + + + Er1p + Escarp + + + + wll + win + wall + + + + 10 + + + + + + Rigncd + Rind + + + + + + + + + + + + + + IHE + HE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ChuIch + Chichi + + + + + + + + + + + + ncw + near + nw + + + + lurlUtee + lulu + + + + + + + + instnllcd + + + + ill + iii + + + + + 11Ie + + + + + + + + churdl + churl + + + + + + + + 01U + emir + + + + + 0 + + + + the + + + + + + impIocd + implied + + + + + + + + + + + + + UlldJih1i1 + tliebuilding + libidinal + + + + bettel + betel + + + + limn + + + + tlu + tlir + talus + lair + + + + + + + 011e + + + + 1Je + Tme + Tome + + + + wrk + + + + + + imtstrlling + installing + mottling + + + + tit + tho + thou + + + + + IllatinA + Illation + + + + + + ivlil + will + livelily + + + + be + + + + ompletcl + completely + + + + he + + + + + + tore + + + + + + + + whilh + whieh + whirl + + + + + + + + + + + + cltlltl + + + + hy + + + + Ihy + Hay + + + + + + 11 + + + + + + + + Lox + Alex + + + + + iUgton + Kingston + rigatoni + + + + hegins + + + + + + + + + + + + + + + + + + + + + + HORSED + + + + + + + + + + + + + + + + mlc + mc + + + + + + fontana + lontalla + flotilla + + + + horse + + + + + + + + + + + + 31t + + + + + + + + Whartou + Whnrton + + + + + + + + uril1Y + hurdy + + + + + + tl1 + + + + Broadwav + + + + stockyard + + + + + + + slot + + + + + + hut + + + + tix + ix + + + + + + + + war + + + + + 50111 + + + + Thirtninc + Thirty + Titanic + + + + + + offercd + + + + fOl + foci + foal + + + + + + + nid + 1111 + Enid + + + + Ihosc + Hoc + + + + + + + + + + hrought + brough + + + + + price + + + + rangin + + + + + + + + + + + + + + + + + + + + Hon1d + + + + + + 3uctioncer + + + + + + + + + it + + + + + + + + ourthouser + + + + + + + + sley + sllY + sly + + + + amid + + + + Banns + + + + + + 01 + cld + clad + + + + Beckley + + + + + + + 
+ + + + + + + + + + Becknerville + Becker + Belleville + + + + tom + + + + + It + + + + + + Sconce + + + + + + + + opera + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IIousl + IOUs + + + + whre + twhere + whore + + + + + + spnt + + + + 8aitmlty + + + + + + + + + + + atHi + anti + Kathie + + + + Sandty + Sandy + + + + Mlorgm + Mellor + + + + mH1 + and + + + + + + + + + 1JIy + Cry + + + + whu + twho + hub + + + + Ire + lrc + rc + + + + huth + hutch + + + + ahont + Mahomet + + + + Clio + + + + tell + + + + unt + aunt + + + + + + + 8 + + + + + + + + nutttee + + + + + + + + + wns + awns + + + + cmployud + compound + + + + + + + + + + + whcn + + + + + + eune + equine + + + + + + time + + + + cttJilll + + + + timc + iinu + tic + in + + + + + + + Caiy + Cay + + + + letueel + lettuce + + + + + + Jay + pad + + + + 1lig1I + + + + walnut + + + + Ill + + + + + + + thollgh + though + + + + + + clue + + + + hind + + + + foran + oran + + + + + + + + + + steadstead + steadied + + + + + + 1eJeclin + procoedimtg + + + + + + + + nj + + + + + tn + td + + + + + + + + + + stirted + storied + + + + + + + + + whieh + + + + cast + + + + hint + + + + + + + + iii + + + + + + + ourt + court + ort + + + + + + + Goy + + + + anc1 + + + + Dottily + + + + Bales + + + + + + + Maher + + + + occludes + + + + 111 + + + + + + Oer + Boer + + + + nnwcl + unwell + + + + + IllUtHJ + mounters + Oilcloth + Mounties + + + + + + I1ftcl1lOin + ftflemvoon + + + + + + + + + Gny + Gunny + + + + thupst + alamps + thrust + Alamos + + + + + + bit + + + + + + Butcs + Buts + + + + + + + + + Balds + Bald + + + + wno + iis + wino + is + + + + Jren + Jean + + + + + + tlte + tilted + + + + icromarr + micrometer + + + + lion + + + + + + + eWI + WI + + + + + + heiclcs + chemicals + + + + + + + + carry + + + + + hael + heal + + + + iiI + + + + 110 + + + + twill + + + + h1e + + + + + + + + + + + + + + + + + Inllills + Mulleins + Instills + + + + Noose + + + + + + + + + + + + + + 30000 + + + + + Jft + 1a 
+ + + + + + H + + + + CUl1lingham + unningham + + + + + + + solcl + soc + + + + + + + + + + lnJ + nJ + + + + + + + + + + Jmwtion + Jnuetiou + Inaction + Jamestown + Neutrino + + + + + + + Ihout + Shout + + + + + + + + + + Ml + + + + + Alhlt + therm + Allot + thermo + + + + Stofel + Solver + Steel + + + + or + + + + Ifontgomneiy + Fronton + + + + + + + + + + + + NFVf + + + + PUY + PLUMY + + + + DY + + + + + + + II + + + + COLLEGE + + + + + + + + + + + + + + + + + + + + Novemher + + + + Untie + Under + + + + + + + + + + + + + Dahl + + + + + + + + Eueouraged + + + + by + + + + + + Sl1eeess + + + + + + + + + + + cflorts + florets + + + + + + collcgc + + + + + + imard + hUd + Mardi + + + + ut + + + + + + + + + + + II + n + + + + + + 11111elll + atnalear + antler + + + + jilny + jitney + + + + lhi3 + 19la + + + + piny + piney + + + + + + u + + + + + + + + + + 101 + + + + time + + + + henefit + + + + + + tic + + + + atlaletai + Athletic + Valletta + + + + + + + H + + + + hwItulc + inclulcS + wistful + + + + + + bey + bees + bevy + + + + + + + + + tdirn + din + + + + uul + usual + + + + + + be + + + + w11 + iye11 + + + + + + 1 + < + + + + + + + + + + stmdara + Tamara + + + + + + Ims + Sims + + + + heen + hen + + + + + + + + tlm + + + + + + + otnr + toner + oftener + + + + + + Pllt + Pelt + + + + + + nuder + nude + + + + + + direc + direct + + + + + Lion + ion + + + + + LionProfessor + + + + + + + + + Dalgct + Daltretry + Aglet + Dialect + Maltreat + + + + hill + + + + ngJin + azgain + aging + angina + again + + + + htl + luty + have + html + lusty + + + + + elmrge + emerge + + + + + + + + woll + woolly + + + + uncI + nod + ncI + + + + ho1es + + + + + + dui + + + + + + + + + + + or + + + + + + exccl + excl + + + + + + + + + + + + + + + + mrit + + + + + + + + lde + + + + take + + + + IICII + Koh + bosh + IC + Koch + + + + + or + + + + + + + + + + ivitli + civil + + + + + + addel + Adele + + + + cx + + + + + + + peieucc + prince + penile + + + + + + ilOm + Dillon + + + + + + 
sllcccssfu + sneeessttt + success + + + + + sununcI + umnmer + Sunni + + + + enmgetncnt + penmen + + + + ivitlm + civil + + + + + + Vl11g11 + Vaugll + Vaughn + Vault + + + + + + + Glasse + Glassed + + + + Compnny + + + + + + + + 1 + + + + + time + thou + + + + nwv + meat + wv + + + + + + plnyhollse + + + + + + Hoch + Itoclm + Rob + Hooch + Ito + + + + + + + N + + + + + + + + + + + soleefod + sleeved + + + + + + + + ocr + + + + + + + + + cometly + comely + + + + + + fthe + + + + lime + + + + ut + + + + Ute + time + + + + Xapoleon + + + + + i0 + + + + podoll + poliod + pool + polio + + + + + + + + bright + + + + sparklin + + + + anis + + + + + + + + + Cltmint + Culminate + + + + ihmtions + inhumations + + + + + + + + + + + WM + + + + + + fo1 + + + + nnd + nd + + + + netell + retell + + + + hy + + + + OJ1 + omt + OH + omit + + + + + + + + + + + tho + thou + + + + forlmost + toremost + + + + shus + shuns + + + + + + HIe + Hide + + + + Ameriem + Ameruam + Aerie + American + + + + + + + Etrge + starch + Deterge + + + + + Etrgefhe + starched + Deterge + + + + + Tlw + fhe + Tel + he + + + + + + + + time + + + + picce + + + + + + PhocbeS + Ihoehe + Phoebes + Hoe + + + + + nomanc + Romaine + Norman + + + + ill + + + + + + Stloet + Sloe + + + + + Thc + Ihc + Ic + + + + + + + + + + papilla + + + + + + + + + 1111 + + + + eostnnted + costumed + resented + + + + It + + + + + + nppcnt + appeau + Nippon + + + + + Lime + + + + littler + + + + pert + + + + 01 + + + + Xoc1l1hcr + + + + + + + + + + + + + + + thl + hl + + + + stndnnl + standard + stunningly + + + + + + Imil + quil + equal + Simile + quail + + + + + city + + + + thc + + + + Anclitoiuni + Antitoxin + + + + theatto + threat + + + + wilt + + + + pn + + + + + + + + ienL + enL + + + + 011 + all + + + + mfotlday + moldy + + + + amid + + + + TueFclay + Tuscany + + + + + + + + + subwoofer + + + + mcre + nacre + + + + + + + + + + + + + tiro + tire + + + + + + Hts + Hats + + + + atul + amid + actual + + + + + + only + + + + Audis 
+ + + + + nicutrcs + incurs + + + + + + appear + + + + + + + + + + + + + Cmson + Mason + + + + + + Comlany + + + + + + + + King + + + + Thc + + + + + + + + thc + + + + + + + + + + + + + trlCular + trmeular + tricolor + trammel + + + + nu1 + + + + tmt5formntion + + + + + + + + + + + + + 011 + can + + + + speciul + + + + secIlcry + secularly + + + + + 1150 + + + + g6rgeous + A6rjeons + + + + efTelt + Ethel + + + + anti + + + + magl1ifieenl + magnifemil + magnified + + + + + eostnmcs + esteems + + + + Anothcr + Anothe + + + + stir + + + + net + + + + 011 + + + + + + + + + + + + + + Brows + + + + + + theit + theist + + + + cnmcd + comic + conc + + + + + + + skil + skill + + + + XooIUCNOOlll + NoodleNoodle + Neocolonial + + + + + + act + + + + + + 1tl1 + omi + + + + + doubted + + + + 0110 + coo + + + + + + + + + + nct + aet + ct + abet + + + + + + + thc + + + + meliclu + mellifluent + + + + stagc + stag + + + + mid + + + + + + nl + + + + + + + + Canse1 + + + + + + riol + roil + + + + + + + + + + + + + + + + + + + + + + + thi + Lid + thin + + + + + + + + + + + dtc + flit + dc + + + + + + plice + p11CC + plaice + + + + + + mI + + + + + missiO + 111S510t1 + + + + + + conts + emits + counts + + + + iyill + idyll + + + + hic + chic + + + + smuetl1inv + s0methin + + + + + + + + + + regrets + + + + + + + + + + + + + + + + Stoke + + + + agemtts + + + + fl + far + + + + + + + + + lTadl + Hal + ladle + + + + + + + + + + + + + + + Olive + + + + + + + + r1J1t + + + + + + + + Yan + 1an + Yang + + + + + + + + + cdiltainiug + dilettanti + + + + + + acrcs + + + + f0i + foci + + + + + 35 + + + + + + + + + + + + + loc + + + + + + Asla + Al + Ala + + + + + GnllUlo + Allan + Gluon + + + + nt + + + + time + + + + opern + open + + + + + + Xoem + Noven + Novena + Oem + Oven + + + + + + + ber + 11 + beer + + + + + + + OATIIS + COATIS + + + + + + + + + + + + + + + + + + + + Wilrox + + + + + + 70 + + + + + + + nt + + + + het + lice + heat + + + + + + + + + + Satnr1ay + + + + of + 01 + + + + + 
geueral + + + + + + rhe + rhea + + + + + + + + + + + + + Aulioeh + Calliope + + + + hunh + hunch + + + + hy + + + + + + EltIcr + Celtic + + + + + + + + + Lnwry + Lowery + Landry + + + + Sunray + + + + morningorning + + + + Burinl + Burin + + + + + + + nt + + + + Thomts + Ihoma + Homage + + + + burinl + burin + + + + greunl + gruel + + + + + + + Strivers + + + + + + + + + + Hwn + Hewn + + + + S + + + + yeal + yea + + + + 01 + + + + + + + + + + + mid + utd + td + + + + + + + + Ernes + + + + + + + Sticgrs + Strivers + Tigers + + + + lied + + + + inexpressiveness + + + + iii + + + + Titck + Tick + + + + + + + + Sol1 + + + + + + + + + + onyphoid + o1Ytyplioid + boyhood + + + + + + + + + Thc + Time + + + + + + Terre + + + + Inouglit + Inoculate + + + + Jtere + Terre + + + + + + + + + + fm + + + + hint + + + + + + + + + + + tool + + + + buttal + bunnies + brutal + + + + + + + + + + + + + + + FAMOUS + + + + + + + AND + + + + UGTURER + FUTURE + + + + + jOie + + + + + + + Opine + + + + + + + + anrl + ami + Carl + amid + + + + Philosophe + + + + + + + + + + + + + + + Liam + + + + + + + Limit + + + + + Buckling + + + + + + + + + + + Thl + Hl + + + + folloing + + + + tCll + tell + + + + retons + Bretons + + + + are + + + + gim1 + give + + + + + Cll + + + + feU + feud + + + + + + + + or + + + + Opine + + + + Kead + Knead + + + + + + + 1 + + + + JJCl1lll + 1s3e1mi1 + + + + + + + + + tie + + + + + + + + + + + + + + + + caption + + + + tllC + tlC + + + + mtlwl + tall + + + + 11 + of1 + + + + morc + marc + + + + wideiti + widget + + + + + resid + lcl + reside + local + + + + books + + + + tlwn + lawn + + + + + + + + liyimm + Elysium + + + + + Anleli1Ii + + + + 11thol + anther + + + + + + + Jc + Ilc + + + + + + out + + + + or + + + + thc + + + + + + souhl + soul + + + + + + + 11111 + + + + hest + chest + + + + + + cuntiibuiors + + + + + + + + + + rcnt + recant + + + + 1Jeriodicnls + periodieals + + + + + + + Iris + + + + + + + + familiiu + familial + + + + + + mosi + mos + + + + + uf 
+ 1t + + + + till + + + + htl1lcgoill + Ieoturegoing + lecture + + + + pnblic + + + + lIf + + + + + + + + + auel + amid + laurel + + + + dty + duty + + + + + + + IIe + + + + + + + + + + iuheltis + advertiso + unhealthiest + + + + + 0 + > + + + + III + Ito + + + + + + 4110 + + + + + + + + hst + hest + hast + chest + + + + elmw + elm + + + + + + King + + + + aUractions + abreactions + + + + 011 + do + + + + ll1UlY + mimumy + mommy + + + + + + + + 1tlI + lair + + + + + guest + + + + + + tbis + tJis + tbs + + + + pat + + + + seon + Seton + + + + + 0 + + + + He + + + + mint + + + + Hilly + + + + glls + galls + + + + + + crowL + cowL + + + + + lie + + + + plesses + lessees + + + + thiml + Ihem + thimble + Hem + + + + + i + + + + lIe + Ile + + + + docs + 1005 + + + + lot + + + + loll + + + + + + + + OJ + 0r + + + + + JIM + + + + + Icltah + Clash + + + + oilier + + + + mens + means + + + + 11015 + + + + + + + tic + + + + + + iviih + iii + + + + eflllal + + + + iaein9 + fteiua + fascia + ferial + + + + + + + + + + + elasses + lasses + + + + 0t + + + + jcople + couple + + + + + O + + + + Ill + 1Ic + + + + Imows + Mows + + + + whcJeof + wher0ot + + + + + + spenlt + spank + spelt + + + + + 0J + + + + + + htR + hats + hR + + + + + + aunty + + + + + + + + + + + + + + + expc1ncc + + + + + + + Ill + lie + + + + + + + + + + + + yoe + yoke + + + + + + + + + unO1 + 11101 + union + + + + + + + + + + + + nt + + + + + lie + + + + + + + + + lit + + + + Read + + + + + + + + appeu + ape + + + + + + + + lectl1lt + leetur + lecture + fleeter + + + + + at + + + + + + Lint + Liam + + + + luekliu + lucking + clerklier + + + + nt + + + + Kcntucki + Kentuckian + + + + + + + + + + + of + + + + + + ecuinz + cumin + + + + of + + + + + omlI + onember + Somali + condemner + + + + + + + + + VIILL + VILLA + + + + Of + + + + + + + + + fORCf + forceful + + + + + + + + + + + + + + Accord + + + + + + + + + + + + + + + + + + + + + + + + + + MonM + MoM + + + + + Mon + + + + + + + + + + + + or + + + + + + 
fCord + McCord + Record + Cord + Accord + + + + ivill + civilly + + + + + + + Otis + + + + ntire + + + + fore + forcC + + + + ont + tint + + + + Mloulay + Modula + + + + + + is + + + + + proh + Drool + pro + + + + + + lHopelty + properly + lonely + + + + 1111 + + + + + + + + the + + + + + + + + + + Hntloe11 + + + + ptics + optics + + + + arc + + + + nrl1crly + + + + + + + + + fated + tatted + + + + + + 11101nillg + + + + + + + + mini + + + + + tcnded + ended + + + + + + poi + + + + + + + + + + mime + + + + defncinpI + defining + + + + + + + + + nod + nd + + + + ottcr + otter + + + + roivcly + richly + + + + IInd + amid + Indo + + + + nu + + + + + + + llIct + alled + licit + allied + + + + tor + tore + + + + + + + + it + + + + hy + + + + incur + + + + + + + nnd + l1td + nd + + + + Dogs + + + + on + + + + + + uighi + eight + ugh + + + + tie + + + + + + ill + iu + + + + + + + trmted + tructed + trucked + termed + truncated + + + + + + + + + + arret + garret + + + + anti + + + + plact + plaec + pact + place + + + + + + + jrd + dial + rd + + + + + + olle + 000 + ole + + + + + + drfacing + + + + 0r + + + + de + + + + + tloing + straying + loping + + + + pIopelty + poetry + + + + + + + + amy + + + + + WOODMEr + Woodier + + + + + + + + + Tle + 110 + Tale + + + + lOlleln + Allen + + + + Woodmcn + 1Fondmen + + + + 01 + + + + AmCICf + Ametier + America + Pacify + Ammeter + + + + + + + + + + + it + + + + + + + + + + theii + Heidi + + + + + + + + + don + + + + + + thc + time + + + + + + b1i1diti + + + + I + + + + + 1omlay + + + + + + + + + + + + + urce + Vance + cure + + + + + + + + trnnHted + tansacfed + tormented + + + + mill + dud + + + + nlso + atlsn + nelson + atlas + + + + worb + orb + + + + + + + + + tel1m + teanm + + + + + tamper + + + + + + + + + + + + + + + + + + + + + + Hcrald + + + + + + pIny + piney + + + + + + + + + + + + + Winche5tcr + + + + + + + + + of + + + + + + + + opera + + + + + + The + Thc + + + + Housc + + + + or + + + + 1 + + + + rhOUSfind + rosined + + 
+ + Caai + Cain + + + + + tllea + dales + tulle + + + + + + + + + + + + + + opcrn + OCR + + + + housr + houso + hours + + + + + + + mntitHe + amniotes + + + + aul + Gaul + + + + + + + + + + + + + + + goodsizcc1 + goodie + + + + + + + + + + mensch + densely + mesh + + + + + + + + + + + + + + pJay + Jay + + + + + + + + Tought + Thought + + + + + + + + + + + + entertainiup + entertain + + + + + + + + + iyithottt + + + + + + iu + + + + + + 5C10 + + + + II + m + + + + + tronA + tonal + + + + storv + Astor + + + + suffer + + + + + + + + + + it + + + + + time + + + + dramatizction + + + + an1 + + + + iii + + + + + + hmul + imand + html + viand + + + + + + + + + thorouglIIY + + + + competcnt + + + + + + + + + + + + + + + + + + + + Time + + + + + + + + + + + + book + + + + + + + + + + + + + thc + + + + + + + + + + + + + + tees + + + + + + + imptrtl1rbahle + + + + tacituru + + + + tesource + + + + + + rnl + foul + rn + + + + + + + + thc + + + + + + + + Glenallll + Gleam + + + + + + + + + + + + + + + Ixst + prst + pet + Kist + prost + + + + + + conccnl + conc + + + + nud + nude + + + + + + + + + + + + + + + + + + + + + + + + + + + + nerontc + necromantic + necrotic + + + + leo1 + revolt + + + + + + + llntcF + lilt + + + + nnd + nd + + + + + + actiitte + ratite + + + + + + + n + + + + + + + + conlail1 + containn + contain + + + + + + + mlny + Maloney + + + + + + tinill + infill + + + + + + + + Wag + vas + + + + eon + + + + + + + petclt1 + petcntly + patently + + + + trken + Turkmen + + + + + + + + + + Inn + + + + + + + irho + biro + + + + + + + + + + compaitsot + comparison + complaints + + + + + cmincnt + + + + + + + + + + It + + + + + + + dth + doth + + + + + + + + + + + HoiIanl + Haitian + + + + tiho + Jo + thou + + + + Inst + Inset + + + + + + + + + + Alex + + + + + + + Kingston + + + + nCClthelc5 + + + + pOlllaR + + + + + + nrl + + + + + + + milnhlY + ntirally + amiably + smilingly + naturally + + + + 111 + + + + + + + + eritie + verities + + + + + + 11lwe + nerve + + + + 
+ hod + hood + + + + ne + + + + pleasnre + pleasuie + + + + offsetting + + + + 1tr + + + + Hol + Holy + + + + + + + lie + + + + world + + + + he + + + + moo + + + + + + acclpt + aceept + + + + + + + land + + + + + 11110 + + + + + + + Pmabclle + Respelled + Marble + + + + Lcslic + Cystic + + + + + + plnocl + playa + pinochle + + + + + lhe + he + + + + pHt + Ht + + + + + + L1on + lariot + lariat + + + + Devereraix + Coverer + Severer + + + + unfor + unfol + uniform + unfold + + + + + tunntcly + ttnately + truncate + tunnel + attenuate + + + + wns + awns + + + + + + + + + + + + + prai1ing + + + + nmlady + nullady + milady + + + + lhat + hat + + + + + + + + + + + + + + firt + + + + anhim11 + + + + rold + old + + + + wac + ware + ac + + + + + + + wns + awns + + + + nble + ahlC + ail + noble + Dahl + + + + + + seak + peak + + + + + + Iiiti + lines + Iii + + + + on1y + + + + + + + + + diliculty + + + + + + + + + + + + + 1oarsines + + + + + hoarseness + + + + liner + + + + + + iloue + loupe + + + + + + + + + + + + fircgoly + fiercely + + + + + + iyafh + Riyadh + + + + + + dolly + + + + Irish + + + + + fiord + + + + + + Donoan + Donorau + Donovan + Donor + + + + portrayed + + + + + + + + + + + + + + + + + + + + + + + + + + pot + + + + + + + + + + + + + H1ul + + + + + + oldtimc + olcltime + oldie + voltaic + Mollie + + + + Xorth + NOJ1h + + + + + + Irclnucl + Micronuclei + + + + + + + + + lime + + + + + + aet + abet + + + + whell + hell + + + + time + + + + + erouwcd + crOutned + crowed + rowdy + counted + + + + + + + + + + + + attuckc + attack + + + + + + + GlCU1lUl + Glenarni + Gleam + Learn + + + + + + + + tlw + lime + law + + + + + + + + + + + + + dispossesitlg + + + + + + Gll + Gleu + Gall + Glue + + + + + Iun1 + return + + + + + + + + + + milnsion + minion + + + + + Thc + fhe + he + + + + + + + + + + + + + nollgh + knoll + + + + tll + + + + + + hoth + hot + + + + + + gll1ery + + + + 111 + 8111 + + + + + Iuwel + luircr + Unwell + blurry + + + + lloor + 1Ioor + + + + in 
+ + + + 1 + + + + + + + + excitemcnt + + + + + + + morn + + + + anisanci + Stances + Nissan + + + + + + oat + + + + ul + anal + + + + leay + leave + leafy + + + + + IIi + + + + + + deeidedl + deeded + + + + + + + + + + time + + + + mouth + + + + + + + + + inditor + indictor + + + + it + + + + thl + hl + + + + + + + AIRPLANE + + + + + + + + + + + + + + + + + + + + + + Far + + + + + + + + + IVASII1GI0 + + + + Olt + Bolt + + + + 31Iol + Ovoid + + + + + + + fitct + fit + + + + + + + + ompilers + + + + + + + + tall + + + + + + + trale + tale + + + + + + havy + heavy + + + + toned + + + + + + + lleeuQ + lee + + + + + + mnJxC + mnJw + minx + NJ + + + + + + pernlaueul + prelature + + + + + + + plfCC + pelf + + + + ill + + + + + + + + 101 + + + + acroplull + teroplaue + airplane + Carroll + aeropause + + + + + amI + amid + + + + ailshij + ails + + + + X1J01tS + + + + aecutdulti + acidulating + + + + tft + ie + + + + + ConsuJ1 + Consuular + + + + rcpOlts + repots + + + + + + + + Om + flat + + + + + log + King + + + + t110 + + + + + + + + Septemhcr + + + + shipmcnt + + + + + + + + + + + mllounted + anointed + mullioned + + + + + + + + + nnll + fund + null + + + + + + + + pnions + onions + + + + mottlm + mottle + + + + + + 33870 + + + + + + + + + + + + + + Iliac + + + + + + + + + + Oet + Moet + + + + + + + + + + + + + + rnion + iut0t + Orion + + + + + + IIaeke11ctek + Hackie + + + + + ndopte + dope + + + + resolutioll + resolution + + + + 8unday + Stuulny + Truly + + + + + + + mngrtding + engirding + + + + Wilhur + 1lilbur + + + + Hlul + Hull + + + + 01111e + Oriille + + + + 11riht + + + + + fo + + + + rfusillg + remusitmg + fusil + reusing + + + + + + aUow + nlloiv + avow + unloved + + + + + + employe + employees + employed + + + + + + + + + tly + tally + + + + + + neroplanc + airplanes + necromancy + + + + 011 + of + + + + Snuday + + + + + + + + + + + i1st1lcte + + + + + + + + + t + + + + eopy + + + + oi + + + + + + + + + + + + + Priggish + + + + + + + + + + + + W1IEEIJNG + 
+ + + + + + + + + + Yhit + 11hile + Hit + + + + + + + + rhcI + Archie + + + + Snulay + Snugly + + + + + eyening + + + + + + lfotndsrille + Louisville + + + + + + + + skitT + skit + + + + + Jane + + + + DowllinA + Dolling + + + + + + 1rOnted + droimied + domed + + + + + + + + + + + + + + + hnc1 + 1181 + + + + + + ccnp + Cscap + escarp + cc + Escape + + + + + + + whcn + + + + lime + + + + + + captizcll + captized + capsized + chaotically + baptized + + + + Downinp + flownin2 + Owning + Downing + + + + + + + + + it + + + + r0sident + + + + + + + + Rellaie + Blair + Relative + + + + + + + + + + AMERICAN + + + + + + + + + + + + + + + + + + + + AboutKid + Kidnaping + About + Kidnapping + + + + + + + napping + + + + Qf + + + + + + + + + Inflexion + + + + + + + + + + + hown + + + + + + teleInm + telexing + + + + eonrelni + concern + reorienting + + + + JUte + time + + + + lad + + + + + naping + nnping + napping + inning + + + + + + + + Holph + ltolph + Kop + Hop + lop + + + + + + fender + Spender + + + + + lebraskn + Nebraskan + + + + + + Ambassadoi + Ambnssac101 + + + + + teny + teeny + + + + + + Wilsou + + + + saic1 + + + + Snnday + + + + + niglit + niggling + + + + lw + + + + IUH1 + lord + + + + + + informuaiiot + + + + + + tit + + + + + ubject + + + + Mis5 + hiss + + + + Hnlph + Rolpli + Ralph + Roll + + + + + + kidnapped + + + + + + + hy + + + + n + + + + Jlcxican + Lucian + + + + Ieon + pent + Eon + + + + + + + + + + + Hcgnnda + icrgunda + Spondaic + Hacienda + Nicaraguan + + + + + + + + + + + + ncnl + encl + + + + Cheeoy + Chicory + Cheney + + + + + + + whcre + + + + + + hall + + + + heen + heed + hen + + + + 5penclinp + + + + + + + + + sllmmcr + slimmer + + + + + + + + + + + + + + + + TIIOIASYILLE + + + + Gn + + + + OcL + Ocala + + + + + + + + + + + + + 511Ot + + + + + + Iiilled + Initialed + + + + h0rm + + + + + + + aftrnoon + aftenoou + + + + + + + + 111 + + + + + + wIll + tvlmc + TLC + + + + + + + + + + + + + hnd + hand + + + + + + this + + + + clos + cloSI + 
clods + celosia + + + + + + + BId + + + + SllICllClcc1 + surretdered + + + + pantsuit + + + + + + + + + tired + + + + + + sclfcfcnsi + + + + + facsimile + + + + + + + + + + + + + + CIIICAGO + + + + + + + + + + WHlianv + WtiliaID + + + + + 11 + u + + + + + + mnn + Mann + + + + + + + + + + killct + killei + kill + killed + + + + + tt + tit + + + + + + + + + + + + SundaT + + + + + aight + alight + + + + Iilliams + + + + + + + + htirglnr + htaglar + twirl + Hagar + + + + Cr + + + + + tering + tarring + tearing + + + + + + + + + + am + + + + windo + window + + + + + 0111 + + + + nttcmpted + + + + + + + + + + Thr + Thor + + + + + inltltde + Gillette + + + + + + + + + + + + + + + + + + + + + + + + Prcparlng + + + + + Tablcs + + + + + + + + + + + + + + + + + + + + + + + + af01y + + + + + + t11nt + + + + + + op + + + + + utsille + iutsile + utile + inutile + + + + + + + + oUlce + dolce + + + + bas + has + + + + + + + + + ontire + putlre + pustule + + + + + + + + trom + tromp + + + + + + + + + + + + + + + + + + + + + + Ilrinleel + Irvine + + + + almamc + + + + ia + + + + + + + + officee + + + + + exnmlnell + exantlned + exampled + + + + tlo + lo + + + + + + + + + + + + + + + tolal + toll + + + + + + + + OJures + Juries + + + + + + + + + + + + + + + + + + + numbcr + + + + 19 + + + + + it + + + + + + trilling + + + + + + + + + + numb + + + + + + + + + calc + + + + + rations + lotions + + + + + + + + + + + + prepare + + + + + + + + + + + + + + + + + + + + + + + + + tor + tore + + + + itn + in + + + + calcul + calculi + + + + + tion + ti0m + ion + + + + + + + + + + + + ami + amid + + + + + + hair + bait + + + + + or + + + + + + + + slmll1rly + + + + + + otber + October + + + + + + + + + + + + + + + ns + + + + + + sett + set + + + + + tthe + Lithe + the + + + + planets + + + + + + + + + + + + + + + feral + + + + pIDio1 + oplnioa + olio + + + + practicall + practical + + + + + + + + + in + + + + + + + + + + + + trom + tromp + + + + ypar + par + + + + + + + + + + + + + + + + + + + 
aft + aim + + + + + tho + thou + + + + + + + + + + hae + hade + + + + + + orlg + orig + org + rig + + + + + lnally + finally + lineally + + + + + + + + + + laborn + labor + + + + at + + + + + + + astronomlcat + + + + + + + + + + + + + Lithe + + + + + largr + + + + + + + + + + + + + + + lhe + he + + + + + + + + + + planets + planers + + + + marle + marble + + + + + + + + + clue + + + + Horal + Oral + + + + + + + + + + + + escapes + scopes + + + + + + + + + + appall + + + + + + + + + + anccs + nnces + panics + minces + + + + aro + are + aero + + + + consllcuousl + + + + + + + + + + + + + + + + + statt + stat + + + + + + IlUrely + Laurel + + + + mathematic + + + + + + + meal + legal + + + + anti + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tbose + Boise + + + + + + Ulen + Len + + + + Risk + + + + + tlous + Cious + ious + locus + + + + + + + + + + politb + + + + atten + alten + attend + Alton + + + + + + + Uon + tiomi + ion + Upon + timid + + + + + + theft + + + + wles + wiles + + + + + + tho + thou + + + + rcat + feat + cat + + + + + + + I + + + + + + + + + + he + + + + ecpected + + + + + + + + + + + + + + + + + + + ft + + + + + + + + + + + + + + + ehance + enhance + + + + + Ionic + + + + drear + + + + xI + + + + + + + + + + + + + + ii + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CENTS + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + and + + + + + + + + + + + + + + + + + + + + + toPcwell + locally + topically + + + + + + + + + Voters + + + + + kola + + + + + + + + + + + + Hou + Hour + + + + + + + floyd + + + + Lyrd + Yard + + + + lllwCIatie + ileuluctatic + + + + nomince + iiumilice + iniquities + + + + + jour + + + + + + frum + 1tnnl + forum + + + + tIe + + + + Tejilh + Tl1th + + + + COH + OH + + + + + + + Messina + + + + itrid + nitride + + + + tIHI + Thai + + + + RuhlJt + Ruth + + + + H + It + + + + + + + + f1il1d + ivied + + + + + + Jryine + Juried + + + + + + Sat1lliay 
+ Sntmday + Stoma + + + + + + + dressed + + + + 111 + thC + + + + v1ters + + + + + + Ptavell + Lowell + Travel + + + + + + + + + ntStanton + tagalong + instant + + + + tlul + au1 + lull + + + + lay + + + + + + + + spCak + + + + + cr + + + + + + vcr + + + + gIcetcl + iciest + + + + + + + + + + + + + iinIes + biomes + Winnies + + + + tht + + + + + + Itcmlcd + tte1lccl + Sitcom + + + + + + + + + 8pelkmJr + + + + iii + + + + + + eunnty + aunty + + + + + + year + years + + + + + Ireie + Fibre + Retie + + + + + + eleuiire + elsewhere + eclair + + + + lllmbel + 11umhers + + + + or + + + + Hc + Re + + + + + + + + + cXTlrcssatl + cxrrresscl + + + + + + + + + + + Snpnorting + Sporting + + + + + + + + ill + + + + + Jlit + liar + Lit + + + + loch + + + + minions + + + + + + + + ctfeetlc + octet + + + + + + + their + + + + + + + + + rssiic + Rossini + + + + 0 + + + + + + + + camuaiun + causation + + + + mll + 5111 + mall + + + + + + + + + + + nreliele1 + + + + Oil + 0t + + + + all + + + + + + + + tlw + law + + + + + alolylutl + ally + + + + Dmnocratie + Democratize + + + + l1lujoliiv + + + + + + + + + + + + + + + + cH11tv + + + + tvill + till + + + + + + matlialh + matcrially + martial + + + + increase + + + + + ell + el + + + + + + + + + + + + + + + + LEiXJXGrON + LEl1XGTON + + + + Jy + + + + ot + + + + + + + iSales + Snles + Sales + Singles + + + + + + + + + + nnt + nt + + + + + + OIl + omi + + + + + time + + + + + + markct + + + + lionday + Lindsay + + + + O5 + + + + + + hut + 1111 + + + + + vtill + mU + till + + + + + + + + mud + + + + 011 + + + + + + + + + + + + + + + time + + + + Hurley + Hurly + + + + Society + + + + + 1000 + + + + looled + lolled + + + + C101 + + + + + + alo + Palo + + + + be + + + + rcsnm + resnm + resume + racism + resin + + + + + ed01 + econ + + + + + + + + + + + + + + + Whrehol15c + + + + nftcr + FTC + + + + h1ing + halving + + + + + + 8n5 + subs + + + + + + + + + fOI + foci + + + + seyeral + + + + + + + + + + + tecson + Tucson + + + + + + + + + + + 
+ + + + + + looc + loc + + + + lenf + len + + + + + + + + flint + + + + + + + + + + + + + + + conlillg + coning + + + + + + + + + + + + + + + conferelHe + confer + + + + + + + + + ic1110on + tenon + + + + + + + + deeilci + deicing + + + + + + + + + wonld + + + + + + best + chest + + + + + + + + + + eidetic + + + + + + + Timesday + Immediacy + + + + + + olller + + + + flint + + + + + + + + + pncco + bilcco + bacon + Nucor + bicorn + + + + + + bc + he + + + + + + + + + + TIll + Tin + + + + + + + + + compnrtilv + comparative + + + + smi + snmal + semi + + + + Lt + hi + + + + + Thrsduy + + + + + + 7riday + + + + + + + + + + + + Qrs + IQs + + + + secm + Secom + + + + in + + + + he + + + + anxions + anions + + + + + + 1sposc + + + + or + + + + + fhe + he + + + + host + + + + + + + + ill + iii + + + + + + h1rn + + + + + + + + + + + + + + OFT + + + + + + + + + + + F018IRI + + + + + + + + + + + + + + Omens + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Stutc + Stale + Stout + + + + Fcdemtiou + Fe1erltiion + Facetious + + + + + + romcns + 1lonicmis + Omens + Romans + + + + + + + + + + + outlincs + ontliues + continues + + + + folic + + + + + + + + + Rtudy + Trudy + + + + ill + + + + + + + + l1l + mist + + + + + + + iii + + + + + + + + omens + + + + + + + t + + + + + + importanco + itnportance + ignorance + + + + + + thc + + + + studyof + sullenly + study + + + + + + + + + + + + + ill + + + + lite + t111 + elite + + + + + + + P1ioclamntio1 + 1noclamatioi + + + + + + Presideut + + + + Hoose + Itoose + Loose + Hose + Mitoses + + + + + + + + veldt + + + + while + + + + + + ol1ie + offce + notice + office + + + + + + time + + + + 8choo + sdtoo + school + stood + + + + + + + + + + + llnitc1 + + + + + + + + + + + wllieh + Willie + + + + menns + menus + + + + + + + T1ce + + + + + + + + + + obc1c + observe + + + + + + + + + + + + + out + + + + eniou + Cniom + envious + CIO + + + + + + + + iu + + + + time + + + + + + + schools + + + + + elmoolsTt + Schools + 
embolus + + + + + Tt + + + + + + + + lhl + hit + All + + + + + + sho1111 + + + + clehm + celehraft + celebrate + clam + + + + + + + ArLoI + Carlo + + + + + + thOllghtfully + ilmottghffnlly + + + + fu1 + + + + WWI + ivitli + civil + + + + + ih + in + ii + + + + you + + + + lifetiil1c + lifetiine + lifeline + + + + till + flue + + + + + + + + + or + + + + + + + + heeome + hereon + + + + CtioU5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + nnd + nd + + + + respond + + + + + ibilHies + sibiltlies + 1sibiltlies + sibilates + ribaldries + sibylline + + + + whichm + + + + + + npom + nom + + + + + + ir + it + + + + + Vp + + + + + + + + lelrn + + + + tlw + law + + + + hings + hinges + + + + YOll + Yell + + + + wil + wail + + + + + + + iiecd + ivied + + + + + + + + whn + + + + + + 5011011 + + + + + + I + + + + + + + oel + Joel + + + + + + bill + + + + smiler + miler + + + + fhe + he + + + + cone + + + + + + + + + ijuei1CCS + finances + + + + + + my + + + + + + whieh + ivhielm + civil + + + + iu + iii + + + + H + + + + + + + + + Jouth + youth + Joust + + + + + + + + + + he + + + + dny + tiny + deny + + + + + + + 7xirilliotit + + + + solving + + + + + + COllunH + Collin + + + + + + + jlWl1t + oilfield + + + + Iillsballding + Dillydallying + + + + + + CXPct + + + + + + PC1 + + + + + attn + + + + + + tho + thou + + + + + + whoe + in1ose + whole + + + + InblJ + lIbo + labor + limbo + + + + + + + diflieulty + difficulty + + + + filHl + fool + fill + + + + hint + + + + + + bare + + + + + + + + + + + + + + + + + + + + tnie + tie + + + + + + + + + + + + a + + + stOIl + Aston + tOIl + + + + + litsc + flits + + + + + + + + wool + + + + + + + + + + + + + + + + + fiictoiy + fiction + + + + + + X001 + + + + anti + + + + + + lime + + + + + + timl + rims + + + + + n + + + + + + + + + + Wlcn + Wllel + Wiener + Lcm + Wale + + + + yon + + + + hII + helil + II + hell + + + + + + + + + OUl + Foul + + + + fOlests + foests + fogless + + + + 01 + + + + plint + plaint + + + + der + nevi + 
deer + + + + + cues + + + + you + + + + arc + + + + aeting + eating + + + + UI + + + + + + + + + + + + + rhe + rhea + + + + + + + + + + + + + + + there1olc + + + + + + lie + + + + + + + + thr + thru + + + + + sciool + ccilools + schools + school + Cyclops + + + + whidl + while + + + + + + + + mnlw + ml + + + + + + + itizcns + + + + + + you + + + + + + + + Arhor + + + + lay + + + + + xcrciscs + crises + + + + + + yon + + + + + + realie + renlizc + relic + + + + wh1 + whiff + + + + + + + elch + Welch + + + + onc + conc + + + + cotton + + + + recio + teeeiyes + recoil + + + + + froin + groin + + + + + + + + aud + Maud + + + + + + hy + + + + Reiji + + + + + siJtanee + sitailcC + distance + shitake + shirttail + + + + + + + + + + contin1H + enmmtinne + emptiness + + + + + + + + + + + hill + + + + scree + screed + + + + + + Aood + Wood + + + + ellil + elli + + + + + + + + + + + + + + + + + + + + + + + + + + + STEIUrnG + STEIILING + Steering + STENCILING + + + + + + Ort + + + + n + 31 + + + + + Iittle + + + + Ollie + Colic + Mollie + + + + Caytvood + Cawed + Cato + + + + Smouthsolll + 1Smontlmsold + + + + + qmghtel + might + + + + + + + + + + ll1s + Julies + + + + Onno + Guano + Ono + + + + Cis + CPIs + + + + + + + + + + + + + + + pJatyin + playin + partying + + + + Joni + + + + + + + + + whell + rhea + hell + + + + Hie + Hide + + + + + + + + tn + + + + + + + + + 1100J + hoar + + + + + + herai + Hebraic + + + + strstck + + + + + + nnll + amid + null + + + + + alit + + + + + + + + + + + + + + + + hcad + chad + + + + + + + + + + + eausint + Austin + + + + thc + time + + + + + + + + time + + + + + + + + + Missy + + + + + + + + lestroycd + + + + TJic + Tic + + + + + + + + + + + + + + skirted + + + + + + + + + + lnonght + lining + + + + + + + + + + + + + + + + + ykerc + Ayer + + + + + + examination + + + + + + + + + + + + + + + + + + + + + + + + + + NfW + NW + + + + + + + WARLHOUSE + Warehouse + + + + + + + + + + + + + + + + + + Stcckyards + Stcckyartls + + + + + + + + + + Nex 
+ + + + + Week + + + + + Weeklime + Seethe + Weeklies + + + + + iht + lime + ht + + + + tobweo + towhee + + + + uarchnnse + urchins + + + + iehiefl + Ethel + + + + + + + + + IetNl + creeted + Viet + created + + + + + + + + Hroadway + Roadway + + + + + + + + yaids + yauls + aids + hauls + + + + + + IwaliuA + ueating + Italia + eating + + + + + + 1111 + + + + iifll + if + + + + + Iw + + + + + + 10 + + + + retcive + + + + tohaco + lobaceo + cloacae + + + + + + + + + Hlt + limst + list + Halt + + + + or + + + + + + wlek + + + + + + bouse + hOllC + blouse + holly + + + + + + he + + + + + ued + cued + + + + + + Stwnrt + + + + raylor + + + + + + are + + + + + huyillg + + + + tohaeo + Theo + + + + + + + + oullly + + + + + + + + + + + + + + + + + + MING + + + + + + + + trice + + + + + + + + Kas + Keas + + + + + + + + + + + + Repair + + + + + Jhe + lime + He + + + + + + + + + + hu1mo + + + + Ht + + + + + + + I0iwcr + + + + + + + + 11lllS + ttuts + tufts + + + + thu + thou + + + + strcet + + + + + ctr + ctrl + + + + + + hecn + hen + + + + + + amid + + + + lomitlay + gloomily + + + + + ftcl1ool1 + Iftcrnoon + + + + time + + + + ear + + + + + + tmted + tempted + + + + + + + + + + + + + hcen + chen + + + + + + oml11issil11 + conmission + + + + + + + ntarly + ueani + quean + + + + nice + + + + + + ntul + au1 + null + + + + tbis + tbs + + + + + + till + + + + + + + frollblc + rollback + + + + + + ompnny + + + + 111s + hts + hats + + + + cx + + + + + pericncecl + peieneed + prince + princely + preened + + + + + + + + Ahont + AlOUt + Hon + LOUt + + + + + + weekm + week + + + + + + + + + + + + + + + Boils + + + + hurned + btuncd + churned + obtund + + + + uut + ut + + + + + + + + + rcpleed + rippled + + + + + + oing + doing + + + + + + time + + + + + + + ondition + + + + + + + + uiiut + utility + + + + cOih + couch + + + + + + + + + + + + + supi + upi + + + + + + maehiuc + Mathieu + + + + + + + nil + + + + neiv + nevi + + + + + + whih + ivhiclt + vehicle + + + + tuck + + + + + + + + 
+ + + davs + days + Davis + + + + + + + + + + + RfV + RV + + + + + + + + + + + + + WATIS + WAITS + + + + + COMPlIMfNUO + Compliment + + + + + Whitcsburg + Whites + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Itey + Pitney + + + + Bd + ld + Emil + + + + + + 11atts + + + + hns + hens + + + + + + seen + + + + + + + tho + thou + + + + rcthltist + ltethe1ist + recitalist + + + + Confcrenec + + + + + + + + + elmarge + enlarge + + + + + + thc + time + + + + ell11rcb + clutmeh + clutch + + + + + + llhitcs1urg + Whites + + + + + + + + + + + thtt + + + + + + puhlislmes + + + + thl + tin + hl + + + + + + + + + + + liini + iii + Hindi + + + + + + + + + + + nfto + nastnr + info + nastier + + + + 01 + + + + time + + + + + + + + + h1t1ch + chnrdt + chard + + + + Sonth + Sonilm + Soil + + + + + + 11riiesbnrg + Writs + + + + 11Iil + 5111 + + + + Cool + + + + + Sot + + + + + + + + ntthc + natiyC + nth + nattily + + + + + + Clnrk + Clank + + + + eonnty + entry + + + + + + + Arter + Carter + + + + + + feiv + five + + + + yur + yers + errs + your + Ayers + + + + Ipcnt + Pct + + + + + + + + distritt + + + + + + + 0f + + + + + + connty + comiy + comity + + + + h0 + + + + bcram + cram + + + + + asnsciatelt + asusciateei + associated + associate + + + + iiith + + + + + + 1i1111 + irnt + dint + + + + + + eontrnctor + + + + + Iud + 11111 + + + + + + + + Avhich + Avouch + + + + + + + + + + + + + nlmher + number + lamer + + + + lieitmg + alienist + + + + rllsiluUS + + + + + + gain + game + + + + + + King + + + + + + + + h0 + + + + agaiu + + + + entcrelJ + encore + + + + + sel1001 + + + + iind + rind + bind + + + + prept1el1 + + + + + + foi + fm + foci + + + + + tcnching + eaching + cinching + beaching + + + + + + + + he + + + + + + + + + fot + foot + + + + thrce + throe + thrice + + + + + + mudding + + + + + + fils + fills + + + + + + + ccrliticde + critic + + + + Xot + Oat + + + + + + satisfic + satisiiec + artistic + satisfied + + + + + + + + + tmininA + minima + + + + 
hc + be + + + + catered + + + + hentuckr + + + + + Weslcynn + Ileslevan + Leslie + + + + Collcg + + + + + + 1900 + + + + hlking + takimmg + hulking + + + + tit + + + + + ourse + e0t15C + bourse + + + + lending + + + + + + + + + + + + + + + + + iu + + + + + + + + + + 1910 + + + + + + + was + + + + + + + + + + thal + hal + + + + lie + + + + wns + ivps + awns + VPs + + + + conic + + + + + + + + + lie + + + + + + culled + + + + + + tlu + the + talus + + + + + + + lie + + + + + + licnsell + license + + + + + + prcach + + + + + SeptcmhN + Septic + + + + I + + + + 1001 + 1005 + Flock + + + + + + + + + + tall + + + + + snme + name + + + + + + ime + + + + + + 1lrecd + + + + + + + + + + + + + + + BonHyitlC + Beattyville + Calycle + Bony + Betty + + + + + + Campton + Camplol + Champion + Compton + Campo + + + + Junc + Junco + + + + + tiOI1 + ion + + + + tissio1 + + + + + + thc + + + + nnunal + anunal + + + + corer + + + + + cnce + fence + cancel + + + + + + + + It + + + + Ptris + Tries + + + + + + 1000 + 1OOtl + + + + hI + + + + + + + aail1 + aaimi + asian + aim + + + + assi11cd + + + + + + + + + + whcn + wher + where + + + + + + + sotYCIl + sooty + + + + + + scut + scout + + + + + + + + + + + 110 + + + + extcnd + + + + Talus + + + + + + coidial + conidia + + + + irel + weal + ire + + + + + + eons + + + + monA + aulong + along + + + + + + + SUPPLEIIWl + SUPPlEMfNT + Supple + Supplement + + + + + Supplemental + + + ION + + + + + + + + + + + + + Tuestray + Testacy + + + + + + + + + + + + + + + + tlay + lay + + + + + + + + + + + thc + + + + lirpt + flirt + + + + + + + + thr + flit + thru + + + + + supplemcnt1 + su11lemental + supplemental + + + + + + Tl1ol + + + + wlic + laic + + + + + we1C + + + + + + + + + + + + tile + time + + + + regulttl + rebuttal + + + + Greg + + + + + istmtiol1 + istatinti + distraction + instating + + + + dtty + ditty + + + + + + + + sie1 + + + + + + nnnhlt + + + + + + + + + + + fhe + he + + + + + + + + registe + + + + + tithe + + + + To1day + + + + 
fucsday + lncscfay + fuchsia + + + + dr + rr + + + + WNlnei + 11ed11e + Swedes + Winnie + + + + + tIny + + + + Ai + st + + + + nonn + non + + + + Monlay + + + + 32 + + + + Dl11l0erat + Iemoerat + Immoderate + + + + + ohd + Ihrd + hill + od + Hard + + + + + + + + Hpnhlinns + Rcpuhlicnus + Pangolins + + + + + + + + + + + + POLICE + + + + + + + + + + + 1181 + acre + + + + then + + + + r01111d + + + + + + time + thou + + + + + poJicc + polico + poi + polio + + + + gnttrt + gantry + + + + + + niglmt + Nigel + + + + anl + anal + + + + + + + + + + + + + + + + + + + + + + timefirs + tilefish + + + + + + + + + + + Alva + + + + + + + + lulins + Billings + Luis + + + + + + + + + + + life + + + + + + + + + + + + IliSlE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Walked + + + + + + + Lancaster + + + + + Thl + Hl + + + + Sllciill + + + + + + or + + + + + + + + Ch + + + + + + cubit + + + + + + lens + + + + + + 3fonlny + + + + 11Wl11 + ntoin + Antoine + + + + + King + + + + llw + Time + allow + + + + tCl1U + + + + + + alled + allied + + + + fOl + foal + + + + 1110 + + + + + + + + + frying + + + + + + LiIt + Licit + + + + wilt + + + + Chl + Hl + + + + + + + ndtieli + Intel + + + + + + Esthel + Estlmer + Ethel + Settler + + + + C01whmnn + Cotdtman + Coachman + Ottoman + + + + + + + + + + + + + hreak + hreal + real + + + + time + + + + + + Qf + + + + Mr + + + + + Clnih0l11c + Clarion + + + + LiHlc + Lilac + + + + Lessrs + Messes + Lessors + + + + lhos + Ethos + los + + + + lisOt + list + + + + + + + + fnd1J1ts + femmdants + fondants + + + + Jucle + Julge + Joule + + + + + + ml + + + + + + + + tUiS + thIi + thefts + etuis + Tahiti + + + + + + + + + + + + + file + + + + ease + + + + + + Julge + Joule + + + + + 11arlker + + + + + + Luucastel + Lucas + + + + Imit3 + + + + cc + + + + + + + + + + + thc + + + + GocJlUr + + + + 1l + + + + wiJI + ivill + WWII + civilly + + + + take1 + + + + + Limo + + + + + + + + tul + amid + truly + + + + pIubaby + piquancy + + 
+ + + + of + ROTC + + + + + ncxt + + + + + + + + heal + heu + hue + + + + Idle + Hide + + + + testimony3 + + + + + loJHlay + aloha + + + + nioruing + snoring + + + + + + cons11IDCd + + + + ih + iii + + + + + seJccting + ejecting + + + + + + + + lnd + 1111 + land + + + + inul1cdinleh + immncdiatelr + immediate + + + + + + + time + + + + + + tlJjoUl1lment + tdjoutatment + + + + theaat + ethicist + threat + + + + + journeys + + + + 101 + + + + + + sids + + + + + + tliti + tho + lit + thou + + + + + ease + + + + + PolloinA + Follotving + Polloi + + + + nre + re + + + + + + + + seietel + scented + semimetal + + + + + Elins + Ellis + + + + + + + + + + Horton + + + + QliIT + Licit + + + + + + + + + + 1erryman + Ferryman + + + + + + + + Gill + + + + + + + + + 8kn1I + Skinnier + + + + + + + + Heflin + Gillian + Hellion + + + + Chic + + + + lIndiov + Radler + Ladle + indigo + Adler + + + + + + + ii + + + + Ada111s + + + + n + It + + + + + + + + + FLANGE + + + + + + + + + Stcpp + Step + TCP + + + + + + Iias + Ixias + + + + heron + + + + + + + + + dipthelin + hptherla + diphtheria + dithering + patella + + + + b + + + + iniproing + indisposing + + + + + + + Margatret + + + + Bralley + + + + or + 01 + + + + 11iui + + + + + elmester + semester + + + + isited + iisite1 + + + + + + + + + + + + + + + + + H + + + + KinA + + + + + + Sunjlfty + Sunlit + + + + + + + + + + + + Joc + Jock + + + + + + atIll + tIll + + + + + + nnd + nd + + + + tWQ + ttiyp + IV + TRW + tip + + + + + + + hl + + + + + hen + den + + + + + + + + isitel + + + + foresails + + + + + herc + her + + + + + + + + + + Hie + idle + Hide + + + + Pl1st + + + + weep + + + + + II + + + + Shcrmtn + Shermun + Sherman + Shorten + + + + + + + + Diarists + + + + + + + + + + + hb + + + + sistel + sisterly + + + + + + + + + + + lung + + + + + + + + + + + Wntts + 1latls + Wants + + + + mind + + + + Mrs + + + + C + + + + + IcDonald + + + + + + rOoleiill + Mooesville + Doorsill + ocotillo + + + + spimi + simi + + + + + + + + + + + 
cdncsday + 11ednesday + cadences + + + + + + + + + + + + Dn4n + Unsung + + + + kb + lo + + + + + + + rahonc + Mahoue + Mahoney + aphonic + Madhouse + + + + 111 + + + + + + + + + tenderl + tenderly + + + + + + + + MI + + + + Sterliuj + Sterling + + + + ilaSl + villas + + + + + Momiday + + + + + llrs + alls + + + + + + OamJw + Gamboled + Foam + + + + aocl + aol + + + + littlo + + + + cleric + + + + + Eizaheth + + + + Insku + ltsko + Minsk + also + + + + arc + + + + + + + + + nt + + + + + + + Eddo + + + + Iiughcs + + + + tlCl + amid + tlC + + + + IJUlc + Ilttib + Kiltie + + + + sni + sri + snip + + + + Wood + + + + + + + + + Vinehcster + Vilichestee + Filches + + + + + + + + + + + + + + + friendti + friend + + + + + + + + + + + + + + + + MondayMess + Monday + + + + Bamc + Bmltltei + Banc + Mullein + + + + Daily + + + + + + + + HieJ + siik + Hide + Sikh + + + + + vith + itch + + + + fnm + nm + + + + + + + + + H + + + + + + + + + + + + Ler + Alex + Leer + + + + + + + + + reccntll + recent + + + + + + + + + + tho + thou + + + + smtl + smut + + + + + clen + clean + + + + denth + dent + + + + + + + + fnther + fatlmer + nether + falter + + + + + + + + + + + + + + + pcople + + + + + + hwf + haw + + + + solid + + + + + + + + + + + + + amid + + + + 12ti + + + + cetit + centrist + + + + + + + + + Iowell + Lowell + + + + hought + honght + thought + Hong + + + + a + + + + pnh + Penh + + + + + + trifles + + + + + + + riddletowll + Middletowlt + Middlebrow + riddle + + + + + + wcek + + + + 101 + + + + + + + WELKIN + + + + + + + GeOJgo + + + + + + + + Mrfi + Ht + Mari + + + + + + + FostCl + Foist + + + + Incl + and + Inc + + + + + + SOl1 + + + + Emme15Q + Hemmers + + + + + + + + + Lcxington + + + + returnccl + return + + + + hObte + hotmic + hoi + hotbed + rhombic + + + + Fiiday + + + + + aftcr + + + + n + + + + plnsant + pliant + + + + visit + + + + + + het + liar + heat + + + + sisal + + + + + + + + nnd + nd + + + + hlother + loather + + + + mfrs + irs + frs + + + + Alie + + + + 
Blye + Belo + Lye + + + + rand + + + + altar + + + + + + + IIardy + Hard + Iliad + + + + + Mir + + + + + + Mrs + + + + + + Embank + + + + attest + + + + + + + fl1nerul + + + + + + Inver + Invert + + + + sistCl + sisterly + + + + + + 1ait + Leah + + + + + + + + + + + Beldam + + + + + muss + + + + ESRic + Bessie + ERic + + + + + + + + frielhr + fricdtis + Africans + filcher + friction + + + + + + + 11inehcster + + + + + + sccrnl + scorn + + + + dnys + dens + + + + + + + Proof + + + + + + + + + + mchiiol + chili + + + + + of + + + + neconnt + nescient + + + + + + dipthcria + diptheria + dithering + diphtheria + + + + nnd + nd + + + + sacHet + + + + + + + + feverish + + + + + Iiss + Ibises + + + + + + HodJildn + lndglau + Hodgkin + Shoji + landlady + + + + + + 011 + ou + + + + tho + thou + + + + liik + eiek + Reek + like + geek + + + + + Iit + It + + + + + m1s + + + + AlieC + Alec + + + + ntye + Blc + Rally + nye + Bloc + + + + + + tho + time + thou + + + + nuts + + + + + + + + + + + + + + + + + + + + + + Embank + + + + + + + + + + + + time + + + + pleannt + leant + + + + + + + + dmiS + admits + + + + + + + llr + + + + + + Ilodgkin + Lodging + + + + + + + + + + + + + + Mts + Mats + + + + + + lllyc + Lye + + + + wns + awns + + + + rlsitingg + listing + + + + frigid + + + + + + + Winchcstel + wiuchestcr + + + + Flday + Flay + + + + + + Saturn + + + + + + + Urother + UlOthel + Further + Loathe + + + + Banderols + + + + 01 + + + + + + + + + nt + + + + + + Elkins + Eking + Welkins + + + + + + Stiii + Siam + + + + + d1y + + + + mOl1ling + + + + + + + + + + + + + + + + + + + + + + + + + + + + VOTES + + + + + + + + AH + + + + + + memhets + nletnheis + mementos + menthes + + + + + + time + + + + IndoH9nd + Indepgnd + Indeed + Independent + + + + + ent + Kent + + + + + + + + ClulHiro + Clumsier + Collier + + + + Lori + + + + + + + + + + + mat + + + + + + + + + + + + + It + + + + + + + + + + + + + Prcideht + Pride + + + + + WU + + + + COPIlEIl + COPIIEIt + COPIER + Coquille + 
Copyedit + + + + + + + + + + Alwayz + + + + + + + + + + + + + + + + + + + + + + + + + success + + + + + soar + + + + + + cen + cent + + + + + + + + + + Baltimore + + + + + + + + + + + + Iti + + + + CUSS + + + + + + + + + Cottngc + Clotting + + + + + + Browmi + BlOUU + Brownie + Lou + + + + + Louie + + + + + + + Beatty + Pertly + Batty + + + + Ewes + + + + officc + + + + + 10Utf + + + + + + + + + fiat + + + + + + + + + + + + + + + C111 + + + + Ifome + ionic + Biome + + + + + + 84BA + 898A + + + + + i0313t + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/tests/data/test.alto2.xml b/qurator/dinglehopper/tests/data/test.alto2.xml new file mode 100644 index 0000000..67d3537 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto2.xml @@ -0,0 +1,64 @@ + + + +pixel +2017-03-27ABBYYABBYY FineReader Engine11 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/qurator/dinglehopper/tests/data/test.alto3.xml b/qurator/dinglehopper/tests/data/test.alto3.xml new file mode 100644 index 0000000..6986560 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.alto3.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qurator/dinglehopper/tests/data/test.page2018.xml b/qurator/dinglehopper/tests/data/test.page2018.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.page2018.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + 
+ + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. — + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. 
— Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + + glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau 
Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. 
— Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. — + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. 
— + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/qurator/dinglehopper/tests/data/test.txt b/qurator/dinglehopper/tests/data/test.txt new file mode 100644 index 0000000..41bfe81 --- /dev/null +++ b/qurator/dinglehopper/tests/data/test.txt @@ -0,0 +1 @@ +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. \ No newline at end of file diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py new file mode 100644 index 0000000..cc5cb43 --- /dev/null +++ b/qurator/dinglehopper/tests/test_align.py @@ -0,0 +1,108 @@ +from .util import unzip +from .. 
import align, seq_align, distance + + +def test_left_empty(): + result = list(align('', 'foo')) + expected = [(None, 'f'), (None, 'o'), (None, 'o')] + assert result == expected + + +def test_right_empty(): + result = list(align('foo', '')) + expected = [('f', None), ('o', None), ('o', None)] + assert result == expected + + +def test_left_longer(): + result = list(align('food', 'foo')) + expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)] + assert result == expected + + +def test_right_longer(): + result = list(align('foo', 'food')) + expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')] + assert result == expected + + +def test_some_diff(): + result = list(align('abcde', 'aaadef')) + left, right = unzip(result) + assert list(left) == ['a', 'b', 'c', 'd', 'e', None] + assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f'] + + +def test_longer(): + s1 = 'Dies ist eine Tst!' + s2 = 'Dies ist ein Test.' + + result = list(align(s1, s2)) # ; diffprint(*unzip(result)) + expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '), + ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '), + ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '), + ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')] + assert result == expected + + +def test_completely_different(): + assert len(list(align('abcde', 'fghij'))) == 5 + + +def test_with_some_fake_ocr_errors(): + result = list(align('Über die vielen Sorgen wegen desselben vergaß', + 'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab')) + left, right = unzip(result) + + # Beginning + assert list(left[:18]) == [None]*18 + assert list(right[:18]) == list('SomeJunk MoreJunk ') + + # End + assert list(left[-1:]) == ['ß'] + assert list(right[-1:]) == ['b'] + + +def test_lines(): + """Test comparing list of lines. + + This mainly serves as documentation for comparing lists of lines. 
+ """ + result = list(seq_align( + ['This is a line.', 'This is another', 'And the last line'], + ['This is a line.', 'This is another', 'J u n k', 'And the last line'] + )) + left, right = unzip(result) + assert list(left) == ['This is a line.', 'This is another', None, 'And the last line'] + assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line'] + + +def test_lines_similar(): + """Test comparing list of lines while using a "weaker equivalence". + + This mainly serves as documentation. + """ + + class SimilarString: + def __init__(self, string): + self._string = string + + def __eq__(self, other): + return distance(self._string, other._string) < 2 # XXX NOT the final version + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return 'SimilarString(\'%s\')' % self._string + + def __hash__(self): + return hash(self._string) + + result = list(seq_align( + [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')], + [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] + )) + left, right = unzip(result) + assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')] + assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] diff --git a/qurator/dinglehopper/tests/test_character_error_rate.py b/qurator/dinglehopper/tests/test_character_error_rate.py new file mode 100644 index 0000000..b16d37c --- /dev/null +++ b/qurator/dinglehopper/tests/test_character_error_rate.py @@ -0,0 +1,37 @@ +from __future__ import division, print_function + +import math +import unicodedata + +from .. 
import character_error_rate + + +def test_character_error_rate(): + assert character_error_rate('a', 'a') == 0 + assert character_error_rate('a', 'b') == 1/1 + assert character_error_rate('Foo', 'Bar') == 3/3 + + assert character_error_rate('Foo', '') == 3/3 + + assert character_error_rate('', '') == 0 + assert math.isinf(character_error_rate('', 'Foo')) + + assert character_error_rate('Foo', 'Food') == 1/3 + assert character_error_rate('Fnord', 'Food') == 2/5 + assert character_error_rate('Müll', 'Mull') == 1/4 + assert character_error_rate('Abstand', 'Sand') == 4/7 + + +def test_character_error_rate_hard(): + s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.') + s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed! + assert character_error_rate(s1, s2) == 1/19 + + s1 = 'Schlyñ' + assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + s2 = 'Schlym̃' + assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + + # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. + assert character_error_rate(s2, s1) == 1/6 + assert character_error_rate(s1, s2) == 1/6 diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py new file mode 100644 index 0000000..fa901a8 --- /dev/null +++ b/qurator/dinglehopper/tests/test_edit_distance.py @@ -0,0 +1,40 @@ +from __future__ import division, print_function + +import unicodedata + +from .. 
import levenshtein, distance + + +def test_levenshtein(): + assert levenshtein('a', 'a') == 0 + assert levenshtein('a', 'b') == 1 + assert levenshtein('Foo', 'Bar') == 3 + + assert levenshtein('', '') == 0 + assert levenshtein('Foo', '') == 3 + assert levenshtein('', 'Foo') == 3 + + assert levenshtein('Foo', 'Food') == 1 + assert levenshtein('Fnord', 'Food') == 2 + assert levenshtein('Müll', 'Mull') == 1 + assert levenshtein('Abstand', 'Sand') == 4 + + +def test_levenshtein_other_sequences(): + assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1 + assert levenshtein(['a', 'ab'], ['a', 'c']) == 1 + + +def test_distance(): + assert distance('Fnord', 'Food') == 2 + assert distance('Müll', 'Mull') == 1 + + word1 = unicodedata.normalize('NFC', 'Schlyñ') + word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed! + assert distance(word1, word2) == 0 + + word1 = 'Schlyñ' + assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points + word2 = 'Schlym̃' + assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points + assert distance(word1, word2) == 1 diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py new file mode 100644 index 0000000..8fafe5d --- /dev/null +++ b/qurator/dinglehopper/tests/test_editops.py @@ -0,0 +1,48 @@ +import unicodedata + +from .. 
import seq_editops, editops + + +def test_trivial(): + assert seq_editops('abc', 'abc') == [] + assert seq_editops('', '') == [] + + +def test_insert(): + assert seq_editops('bc', 'abc') == [('insert', 0, 0)] + assert seq_editops('ac', 'abc') == [('insert', 1, 1)] + assert seq_editops('ab', 'abc') == [('insert', 2, 2)] + assert seq_editops('', 'a') == [('insert', 0, 0)] + + +def test_multiple(): + assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)] + + +def test_delete(): + assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)] + assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)] + assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)] + assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)] + assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)] + assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)] + + +def test_ambiguous(): + assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)] + + +def test_editops(): + """Test editops() in cases where dealing with grapheme clusters matters""" + + # In these cases, one of the words has a composed form, the other one does not. 
+ assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)] + assert editops('oͤde', 'öde') == [('replace', 0, 0)] + + +def test_editops_canonically_equivalent(): + left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE') + right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE') + assert left != right + assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right) + assert editops(left, right) == [] diff --git a/qurator/dinglehopper/tests/test_integ_align.py b/qurator/dinglehopper/tests/test_integ_align.py new file mode 100644 index 0000000..df1e230 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_align.py @@ -0,0 +1,23 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import align, page_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_align_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + # → 4 elements in the alignment should be different. + # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. + + gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + + result = list(align(gt, ocr)) + assert sum(left != right for left, right in result) == 4 diff --git a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py new file mode 100644 index 0000000..c27cd31 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -0,0 +1,35 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. 
import character_error_rate, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_character_error_rate_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. + gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n + + +@pytest.mark.integration +def test_character_error_rate_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert character_error_rate(gt, ocr) == 0 + + +@pytest.mark.integration +def test_character_error_rate_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert character_error_rate(gt, ocr) == 8/591 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py new file mode 100644 index 0000000..5699700 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -0,0 +1,39 @@ +import os +import json + +import pytest +from .util import working_directory + +from ..cli import process + + +def test_cli_json(tmp_path): + """Test that the cli/process() yields a loadable JSON report""" + + # XXX Path.__str__() is necessary for Python 3.5 + with working_directory(str(tmp_path)): + with open('gt.txt', 'w') as gtf: + gtf.write('AAAAA') + with open('ocr.txt', 'w') as ocrf: + ocrf.write('AAAAB') + + process('gt.txt', 'ocr.txt', 'report') + with 
open('report.json', 'r') as jsonf: + j = json.load(jsonf) + assert j['cer'] == pytest.approx(0.2) + + +def test_cli_json_cer_is_infinity(tmp_path): + """Test that the cli/process() yields a loadable JSON report when CER == inf""" + + # XXX Path.__str__() is necessary for Python 3.5 + with working_directory(str(tmp_path)): + with open('gt.txt', 'w') as gtf: + gtf.write('') # Empty to yield CER == inf + with open('ocr.txt', 'w') as ocrf: + ocrf.write('Not important') + + process('gt.txt', 'ocr.txt', 'report') + with open('report.json', 'r') as jsonf: + j = json.load(jsonf) + assert j['cer'] == pytest.approx(float('inf')) diff --git a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py new file mode 100644 index 0000000..2857d56 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -0,0 +1,35 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import distance, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_distance_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. 
+ gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert distance(gt, ocr) == 4 + + +@pytest.mark.integration +def test_distance_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert distance(gt, ocr) == 0 + + +@pytest.mark.integration +def test_distance_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert distance(gt, ocr) == 8 # Manually verified diff --git a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py new file mode 100644 index 0000000..41da748 --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py @@ -0,0 +1,37 @@ +import os +import re +import shutil +import json +from pathlib import Path + +from click.testing import CliRunner +import pytest +from .util import working_directory + + +from ..ocrd_cli import ocrd_dinglehopper + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +def test_ocrd_cli(tmp_path): + """Test OCR-D interface""" + + # XXX Path.str() is necessary for Python 3.5 + + # Copy test workspace + test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162' + test_workspace_dir = tmp_path / 'test_ocrd_cli' + shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir)) + + # Run through the OCR-D interface + with working_directory(str(test_workspace_dir)): + runner = CliRunner() + result = runner.invoke(ocrd_dinglehopper, [ + '-m', 'mets.xml', + '-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI', + '-O', 'OCR-D-OCR-CALAMARI-EVAL' + ]) + 
assert result.exit_code == 0 + result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json')) + assert json.load(open(str(result_json[0])))['cer'] < 0.03 diff --git a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py new file mode 100644 index 0000000..1d2dead --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -0,0 +1,43 @@ +from __future__ import division, print_function + +import os + +import pytest +from lxml import etree as ET + +from .. import word_error_rate, words, page_text, alto_text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +@pytest.mark.integration +def test_word_error_rate_between_page_files(): + # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words + gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml'))) + + gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line + assert len(list(words(gt))) == gt_word_count + + ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml'))) + assert word_error_rate(gt, ocr) == 3/gt_word_count + + +@pytest.mark.integration +def test_word_error_rate_between_page_alto(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml'))) + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml'))) + + assert gt == ocr + assert word_error_rate(gt, ocr) == 0 + + +@pytest.mark.integration +def test_word_error_rate_between_page_alto_2(): + gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml'))) + + gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line + assert len(list(words(gt))) == gt_word_count + + ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 
'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml'))) + + assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/qurator/dinglehopper/tests/test_ocr_files.py b/qurator/dinglehopper/tests/test_ocr_files.py new file mode 100644 index 0000000..dd9377a --- /dev/null +++ b/qurator/dinglehopper/tests/test_ocr_files.py @@ -0,0 +1,110 @@ +import os +import re + +import lxml.etree as ET +import textwrap + +import pytest + +from .. import alto_namespace, alto_text, page_namespace, page_text, text + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data') + + +def test_alto_namespace(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#' + + +def test_alto_text(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + result = alto_text(tree) + expected = textwrap.dedent("""\ + über die vielen Sorgen wegen deſſelben vergaß + Hartkopf, der Frau Amtmännin das ver- + ſprochene zu überliefern.""") + assert result == expected + + +def test_alto_text_ALTO1(): + tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml')) + assert "being erected at the Broadway stock" in alto_text(tree) + + +def test_alto_text_ALTO2(): + tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml')) + assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree) + + +def test_alto_text_ALTO3(): + tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml')) + assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree) + + +def test_page_namespace(): + tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) + assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15' + + +def test_page_test(): + tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml')) + result = page_text(tree) + expected = textwrap.dedent("""\ + ber die vielen 
Sorgen wegen deelben vergaß + Hartkopf, der Frau Amtmnnin das ver⸗ + ſproene zu berliefern. — Ein Erpreer + wurde an ihn abgeſit, um ihn ums Him⸗ + melswien zu ſagen, daß er das Verſproene + glei den Augenbli berbringen mte, die + Frau Amtmnnin htte  auf ihn verlaen, + und nun wßte e nit, was e anfangen + ſote. Den Augenbli ſote er kommen, + ſon vergieng e in ihrer Ang. — Die + Ge wren ſon angekommen, und es fehlte + ihr do no an aem. — + Hartkopf mußte  er bennen, und + endli na langem Nadenken fiel es ihm er + wieder ein. — Er langte den Zettel aus dem + Accisbue heraus, und ſagte ſeiner Frau, daß + e das, was da wre, herbeyſaffen mte. + Jndeß mangelten do einige Generalia, die + alſo wegfielen. — Hartkopf gieng ſelb + mit und berbrate es. —""") + assert result == expected + + +def test_page_with_empty_region(): + # This file contains an empty TextRegion: + # + # + # + # + # + # + # + tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml')) + result = page_text(tree) + assert result + + +def test_page_order(): + # This file contains TextRegions where file order is not the same as reading order. + tree = ET.parse(os.path.join(data_dir, 'order.page.xml')) + result = page_text(tree) + + assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL) + + +def test_page_mixed_regions(): + # This file contains ImageRegions and TextRegions in the ReadingOrder + tree = ET.parse(os.path.join(data_dir, 'mixed-regions.page.xml')) + with pytest.warns(UserWarning, match=r'Not a TextRegion'): + result = page_text(tree) + + assert 'non exaudiam uos. Chriſtiani uero quia orant iuxta' in result + + +def test_text(): + assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml')) + assert "wieder ein. 
— Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml')) + assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt')) diff --git a/qurator/dinglehopper/tests/test_word_error_rate.py b/qurator/dinglehopper/tests/test_word_error_rate.py new file mode 100644 index 0000000..ad19172 --- /dev/null +++ b/qurator/dinglehopper/tests/test_word_error_rate.py @@ -0,0 +1,37 @@ +from __future__ import division, print_function + +import math + +from .. import word_error_rate, words + + +def test_words(): + result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?')) + expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder'] + assert result == expected + + +def test_words_private_use_area(): + result = list(words( + 'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n' + 'ſproene zu berliefern.')) + expected = [ + 'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf', + 'der', 'Frau', 'Amtmnnin', 'das', 'ver', + 'ſproene', 'zu', 'berliefern'] + assert result == expected + + +def test_word_error_rate(): + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 + assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0 + assert word_error_rate('Dies. 
ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0 + + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4 + assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4 + + assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4 + assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!')) + assert word_error_rate('', '') == 0 + + assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6 diff --git a/qurator/dinglehopper/tests/util.py b/qurator/dinglehopper/tests/util.py new file mode 100644 index 0000000..52b7506 --- /dev/null +++ b/qurator/dinglehopper/tests/util.py @@ -0,0 +1,38 @@ +from itertools import zip_longest +from typing import Iterable + +import colorama +import os + + +def diffprint(x, y): + """Print elements or lists x and y, with differences in red""" + + def _diffprint(x, y): + if x != y: + print(colorama.Fore.RED, x, y, colorama.Fore.RESET) + else: + print(x, y) + + if isinstance(x, Iterable): + for xe, ye in zip_longest(x, y): + _diffprint(xe, ye) + else: + _diffprint(x, y) + + +def unzip(l): + return zip(*l) + + +class working_directory: + """Context manager to temporarily change the working directory""" + def __init__(self, wd): + self.wd = wd + + def __enter__(self): + self.old_wd = os.getcwd() + os.chdir(self.wd) + + def __exit__(self, etype, value, traceback): + os.chdir(self.old_wd) diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py new file mode 100644 index 0000000..2425200 --- /dev/null +++ b/qurator/dinglehopper/word_error_rate.py @@ -0,0 +1,63 @@ +from __future__ import division + +import unicodedata + +import uniseg.wordbreak + +from .edit_distance import levenshtein + + +def words(s): + # Patch uniseg.wordbreak.word_break to deal with our private use characters. 
See also + # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt + old_word_break = uniseg.wordbreak.word_break + + def new_word_break(c, index=0): + if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area + return 'ALetter' + else: + return old_word_break(c, index) + uniseg.wordbreak.word_break = new_word_break + + # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar + def unwanted(c): + + # See https://www.fileformat.info/info/unicode/category/index.htm + # and https://unicodebook.readthedocs.io/unicode.html#categories + unwanted_categories = 'O', 'M', 'P', 'Z', 'S' + unwanted_subcategories = 'Cc', 'Cf' + + subcat = unicodedata.category(c) + cat = subcat[0] + return cat in unwanted_categories or subcat in unwanted_subcategories + + # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using + # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." 
+ for word in uniseg.wordbreak.words(s): + if all(unwanted(c) for c in word): + pass + else: + yield word + + +def words_normalized(s): + return words(unicodedata.normalize('NFC', s)) + + +def word_error_rate(reference, compared): + if isinstance(reference, str): + reference_seq = list(words_normalized(reference)) + compared_seq = list(words_normalized(compared)) + else: + reference_seq = list(reference) + compared_seq = list(compared) + + d = levenshtein(reference_seq, compared_seq) + if d == 0: + return 0 + + n = len(reference_seq) + if n == 0: + return float('inf') + + return d / n diff --git a/qurator/sbb_textline_detector/__init__.py b/qurator/sbb_textline_detector/__init__.py deleted file mode 100644 index b7c0712..0000000 --- a/qurator/sbb_textline_detector/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .main import * -from .ocrd_cli import * diff --git a/qurator/sbb_textline_detector/main.py b/qurator/sbb_textline_detector/main.py deleted file mode 100644 index 7f1a425..0000000 --- a/qurator/sbb_textline_detector/main.py +++ /dev/null @@ -1,1484 +0,0 @@ -#! 
/usr/bin/env python3 - -__version__ = '1.0' - -import os -import sys -import cv2 -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -from sys import getsizeof -import random -from tqdm import tqdm -from keras.models import model_from_json -from keras.models import load_model -import math -from shapely import geometry -from sklearn.cluster import KMeans -import gc -from keras import backend as K -import tensorflow as tf -from scipy.signal import find_peaks -from scipy.ndimage import gaussian_filter1d -import xml.etree.ElementTree as ET -import warnings -import click -import time -from multiprocessing import Process, Queue, cpu_count -import datetime - - -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - -__doc__ = \ - """ - tool to extract text lines from document images - """ - - -class textline_detector: - def __init__(self, image_dir, dir_out, f_name, dir_models): - self.image_dir = image_dir # XXX This does not seem to be a directory as the name suggests, but a file - self.dir_out = dir_out - self.f_name = f_name - if self.f_name is None: - try: - self.f_name = image_dir.split('/')[len(image_dir.split('/')) - 1] - self.f_name = self.f_name.split('.')[0] - except: - self.f_name = self.f_name.split('.')[0] - self.dir_models = dir_models - self.kernel = np.ones((5, 5), np.uint8) - self.model_page_dir = dir_models + '/model_page_new.h5' - self.model_region_dir = dir_models + '/model_strukturerkennung.h5' - self.model_textline_dir = dir_models + '/model_textline.h5' - - def find_polygons_size_filter(self, contours, median_area, scaler_up=1.2, scaler_down=0.8): - found_polygons_early = list() - - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - # Check that polygon has area greater than minimal area - if area >= median_area * scaler_down and area <= median_area * scaler_up: - found_polygons_early.append( 
- np.array([point for point in polygon.exterior.coords], dtype=np.uint)) - return found_polygons_early - - def filter_contours_area_of_image(self, image, contours, hierarchy, max_area, min_area): - found_polygons_early = list() - - jv = 0 - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod( - image.shape[:2]) and hierarchy[0][jv][3] == -1 : # and hierarchy[0][jv][3]==-1 : - found_polygons_early.append( - np.array([ [point] for point in polygon.exterior.coords], dtype=np.uint)) - jv += 1 - return found_polygons_early - - def filter_contours_area_of_image_interiors(self, image, contours, hierarchy, max_area, min_area): - found_polygons_early = list() - - jv = 0 - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and \ - hierarchy[0][jv][3] != -1: - # print(c[0][0][1]) - found_polygons_early.append( - np.array([point for point in polygon.exterior.coords], dtype=np.uint)) - jv += 1 - return found_polygons_early - - def resize_image(self, img_in, input_height, input_width): - return cv2.resize(img_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) - - def resize_ann(self, seg_in, input_height, input_width): - return cv2.resize(seg_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) - - def get_one_hot(self, seg, input_height, input_width, n_classes): - seg = seg[:, :, 0] - seg_f = np.zeros((input_height, input_width, n_classes)) - for j in range(n_classes): - seg_f[:, :, j] = (seg == j).astype(int) - return seg_f - - - def color_images(self, seg, n_classes): - ann_u = range(n_classes) - if len(np.shape(seg)) == 3: - seg = seg[:, :, 
0] - - seg_img = np.zeros((np.shape(seg)[0], np.shape(seg)[1], 3)).astype(np.uint8) - colors = sns.color_palette("hls", n_classes) - - for c in ann_u: - c = int(c) - segl = (seg == c) - seg_img[:, :, 0] = segl * c - seg_img[:, :, 1] = segl * c - seg_img[:, :, 2] = segl * c - return seg_img - - def color_images_diva(self, seg, n_classes): - ann_u = range(n_classes) - if len(np.shape(seg)) == 3: - seg = seg[:, :, 0] - - seg_img = np.zeros((np.shape(seg)[0], np.shape(seg)[1], 3)).astype(float) - # colors=sns.color_palette("hls", n_classes) - colors = [[1, 0, 0], [8, 0, 0], [2, 0, 0], [4, 0, 0]] - - for c in ann_u: - c = int(c) - segl = (seg == c) - seg_img[:, :, 0][seg == c] = colors[c][0] # segl*(colors[c][0]) - seg_img[:, :, 1][seg == c] = colors[c][1] # seg_img[:,:,1]=segl*(colors[c][1]) - seg_img[:, :, 2][seg == c] = colors[c][2] # seg_img[:,:,2]=segl*(colors[c][2]) - return seg_img - - def rotate_image(self, img_patch, slope): - (h, w) = img_patch.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, slope, 1.0) - return cv2.warpAffine(img_patch, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE) - - def cleaning_probs(self, probs: np.ndarray, sigma: float) -> np.ndarray: - # Smooth - if sigma > 0.: - return cv2.GaussianBlur(probs, (int(3 * sigma) * 2 + 1, int(3 * sigma) * 2 + 1), sigma) - elif sigma == 0.: - return cv2.fastNlMeansDenoising((probs * 255).astype(np.uint8), h=20) / 255 - else: # Negative sigma, do not do anything - return probs - - def crop_image_inside_box(self, box, img_org_copy): - image_box = img_org_copy[box[1]:box[1] + box[3], box[0]:box[0] + box[2]] - return image_box, [box[1], box[1] + box[3], box[0], box[0] + box[2]] - - def otsu_copy(self, img): - img_r = np.zeros(img.shape) - img1 = img[:, :, 0] - img2 = img[:, :, 1] - img3 = img[:, :, 2] - # print(img.min()) - # print(img[:,:,0].min()) - # blur = cv2.GaussianBlur(img,(5,5)) - # ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - 
retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval2, threshold2 = cv2.threshold(img2, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval3, threshold3 = cv2.threshold(img3, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - img_r[:, :, 0] = threshold1 - img_r[:, :, 1] = threshold1 - img_r[:, :, 2] = threshold1 - return img_r - - def get_image_and_scales(self): - self.image = cv2.imread(self.image_dir) - self.height_org = self.image.shape[0] - self.width_org = self.image.shape[1] - - if self.image.shape[0] < 1000: - self.img_hight_int = 2800 - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - - elif self.image.shape[0] < 2000 and self.image.shape[0] >= 1000: - self.img_hight_int = int(self.image.shape[0]*1.1) - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - - elif self.image.shape[0] < 3300 and self.image.shape[0] >= 2000: - self.img_hight_int = int(self.image.shape[0]*1.1) - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - - elif self.image.shape[0] < 4000 and self.image.shape[0] >= 3300 and self.image.shape[1]<2400 : - self.img_hight_int = int(self.image.shape[0]*1.1)# 6500 - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - - elif self.image.shape[0] < 4000 and self.image.shape[0] >= 3300 and self.image.shape[1]>=2400 : - self.img_hight_int = 6500 - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - - elif self.image.shape[0] < 5400 and self.image.shape[0] > 4000 and self.image.shape[1]>3300 : - self.img_hight_int = int(self.image.shape[0]*1.6)# 6500 - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - elif self.image.shape[0] < 11000 and self.image.shape[0] >= 7000 : - self.img_hight_int = int(self.image.shape[0]*1.6)# 6500 - 
self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - else: - self.img_hight_int = int(self.image.shape[0]*1.1)# 6500 - self.img_width_int = int(self.img_hight_int * self.image.shape[1] / float(self.image.shape[0])) - #self.img_hight_int = self.image.shape[0] - #self.img_width_int = self.image.shape[1] - - self.scale_y = self.img_hight_int / float(self.image.shape[0]) - self.scale_x = self.img_width_int / float(self.image.shape[1]) - - self.image = self.resize_image(self.image, self.img_hight_int, self.img_width_int) - - def start_new_session_and_model(self, model_dir): - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - - session = tf.InteractiveSession() - model = load_model(model_dir, compile=False) - - return model, session - - def do_prediction(self,patches,img,model): - - img_height_model = model.layers[len(model.layers) - 1].output_shape[1] - img_width_model = model.layers[len(model.layers) - 1].output_shape[2] - n_classes = model.layers[len(model.layers) - 1].output_shape[3] - - if patches: - - margin = int(0.1 * img_width_model) - - width_mid = img_width_model - 2 * margin - height_mid = img_height_model - 2 * margin - - - img = img / float(255.0) - - img_h = img.shape[0] - img_w = img.shape[1] - - prediction_true = np.zeros((img_h, img_w, 3)) - mask_true = np.zeros((img_h, img_w)) - nxf = img_w / float(width_mid) - nyf = img_h / float(height_mid) - - if nxf > int(nxf): - nxf = int(nxf) + 1 - else: - nxf = int(nxf) - - if nyf > int(nyf): - nyf = int(nyf) + 1 - else: - nyf = int(nyf) - - for i in range(nxf): - for j in range(nyf): - - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - elif i > 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - - if j == 0: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - elif j > 0: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - - if index_x_u > img_w: - 
index_x_u = img_w - index_x_d = img_w - img_width_model - if index_y_u > img_h: - index_y_u = img_h - index_y_d = img_h - img_height_model - - - - img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - - label_p_pred = model.predict( - img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2])) - - seg = np.argmax(label_p_pred, axis=3)[0] - - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) - - if i==0 and j==0: - seg_color = seg_color[0:seg_color.shape[0] - margin, 0:seg_color.shape[1] - margin, :] - seg = seg[0:seg.shape[0] - margin, 0:seg.shape[1] - margin] - - mask_true[index_y_d + 0:index_y_u - margin, index_x_d + 0:index_x_u - margin] = seg - prediction_true[index_y_d + 0:index_y_u - margin, index_x_d + 0:index_x_u - margin, - :] = seg_color - - elif i==nxf-1 and j==nyf-1: - seg_color = seg_color[margin:seg_color.shape[0] - 0, margin:seg_color.shape[1] - 0, :] - seg = seg[margin:seg.shape[0] - 0, margin:seg.shape[1] - 0] - - mask_true[index_y_d + margin:index_y_u - 0, index_x_d + margin:index_x_u - 0] = seg - prediction_true[index_y_d + margin:index_y_u - 0, index_x_d + margin:index_x_u - 0, - :] = seg_color - - elif i==0 and j==nyf-1: - seg_color = seg_color[margin:seg_color.shape[0] - 0, 0:seg_color.shape[1] - margin, :] - seg = seg[margin:seg.shape[0] - 0, 0:seg.shape[1] - margin] - - mask_true[index_y_d + margin:index_y_u - 0, index_x_d + 0:index_x_u - margin] = seg - prediction_true[index_y_d + margin:index_y_u - 0, index_x_d + 0:index_x_u - margin, - :] = seg_color - - elif i==nxf-1 and j==0: - seg_color = seg_color[0:seg_color.shape[0] - margin, margin:seg_color.shape[1] - 0, :] - seg = seg[0:seg.shape[0] - margin, margin:seg.shape[1] - 0] - - mask_true[index_y_d + 0:index_y_u - margin, index_x_d + margin:index_x_u - 0] = seg - prediction_true[index_y_d + 0:index_y_u - margin, index_x_d + margin:index_x_u - 0, - :] = seg_color - - elif i==0 and j!=0 and j!=nyf-1: - seg_color = seg_color[margin:seg_color.shape[0] - 
margin, 0:seg_color.shape[1] - margin, :] - seg = seg[margin:seg.shape[0] - margin, 0:seg.shape[1] - margin] - - mask_true[index_y_d + margin:index_y_u - margin, index_x_d + 0:index_x_u - margin] = seg - prediction_true[index_y_d + margin:index_y_u - margin, index_x_d + 0:index_x_u - margin, - :] = seg_color - - elif i==nxf-1 and j!=0 and j!=nyf-1: - seg_color = seg_color[margin:seg_color.shape[0] - margin, margin:seg_color.shape[1] - 0, :] - seg = seg[margin:seg.shape[0] - margin, margin:seg.shape[1] - 0] - - mask_true[index_y_d + margin:index_y_u - margin, index_x_d + margin:index_x_u - 0] = seg - prediction_true[index_y_d + margin:index_y_u - margin, index_x_d + margin:index_x_u - 0, - :] = seg_color - - elif i!=0 and i!=nxf-1 and j==0: - seg_color = seg_color[0:seg_color.shape[0] - margin, margin:seg_color.shape[1] - margin, :] - seg = seg[0:seg.shape[0] - margin, margin:seg.shape[1] - margin] - - mask_true[index_y_d + 0:index_y_u - margin, index_x_d + margin:index_x_u - margin] = seg - prediction_true[index_y_d + 0:index_y_u - margin, index_x_d + margin:index_x_u - margin, - :] = seg_color - - elif i!=0 and i!=nxf-1 and j==nyf-1: - seg_color = seg_color[margin:seg_color.shape[0] - 0, margin:seg_color.shape[1] - margin, :] - seg = seg[margin:seg.shape[0] - 0, margin:seg.shape[1] - margin] - - mask_true[index_y_d + margin:index_y_u - 0, index_x_d + margin:index_x_u - margin] = seg - prediction_true[index_y_d + margin:index_y_u - 0, index_x_d + margin:index_x_u - margin, - :] = seg_color - - else: - seg_color = seg_color[margin:seg_color.shape[0] - margin, margin:seg_color.shape[1] - margin, :] - seg = seg[margin:seg.shape[0] - margin, margin:seg.shape[1] - margin] - - mask_true[index_y_d + margin:index_y_u - margin, index_x_d + margin:index_x_u - margin] = seg - prediction_true[index_y_d + margin:index_y_u - margin, index_x_d + margin:index_x_u - margin, - :] = seg_color - - prediction_true = prediction_true.astype(np.uint8) - - if not patches: - - img = img 
/float( 255.0) - img = self.resize_image(img, img_height_model, img_width_model) - - label_p_pred = model.predict( - img.reshape(1, img.shape[0], img.shape[1], img.shape[2])) - - seg = np.argmax(label_p_pred, axis=3)[0] - seg_color =np.repeat(seg[:, :, np.newaxis], 3, axis=2) - prediction_true = self.resize_image(seg_color, self.image.shape[0], self.image.shape[1]) - prediction_true = prediction_true.astype(np.uint8) - return prediction_true - - - - def extract_page(self): - patches=False - model_page, session_page = self.start_new_session_and_model(self.model_page_dir) - img = self.otsu_copy(self.image) - #for ii in range(1): - # img = cv2.GaussianBlur(img, (15, 15), 0) - - - img_page_prediction=self.do_prediction(patches,img,model_page) - - imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - thresh = cv2.dilate(thresh, self.kernel, iterations=6) - contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - - cnt = contours[np.argmax(cnt_size)] - - x, y, w, h = cv2.boundingRect(cnt) - - try: - box = [x, y, w, h] - - croped_page, page_coord = self.crop_image_inside_box(box, self.image) - - - self.cont_page=[] - self.cont_page.append( np.array( [ [ page_coord[2] , page_coord[0] ] , - [ page_coord[3] , page_coord[0] ] , - [ page_coord[3] , page_coord[1] ] , - [ page_coord[2] , page_coord[1] ]] ) ) - except: - box = [0, 0, self.image.shape[1]-1, self.image.shape[0]-1] - croped_page, page_coord = self.crop_image_inside_box(box, self.image) - - - self.cont_page=[] - self.cont_page.append( np.array( [ [ page_coord[2] , page_coord[0] ] , - [ page_coord[3] , page_coord[0] ] , - [ page_coord[3] , page_coord[1] ] , - [ page_coord[2] , page_coord[1] ]] ) ) - - session_page.close() - del model_page - del session_page - del self.image - del contours - del thresh - del img - - gc.collect() - return croped_page, 
page_coord - - def extract_text_regions(self, img): - - patches=True - model_region, session_region = self.start_new_session_and_model(self.model_region_dir) - img = self.otsu_copy(img) - img = img.astype(np.uint8) - - - prediction_regions=self.do_prediction(patches,img,model_region) - - - session_region.close() - del model_region - del session_region - gc.collect() - return prediction_regions - - def get_text_region_contours_and_boxes(self, image): - rgb_class_of_texts = (1, 1, 1) - mask_texts = np.all(image == rgb_class_of_texts, axis=-1) - - image = np.repeat(mask_texts[:, :, np.newaxis], 3, axis=2) * 255 - image = image.astype(np.uint8) - - image = cv2.morphologyEx(image, cv2.MORPH_OPEN, self.kernel) - image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, self.kernel) - - - imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, hierarchy = cv2.findContours(thresh.copy(), cv2.cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - main_contours = self.filter_contours_area_of_image(thresh, contours, hierarchy, max_area=1, min_area=0.00001) - self.boxes = [] - - for jj in range(len(main_contours)): - x, y, w, h = cv2.boundingRect(main_contours[jj]) - self.boxes.append([x, y, w, h]) - - - return main_contours - - def get_all_image_patches_coordination(self, image_page): - self.all_box_coord=[] - for jk in range(len(self.boxes)): - _,crop_coor=self.crop_image_inside_box(self.boxes[jk],image_page) - self.all_box_coord.append(crop_coor) - - - def textline_contours(self, img): - patches=True - model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir) - img = self.otsu_copy(img) - img = img.astype(np.uint8) - - prediction_textline=self.do_prediction(patches,img,model_textline) - - session_textline.close() - - del model_textline - del session_textline - gc.collect() - return prediction_textline[:,:,0] - - def get_textlines_for_each_textregions(self, textline_mask_tot, boxes): - textline_mask_tot = 
cv2.erode(textline_mask_tot, self.kernel, iterations=1) - self.area_of_cropped = [] - self.all_text_region_raw = [] - for jk in range(len(boxes)): - crop_img, crop_coor = self.crop_image_inside_box(boxes[jk], - np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) - crop_img=crop_img.astype(np.uint8) - self.all_text_region_raw.append(crop_img[:, :, 0]) - self.area_of_cropped.append(crop_img.shape[0] * crop_img.shape[1]) - - def seperate_lines(self, img_patch, contour_text_interest, thetha): - (h, w) = img_patch.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, -thetha, 1.0) - x_d = M[0, 2] - y_d = M[1, 2] - - thetha = thetha / 180. * np.pi - rotation_matrix = np.array([[np.cos(thetha), -np.sin(thetha)], [np.sin(thetha), np.cos(thetha)]]) - contour_text_interest_copy = contour_text_interest.copy() - - x_cont = contour_text_interest[:, 0, 0] - y_cont = contour_text_interest[:, 0, 1] - x_cont = x_cont - np.min(x_cont) - y_cont = y_cont - np.min(y_cont) - - x_min_cont = 0 - x_max_cont = img_patch.shape[1] - y_min_cont = 0 - y_max_cont = img_patch.shape[0] - - xv = np.linspace(x_min_cont, x_max_cont, 1000) - - textline_patch_sum_along_width = img_patch.sum(axis=1) - - first_nonzero = 0 # (next((i for i, x in enumerate(mada_n) if x), None)) - - y = textline_patch_sum_along_width[:] # [first_nonzero:last_nonzero] - y_padded = np.zeros(len(y) + 40) - y_padded[20:len(y) + 20] = y - x = np.array(range(len(y))) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - if len(peaks_real)<=2 and len(peaks_real)>1: - sigma_gaus=10 - else: - sigma_gaus=8 - - - y_padded_smoothed= gaussian_filter1d(y_padded, sigma_gaus) - y_padded_up_to_down=-y_padded+np.max(y_padded) - y_padded_up_to_down_padded=np.zeros(len(y_padded_up_to_down)+40) - y_padded_up_to_down_padded[20:len(y_padded_up_to_down)+20]=y_padded_up_to_down - y_padded_up_to_down_padded= gaussian_filter1d(y_padded_up_to_down_padded, sigma_gaus) - - - peaks, _ = 
find_peaks(y_padded_smoothed, height=0) - peaks_neg, _ = find_peaks(y_padded_up_to_down_padded, height=0) - - mean_value_of_peaks=np.mean(y_padded_smoothed[peaks]) - std_value_of_peaks=np.std(y_padded_smoothed[peaks]) - peaks_values=y_padded_smoothed[peaks] - - - peaks_neg = peaks_neg - 20 - 20 - peaks = peaks - 20 - - for jj in range(len(peaks_neg)): - if peaks_neg[jj] > len(x) - 1: - peaks_neg[jj] = len(x) - 1 - - for jj in range(len(peaks)): - if peaks[jj] > len(x) - 1: - peaks[jj] = len(x) - 1 - - textline_boxes = [] - textline_boxes_rot = [] - - if len(peaks_neg) == len(peaks) + 1 and len(peaks) >= 3: - #print('11') - for jj in range(len(peaks)): - - if jj==(len(peaks)-1): - dis_to_next_up = abs(peaks[jj] - peaks_neg[jj]) - dis_to_next_down = abs(peaks[jj] - peaks_neg[jj + 1]) - - if peaks_values[jj]>mean_value_of_peaks-std_value_of_peaks/2.: - point_up = peaks[jj] + first_nonzero - int(1.3 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down =y_max_cont-1##peaks[jj] + first_nonzero + int(1.3 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) - else: - point_up = peaks[jj] + first_nonzero - int(1.4 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down =y_max_cont-1##peaks[jj] + first_nonzero + int(1.6 * dis_to_next_down) #point_up# np.max(y_cont)#peaks[jj] + first_nonzero + int(1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) - - point_down_narrow = peaks[jj] + first_nonzero + int( - 1.4 * dis_to_next_down) ###-int(dis_to_next_down*1./2) - else: - dis_to_next_up = abs(peaks[jj] - peaks_neg[jj]) - dis_to_next_down = abs(peaks[jj] - peaks_neg[jj + 1]) - - if peaks_values[jj]>mean_value_of_peaks-std_value_of_peaks/2.: - point_up = peaks[jj] + first_nonzero - int(1.1 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.1 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) - else: - point_up = peaks[jj] + 
first_nonzero - int(1.23 * dis_to_next_up) ##+int(dis_to_next_up*1./4.0) - point_down = peaks[jj] + first_nonzero + int(1.33 * dis_to_next_down) ###-int(dis_to_next_down*1./4.0) - - point_down_narrow = peaks[jj] + first_nonzero + int( - 1.1 * dis_to_next_down) ###-int(dis_to_next_down*1./2) - - - - if point_down_narrow >= img_patch.shape[0]: - point_down_narrow = img_patch.shape[0] - 2 - - distances = [cv2.pointPolygonTest(contour_text_interest_copy, (xv[mj], peaks[jj] + first_nonzero), True) - for mj in range(len(xv))] - distances = np.array(distances) - - xvinside = xv[distances >= 0] - - if len(xvinside) == 0: - x_min = x_min_cont - x_max = x_max_cont - else: - x_min = np.min(xvinside) # max(x_min_interest,x_min_cont) - x_max = np.max(xvinside) # min(x_max_interest,x_max_cont) - - p1 = np.dot(rotation_matrix, [int(x_min), int(point_up)]) - p2 = np.dot(rotation_matrix, [int(x_max), int(point_up)]) - p3 = np.dot(rotation_matrix, [int(x_max), int(point_down)]) - p4 = np.dot(rotation_matrix, [int(x_min), int(point_down)]) - - x_min_rot1, point_up_rot1 = p1[0] + x_d, p1[1] + y_d - x_max_rot2, point_up_rot2 = p2[0] + x_d, p2[1] + y_d - x_max_rot3, point_down_rot3 = p3[0] + x_d, p3[1] + y_d - x_min_rot4, point_down_rot4 = p4[0] + x_d, p4[1] + y_d - - if x_min_rot1<0: - x_min_rot1=0 - if x_min_rot4<0: - x_min_rot4=0 - if point_up_rot1<0: - point_up_rot1=0 - if point_up_rot2<0: - point_up_rot2=0 - - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) - - elif len(peaks) < 1: - pass - - elif len(peaks) == 1: - x_min = x_min_cont - x_max = x_max_cont - - y_min = y_min_cont - y_max = y_max_cont - - p1 = np.dot(rotation_matrix, [int(x_min), int(y_min)]) - p2 = 
np.dot(rotation_matrix, [int(x_max), int(y_min)]) - p3 = np.dot(rotation_matrix, [int(x_max), int(y_max)]) - p4 = np.dot(rotation_matrix, [int(x_min), int(y_max)]) - - x_min_rot1, point_up_rot1 = p1[0] + x_d, p1[1] + y_d - x_max_rot2, point_up_rot2 = p2[0] + x_d, p2[1] + y_d - x_max_rot3, point_down_rot3 = p3[0] + x_d, p3[1] + y_d - x_min_rot4, point_down_rot4 = p4[0] + x_d, p4[1] + y_d - - - if x_min_rot1<0: - x_min_rot1=0 - if x_min_rot4<0: - x_min_rot4=0 - if point_up_rot1<0: - point_up_rot1=0 - if point_up_rot2<0: - point_up_rot2=0 - - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - - textline_boxes.append(np.array([[int(x_min), int(y_min)], - [int(x_max), int(y_min)], - [int(x_max), int(y_max)], - [int(x_min), int(y_max)]])) - - - - elif len(peaks) == 2: - dis_to_next = np.abs(peaks[1] - peaks[0]) - for jj in range(len(peaks)): - if jj == 0: - point_up = 0#peaks[jj] + first_nonzero - int(1. / 1.7 * dis_to_next) - if point_up < 0: - point_up = 1 - point_down = peaks[jj] + first_nonzero + int(1. / 1.8 * dis_to_next) - elif jj == 1: - point_down = peaks[jj] + first_nonzero + int(1. / 1.8 * dis_to_next) - if point_down >= img_patch.shape[0]: - point_down = img_patch.shape[0] - 2 - point_up = peaks[jj] + first_nonzero - int(1. 
/ 1.8 * dis_to_next) - - distances = [cv2.pointPolygonTest(contour_text_interest_copy, (xv[mj], peaks[jj] + first_nonzero), True) - for mj in range(len(xv))] - distances = np.array(distances) - - xvinside = xv[distances >= 0] - - if len(xvinside) == 0: - x_min = x_min_cont - x_max = x_max_cont - else: - x_min = np.min(xvinside) - x_max = np.max(xvinside) - - p1 = np.dot(rotation_matrix, [int(x_min), int(point_up)]) - p2 = np.dot(rotation_matrix, [int(x_max), int(point_up)]) - p3 = np.dot(rotation_matrix, [int(x_max), int(point_down)]) - p4 = np.dot(rotation_matrix, [int(x_min), int(point_down)]) - - x_min_rot1, point_up_rot1 = p1[0] + x_d, p1[1] + y_d - x_max_rot2, point_up_rot2 = p2[0] + x_d, p2[1] + y_d - x_max_rot3, point_down_rot3 = p3[0] + x_d, p3[1] + y_d - x_min_rot4, point_down_rot4 = p4[0] + x_d, p4[1] + y_d - - if x_min_rot1<0: - x_min_rot1=0 - if x_min_rot4<0: - x_min_rot4=0 - if point_up_rot1<0: - point_up_rot1=0 - if point_up_rot2<0: - point_up_rot2=0 - - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) - else: - for jj in range(len(peaks)): - - if jj == 0: - dis_to_next = peaks[jj + 1] - peaks[jj] - # point_up=peaks[jj]+first_nonzero-int(1./3*dis_to_next) - point_up = peaks[jj] + first_nonzero - int(1. / 1.9 * dis_to_next) - if point_up < 0: - point_up = 1 - # point_down=peaks[jj]+first_nonzero+int(1./3*dis_to_next) - point_down = peaks[jj] + first_nonzero + int(1. / 1.9 * dis_to_next) - elif jj == len(peaks) - 1: - dis_to_next = peaks[jj] - peaks[jj - 1] - # point_down=peaks[jj]+first_nonzero+int(1./3*dis_to_next) - point_down = peaks[jj] + first_nonzero + int(1. 
/ 1.7 * dis_to_next) - if point_down >= img_patch.shape[0]: - point_down = img_patch.shape[0] - 2 - # point_up=peaks[jj]+first_nonzero-int(1./3*dis_to_next) - point_up = peaks[jj] + first_nonzero - int(1. / 1.9 * dis_to_next) - else: - dis_to_next_down = peaks[jj + 1] - peaks[jj] - dis_to_next_up = peaks[jj] - peaks[jj - 1] - - point_up = peaks[jj] + first_nonzero - int(1. / 1.9 * dis_to_next_up) - point_down = peaks[jj] + first_nonzero + int(1. / 1.9 * dis_to_next_down) - - distances = [cv2.pointPolygonTest(contour_text_interest_copy, (xv[mj], peaks[jj] + first_nonzero), True) - for mj in range(len(xv))] - distances = np.array(distances) - - xvinside = xv[distances >= 0] - - if len(xvinside) == 0: - x_min = x_min_cont - x_max = x_max_cont - else: - x_min = np.min(xvinside) # max(x_min_interest,x_min_cont) - x_max = np.max(xvinside) # min(x_max_interest,x_max_cont) - - p1 = np.dot(rotation_matrix, [int(x_min), int(point_up)]) - p2 = np.dot(rotation_matrix, [int(x_max), int(point_up)]) - p3 = np.dot(rotation_matrix, [int(x_max), int(point_down)]) - p4 = np.dot(rotation_matrix, [int(x_min), int(point_down)]) - - x_min_rot1, point_up_rot1 = p1[0] + x_d, p1[1] + y_d - x_max_rot2, point_up_rot2 = p2[0] + x_d, p2[1] + y_d - x_max_rot3, point_down_rot3 = p3[0] + x_d, p3[1] + y_d - x_min_rot4, point_down_rot4 = p4[0] + x_d, p4[1] + y_d - - - if x_min_rot1<0: - x_min_rot1=0 - if x_min_rot4<0: - x_min_rot4=0 - if point_up_rot1<0: - point_up_rot1=0 - if point_up_rot2<0: - point_up_rot2=0 - - - - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) - - - return peaks, textline_boxes_rot - - def return_rotated_contours(self,slope,img_patch): - dst = 
self.rotate_image(img_patch, slope) - dst = dst.astype(np.uint8) - dst = dst[:, :, 0] - dst[dst != 0] = 1 - - imgray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) - thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel) - contours, _ = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - return contours - - def textline_contours_postprocessing(self, textline_mask, slope, contour_text_interest, box_ind): - - - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 - textline_mask = textline_mask.astype(np.uint8) - kernel = np.ones((5, 5), np.uint8) - textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_OPEN, kernel) - textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, kernel) - textline_mask = cv2.erode(textline_mask, kernel, iterations=2) - - try: - - dst = self.rotate_image(textline_mask, slope) - dst = dst[:, :, 0] - dst[dst != 0] = 1 - - contour_text_copy = contour_text_interest.copy() - - contour_text_copy[:, 0, 0] = contour_text_copy[:, 0, 0] - box_ind[ - 0] - contour_text_copy[:, 0, 1] = contour_text_copy[:, 0, 1] - box_ind[1] - - img_contour = np.zeros((box_ind[3], box_ind[2], 3)) - img_contour = cv2.fillPoly(img_contour, pts=[contour_text_copy], color=(255, 255, 255)) - - - - img_contour_rot = self.rotate_image(img_contour, slope) - - img_contour_rot = img_contour_rot.astype(np.uint8) - imgrayrot = cv2.cvtColor(img_contour_rot, cv2.COLOR_BGR2GRAY) - _, threshrot = cv2.threshold(imgrayrot, 0, 255, 0) - contours_text_rot, _ = cv2.findContours(threshrot.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - len_con_text_rot = [len(contours_text_rot[ib]) for ib in range(len(contours_text_rot))] - ind_big_con = np.argmax(len_con_text_rot) - - - - _, contours_rotated_clean = self.seperate_lines(dst, contours_text_rot[ind_big_con], slope) - - - except: - - contours_rotated_clean = [] - - return 
contours_rotated_clean - - - def return_contours_of_image(self,image_box_tabels_1): - - image_box_tabels=np.repeat(image_box_tabels_1[:, :, np.newaxis], 3, axis=2) - image_box_tabels=image_box_tabels.astype(np.uint8) - imgray = cv2.cvtColor(image_box_tabels, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - return contours,hierarchy - - def find_contours_mean_y_diff(self,contours_main): - M_main=[cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cy_main=[(M_main[j]['m01']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - return np.mean( np.diff( np.sort( np.array(cy_main) ) ) ) - - - def isNaN(self,num): - return num != num - - def get_standard_deviation_of_summed_textline_patch_along_width(self,img_patch,sigma_,multiplier=3.8 ): - img_patch_sum_along_width=img_patch[:,:].sum(axis=1) - - img_patch_sum_along_width_updown=img_patch_sum_along_width[len(img_patch_sum_along_width)::-1] - - first_nonzero=(next((i for i, x in enumerate(img_patch_sum_along_width) if x), 0)) - last_nonzero=(next((i for i, x in enumerate(img_patch_sum_along_width_updown) if x), 0)) - - last_nonzero=len(img_patch_sum_along_width)-last_nonzero - - - y=img_patch_sum_along_width#[first_nonzero:last_nonzero] - - y_help=np.zeros(len(y)+20) - - y_help[10:len(y)+10]=y - - x=np.array( range(len(y)) ) - - - - - zneg_rev=-y_help+np.max(y_help) - - zneg=np.zeros(len(zneg_rev)+20) - - zneg[10:len(zneg_rev)+10]=zneg_rev - - z=gaussian_filter1d(y, sigma_) - zneg= gaussian_filter1d(zneg, sigma_) - - - peaks_neg, _ = find_peaks(zneg, height=0) - peaks, _ = find_peaks(z, height=0) - - peaks_neg=peaks_neg-10-10 - - interest_pos=z[peaks] - - interest_pos=interest_pos[interest_pos>10] - - interest_neg=z[peaks_neg] - - min_peaks_pos=np.mean(interest_pos) - min_peaks_neg=0#np.min(interest_neg) - - dis_talaei=(min_peaks_pos-min_peaks_neg)/multiplier - #print(interest_pos) - 
grenze=min_peaks_pos-dis_talaei#np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 - - interest_neg_fin=interest_neg[(interest_neg12.5 and slope_corresponding_textregion!=999: - slope_corresponding_textregion=0 - elif slope_corresponding_textregion==999: - slope_corresponding_textregion=0 - slopes_per_each_subprocess.append(slope_corresponding_textregion) - - bounding_rectangle_of_textlines = self.textline_contours_postprocessing(crop_img - , slope_corresponding_textregion, - contours_per_process[mv], boxes_per_process[mv]) - - textlines_rectangles_per_each_subprocess.append(bounding_rectangle_of_textlines) - bounding_box_of_textregion_per_each_subprocess.append(boxes_per_process[mv] ) - - - - queue_of_slopes_per_textregion.put(slopes_per_each_subprocess) - queue_of_textlines_rectangle_per_textregion.put(textlines_rectangles_per_each_subprocess) - queue_of_textregion_box.put(bounding_box_of_textregion_per_each_subprocess ) - queue_of_quntours_of_textregion.put(contours_textregion_per_each_subprocess) - - def get_slopes_and_deskew(self, contours,textline_mask_tot): - num_cores = cpu_count() - - queue_of_slopes_per_textregion = Queue() - queue_of_textlines_rectangle_per_textregion=Queue() - queue_of_textregion_box=Queue() - queue_of_quntours_of_textregion=Queue() - - processes = [] - nh=np.linspace(0, len(self.boxes), num_cores+1) - - - for i in range(num_cores): - boxes_per_process=self.boxes[int(nh[i]):int(nh[i+1])] - contours_per_process=contours[int(nh[i]):int(nh[i+1])] - processes.append(Process(target=self.do_work_of_slopes, args=(queue_of_slopes_per_textregion,queue_of_textlines_rectangle_per_textregion, - queue_of_textregion_box, boxes_per_process, queue_of_quntours_of_textregion, textline_mask_tot, contours_per_process))) - - for i in range(num_cores): - processes[i].start() - - self.slopes = [] - self.all_found_texline_polygons=[] - all_found_text_regions=[] - self.boxes=[] - - for i in 
range(num_cores): - slopes_for_sub_process=queue_of_slopes_per_textregion.get(True) - boxes_for_sub_process=queue_of_textregion_box.get(True) - polys_for_sub_process=queue_of_textlines_rectangle_per_textregion.get(True) - contours_for_subprocess=queue_of_quntours_of_textregion.get(True) - - for j in range(len(slopes_for_sub_process)): - self.slopes.append(slopes_for_sub_process[j]) - self.all_found_texline_polygons.append(polys_for_sub_process[j]) - self.boxes.append(boxes_for_sub_process[j]) - all_found_text_regions.append(contours_for_subprocess[j]) - - for i in range(num_cores): - processes[i].join() - - return all_found_text_regions - - - def order_of_regions(self, textline_mask,contours_main): - textline_sum_along_width=textline_mask.sum(axis=1) - - y=textline_sum_along_width[:] - y_padded=np.zeros(len(y)+40) - y_padded[20:len(y)+20]=y - x=np.array( range(len(y)) ) - - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - - - sigma_gaus=8 - - z= gaussian_filter1d(y_padded, sigma_gaus) - zneg_rev=-y_padded+np.max(y_padded) - - zneg=np.zeros(len(zneg_rev)+40) - zneg[20:len(zneg_rev)+20]=zneg_rev - zneg= gaussian_filter1d(zneg, sigma_gaus) - - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - - peaks_neg=peaks_neg-20-20 - peaks=peaks-20 - - - - if contours_main!=None: - areas_main=np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main=[cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main=[(M_main[j]['m10']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - cy_main=[(M_main[j]['m01']/(M_main[j]['m00']+1e-32)) for j in range(len(M_main))] - x_min_main=np.array([np.min(contours_main[j][:,0,0]) for j in range(len(contours_main))]) - x_max_main=np.array([np.max(contours_main[j][:,0,0]) for j in range(len(contours_main))]) - - y_min_main=np.array([np.min(contours_main[j][:,0,1]) for j in range(len(contours_main))]) - 
y_max_main=np.array([np.max(contours_main[j][:,0,1]) for j in range(len(contours_main))]) - - - - - if contours_main!=None: - indexer_main=np.array(range(len(contours_main))) - - - if contours_main!=None: - len_main=len(contours_main) - else: - len_main=0 - - - matrix_of_orders=np.zeros((len_main,5)) - - matrix_of_orders[:,0]=np.array( range( len_main ) ) - - matrix_of_orders[:len_main,1]=1 - matrix_of_orders[len_main:,1]=2 - - matrix_of_orders[:len_main,2]=cx_main - matrix_of_orders[:len_main,3]=cy_main - - matrix_of_orders[:len_main,4]=np.array( range( len_main ) ) - - peaks_neg_new=[] - peaks_neg_new.append(0) - for iii in range(len(peaks_neg)): - peaks_neg_new.append(peaks_neg[iii]) - peaks_neg_new.append(textline_mask.shape[0]) - - final_indexers_sorted=[] - for i in range(len(peaks_neg_new)-1): - top=peaks_neg_new[i] - down=peaks_neg_new[i+1] - - indexes_in=matrix_of_orders[:,0][(matrix_of_orders[:,3]>=top) & ((matrix_of_orders[:,3]=top) & ((matrix_of_orders[:,3]0: - region_order=ET.SubElement(page, 'ReadingOrder') - region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - - region_order_sub.set('id',"ro357564684568544579089") - - args_sort=np.argsort(order_of_texts) - for vj in args_sort: - name="coord_text_"+str(vj) - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(order_of_texts[vj]) ) - name.set('regionRef',id_of_texts[vj]) - - - id_indexer=0 - id_indexer_l=0 - - for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - - textregion.set('type','paragraph') - #if mm==0: - # textregion.set('type','heading') - #else: - # textregion.set('type','paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_text_region[mm])): - if len(found_polygons_text_region[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][0] 
+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_text_region[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) - - - - for j in range(len(self.all_found_texline_polygons[mm])): - - textline=ET.SubElement(textregion, 'TextLine') - - textline.set('id','l'+str(id_indexer_l)) - - id_indexer_l+=1 - - - coord = ET.SubElement(textline, 'Coords') - #points = ET.SubElement(coord, 'Points') - - points_co='' - for l in range(len(self.all_found_texline_polygons[mm][j])): - #point = ET.SubElement(coord, 'Point') - - - - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) - if len(self.all_found_texline_polygons[mm][j][l])==2: - points_co=points_co+str( int( (self.all_found_texline_polygons[mm][j][l][0] +page_coord[2] - +self.all_box_coord[mm][2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (self.all_found_texline_polygons[mm][j][l][1] +page_coord[0] - +self.all_box_coord[mm][0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( self.all_found_texline_polygons[mm][j][l][0][0] +page_coord[2] - +self.all_box_coord[mm][2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( self.all_found_texline_polygons[mm][j][l][0][1] +page_coord[0] - +self.all_box_coord[mm][0])/self.scale_y) ) - - if l<(len(self.all_found_texline_polygons[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) - - - - tree = ET.ElementTree(data) - tree.write(os.path.join(self.dir_out, self.f_name) + ".xml") - - - def run(self): - - #get image and 
scales, then extract the page of scanned image - t1=time.time() - self.get_image_and_scales() - image_page,page_coord=self.extract_page() - - - ########## - K.clear_session() - gc.collect() - t2=time.time() - - - # extract text regions and corresponding contours and surrounding box - text_regions=self.extract_text_regions(image_page) - - text_regions = cv2.erode(text_regions, self.kernel, iterations=3) - text_regions = cv2.dilate(text_regions, self.kernel, iterations=4) - - #plt.imshow(text_regions[:,:,0]) - #plt.show() - - contours=self.get_text_region_contours_and_boxes(text_regions) - - - - ########## - K.clear_session() - gc.collect() - - t3=time.time() - - - if len(contours)>0: - - - - # extracting textlines using segmentation - textline_mask_tot=self.textline_contours(image_page) - ########## - K.clear_session() - gc.collect() - - t4=time.time() - - - # calculate the slope for deskewing for each box of text region. - contours=self.get_slopes_and_deskew(contours,textline_mask_tot) - - gc.collect() - t5=time.time() - - - # get orders of each textregion. This method by now only works for one column documents. 
- indexes_sorted, matrix_of_orders=self.order_of_regions(textline_mask_tot,contours) - order_of_texts, id_of_texts=self.order_and_id_of_texts(contours ,matrix_of_orders ,indexes_sorted ) - - - ########## - gc.collect() - t6=time.time() - - - self.get_all_image_patches_coordination(image_page) - - ########## - ########## - gc.collect() - - t7=time.time() - - else: - contours=[] - order_of_texts=None - id_of_texts=None - self.write_into_page_xml(contours,page_coord,self.dir_out , order_of_texts , id_of_texts) - - # Destroy the current Keras session/graph to free memory - K.clear_session() - - print( "time total = "+"{0:.2f}".format(time.time()-t1) ) - print( "time needed for page extraction = "+"{0:.2f}".format(t2-t1) ) - print( "time needed for text region extraction and get contours = "+"{0:.2f}".format(t3-t2) ) - if len(contours)>0: - print( "time needed for textlines = "+"{0:.2f}".format(t4-t3) ) - print( "time needed to get slopes of regions (deskewing) = "+"{0:.2f}".format(t5-t4) ) - print( "time needed to get order of regions = "+"{0:.2f}".format(t6-t5) ) - print( "time needed to implement deskewing = "+"{0:.2f}".format(t7-t6) ) - - - -@click.command() -@click.option('--image', '-i', help='image filename', type=click.Path(exists=True, dir_okay=False)) -@click.option('--out', '-o', help='directory to write output xml data', type=click.Path(exists=True, file_okay=False)) -@click.option('--model', '-m', help='directory of models', type=click.Path(exists=True, file_okay=False)) -def main(image, out, model): - possibles = globals() # XXX unused? 
- possibles.update(locals()) - x = textline_detector(image, out, None, model) - x.run() - - -if __name__ == "__main__": - main() - diff --git a/qurator/sbb_textline_detector/ocrd-tool.json b/qurator/sbb_textline_detector/ocrd-tool.json deleted file mode 100644 index 241f551..0000000 --- a/qurator/sbb_textline_detector/ocrd-tool.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "version": "0.0.1", - "tools": { - "ocrd-sbb-textline-detector": { - "executable": "ocrd-sbb-textline-detector", - "description": "Detect lines", - "steps": ["layout/segmentation/line"], - "input_file_grp": [ - "OCR-D-IMG" - ], - "output_file_grp": [ - "OCR-D-SBB-SEG-LINE" - ], - "parameters": { - "model": {"type": "string", "format": "file", "cacheable": true} - } - } - } -} diff --git a/qurator/sbb_textline_detector/ocrd_cli.py b/qurator/sbb_textline_detector/ocrd_cli.py deleted file mode 100644 index 272d671..0000000 --- a/qurator/sbb_textline_detector/ocrd_cli.py +++ /dev/null @@ -1,110 +0,0 @@ -import json -import os -import tempfile - -import click -import ocrd_models.ocrd_page -from ocrd import Processor -from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_modelfactory import page_from_file -from ocrd_models import OcrdFile -from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType -from ocrd_utils import concat_padded, getLogger, MIMETYPE_PAGE -from pkg_resources import resource_string - -from qurator.sbb_textline_detector import textline_detector - -log = getLogger('processor.OcrdSbbTextlineDetectorRecognize') - -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) - - -@click.command() -@ocrd_cli_options -def ocrd_sbb_textline_detector(*args, **kwargs): - return ocrd_cli_wrap_processor(OcrdSbbTextlineDetectorRecognize, *args, **kwargs) - - -TOOL = 'ocrd_sbb_textline_detector' - - -class OcrdSbbTextlineDetectorRecognize(Processor): - - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = 
OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(OcrdSbbTextlineDetectorRecognize, self).__init__(*args, **kwargs) - - def _make_file_id(self, input_file, input_file_grp, n): - file_id = input_file.ID.replace(input_file_grp, self.output_file_grp) - if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) - return file_id - - def _resolve_image_file(self, input_file: OcrdFile) -> str: - if input_file.mimetype == MIMETYPE_PAGE: - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - image_file = page.imageFilename - else: - image_file = input_file.local_filename - return image_file - - def process(self): - for n, page_id in enumerate(self.workspace.mets.physical_pages): - input_file = self.workspace.mets.find_files(fileGrp=self.input_file_grp, pageId=page_id)[0] - log.info("INPUT FILE %i / %s", n, input_file) - - file_id = self._make_file_id(input_file, self.input_file_grp, n) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass - - with tempfile.TemporaryDirectory() as tmp_dirname: - # Segment the image - image_file = self._resolve_image_file(input_file) - model = self.parameter['model'] - x = textline_detector(image_file, tmp_dirname, file_id, model) - x.run() - - # Read segmentation results - tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml' - tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename) - tmp_page = tmp_pcgts.get_Page() - - # Create a new PAGE file from the input file - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - - # Merge results → PAGE file - page.set_PrintSpace(tmp_page.get_PrintSpace()) - page.set_ReadingOrder(tmp_page.get_ReadingOrder()) - page.set_TextRegion(tmp_page.get_TextRegion()) - - # Save metadata about this operation - metadata = pcgts.get_Metadata() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - 
name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, value=self.parameter[name]) - for name in self.parameter.keys()])])) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype='application/vnd.prima.page+xml', - local_filename=os.path.join(self.output_file_grp, file_id) + '.xml', - content=ocrd_models.ocrd_page.to_xml(pcgts) - ) - - -if __name__ == '__main__': - ocrd_sbb_textline_detector() diff --git a/requirements.txt b/requirements.txt index 9240226..063bac4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,7 @@ -opencv-python-headless -matplotlib -seaborn -tqdm -keras -shapely -scikit-learn -tensorflow-gpu < 2.0 -scipy -ocrd >= 2.0.0 +click +jinja2 +lxml +uniseg +numpy +colorama +ocrd >= 1.0.0b15 diff --git a/setup.py b/setup.py index 92c88cf..f7a6113 100644 --- a/setup.py +++ b/setup.py @@ -5,34 +5,24 @@ with open('requirements.txt') as fp: install_requires = fp.read() setup( - name="qurator-sbb-textline", - version="0.0.1", - author="The Qurator Team", - author_email="qurator@sbb.spk-berlin.de", - description="Qurator", - long_description=open("README.md", "r", encoding='utf-8').read(), - long_description_content_type="text/markdown", - keywords='qurator', + name='dinglehopper', + author='Mike Gerber, The QURATOR SPK Team', + author_email='mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de', + description='The OCR evaluation tool', + long_description=open('README.md', 'r', encoding='utf-8').read(), + long_description_content_type='text/markdown', + keywords='qurator ocr', license='Apache', - url="https://qurator.ai", - packages=find_packages(exclude=["*.tests", "*.tests.*", - "tests.*", "tests"]), + namespace_packages=['qurator'], + packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']), install_requires=install_requires, package_data={ - '': ['*.json'], + '': 
['*.json', 'templates/*'], }, entry_points={ 'console_scripts': [ - "sbb_textline_detector=qurator.sbb_textline_detector:main", - "ocrd-sbb-textline-detector=qurator.sbb_textline_detector:ocrd_sbb_textline_detector", + 'dinglehopper=qurator.dinglehopper.cli:main', + 'ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper', ] - }, - python_requires='>=3.6.0', - tests_require=['pytest'], - classifiers=[ - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - ], + } )