mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 03:40:12 +02:00
➡ Move dinglehopper into its own directory
This commit is contained in:
commit
89048bf55d
54 changed files with 43618 additions and 0 deletions
2
qurator/__init__.py
Normal file
2
qurator/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
__import__('pkg_resources').declare_namespace(__name__)
|
||||
|
6
qurator/dinglehopper/.gitignore
vendored
Normal file
6
qurator/dinglehopper/.gitignore
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
# User-specific stuff
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/**/usage.statistics.xml
|
||||
.idea/**/dictionaries
|
||||
.idea/**/shelf
|
12
qurator/dinglehopper/.idea/dinglehopper.iml
generated
Normal file
12
qurator/dinglehopper/.idea/dinglehopper.iml
generated
Normal file
|
@ -0,0 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.7 (dinglehopper)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="projectConfiguration" value="pytest" />
|
||||
<option name="PROJECT_TEST_RUNNER" value="pytest" />
|
||||
</component>
|
||||
</module>
|
7
qurator/dinglehopper/.idea/misc.xml
generated
Normal file
7
qurator/dinglehopper/.idea/misc.xml
generated
Normal file
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
</project>
|
8
qurator/dinglehopper/.idea/modules.xml
generated
Normal file
8
qurator/dinglehopper/.idea/modules.xml
generated
Normal file
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/dinglehopper.iml" filepath="$PROJECT_DIR$/.idea/dinglehopper.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
5
qurator/dinglehopper/__init__.py
Normal file
5
qurator/dinglehopper/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
from .ocr_files import *
|
||||
from .substitute_equivalences import *
|
||||
from .character_error_rate import *
|
||||
from .word_error_rate import *
|
||||
from .align import *
|
34
qurator/dinglehopper/align.py
Normal file
34
qurator/dinglehopper/align.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
from .edit_distance import *
|
||||
|
||||
def align(s1, s2):
|
||||
s1 = list(s1)
|
||||
s2 = list(s2)
|
||||
ops = seq_editops(s1, s2)
|
||||
i = 0
|
||||
j = 0
|
||||
|
||||
while i < len(s1) or j < len(s2):
|
||||
o = None
|
||||
try:
|
||||
ot = ops[0]
|
||||
if ot[1] == i and ot[2] == j:
|
||||
ops = ops[1:]
|
||||
o = ot
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
if o:
|
||||
if o[0] == 'insert':
|
||||
yield (None, s2[j])
|
||||
j += 1
|
||||
elif o[0] == 'delete':
|
||||
yield (s1[i], None)
|
||||
i += 1
|
||||
elif o[0] == 'replace':
|
||||
yield (s1[i], s2[j])
|
||||
i += 1
|
||||
j += 1
|
||||
else:
|
||||
yield (s1[i], s2[j])
|
||||
i += 1
|
||||
j += 1
|
21
qurator/dinglehopper/character_error_rate.py
Normal file
21
qurator/dinglehopper/character_error_rate.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from qurator.dinglehopper.edit_distance import distance
|
||||
|
||||
|
||||
def character_error_rate(reference, compared):
|
||||
d = distance(reference, compared)
|
||||
if d == 0:
|
||||
return 0
|
||||
|
||||
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
|
||||
if n == 0:
|
||||
return float('inf')
|
||||
|
||||
return d/n
|
||||
|
||||
# XXX Should we really count newlines here?
|
83
qurator/dinglehopper/cli.py
Normal file
83
qurator/dinglehopper/cli.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
import os
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
|
||||
from qurator.dinglehopper import *
|
||||
|
||||
|
||||
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
|
||||
gtx = ''
|
||||
ocrx = ''
|
||||
|
||||
def format_thing(t, css_classes=None):
|
||||
if t is None:
|
||||
t = none
|
||||
css_classes += ' ellipsis'
|
||||
if t == '\n':
|
||||
t = '<br>'
|
||||
|
||||
if css_classes:
|
||||
return '<span class="{css_classes}">{t}</span>'.format(css_classes=css_classes, t=t)
|
||||
else:
|
||||
return '{t}'.format(t=t)
|
||||
|
||||
for k, (g, o) in enumerate(align(gt_things, ocr_things)):
|
||||
if g == o:
|
||||
css_classes = None
|
||||
else:
|
||||
css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)
|
||||
|
||||
gtx += joiner + format_thing(g, css_classes)
|
||||
ocrx += joiner + format_thing(o, css_classes)
|
||||
|
||||
return \
|
||||
'''
|
||||
<div class="row">
|
||||
<div class="col-md-6 gt">{}</div>
|
||||
<div class="col-md-6 ocr">{}</div>
|
||||
</div>
|
||||
'''.format(gtx, ocrx)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('gt', type=click.Path(exists=True))
|
||||
@click.argument('ocr', type=click.Path(exists=True))
|
||||
def process(gt, ocr):
|
||||
"""Check OCR result against GT"""
|
||||
|
||||
gt_text = text(gt)
|
||||
ocr_text = text(ocr)
|
||||
|
||||
gt_text = substitute_equivalences(gt_text)
|
||||
ocr_text = substitute_equivalences(ocr_text)
|
||||
|
||||
cer = character_error_rate(gt_text, ocr_text)
|
||||
wer = word_error_rate(gt_text, ocr_text)
|
||||
uwer = unordered_word_error_rate(gt_text, ocr_text)
|
||||
|
||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
|
||||
|
||||
gt_words = words(gt_text)
|
||||
ocr_words = words(ocr_text)
|
||||
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
|
||||
|
||||
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
|
||||
for out_fn in ('report.html', 'report.json'):
|
||||
template_fn = out_fn + '.j2'
|
||||
template = env.get_template(template_fn)
|
||||
template.stream(
|
||||
gt=gt, ocr=ocr,
|
||||
cer=cer, wer=wer, uwer=uwer,
|
||||
char_diff_report=char_diff_report,
|
||||
word_diff_report=word_diff_report
|
||||
).dump(out_fn)
|
||||
|
||||
|
||||
def main():
|
||||
process()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
95
qurator/dinglehopper/edit_distance.py
Normal file
95
qurator/dinglehopper/edit_distance.py
Normal file
|
@ -0,0 +1,95 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
|
||||
def levenshtein_matrix(seq1, seq2):
|
||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||
|
||||
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
||||
edit distance.
|
||||
|
||||
This algorithm is implemented here because we need an implementation that can work with sequences other than
|
||||
strings, e.g. lists of grapheme clusters or lists of word strings.
|
||||
"""
|
||||
m = len(seq1)
|
||||
n = len(seq2)
|
||||
|
||||
def from_to(start, stop):
|
||||
return range(start, stop + 1, 1)
|
||||
|
||||
D = np.zeros((m + 1, n + 1), np.int)
|
||||
D[0, 0] = 0
|
||||
for i in from_to(1, m):
|
||||
D[i, 0] = i
|
||||
for j in from_to(1, n):
|
||||
D[0, j] = j
|
||||
for i in from_to(1, m):
|
||||
for j in from_to(1, n):
|
||||
D[i, j] = min(
|
||||
D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
|
||||
D[i, j - 1] + 1, # Insertion
|
||||
D[i - 1, j] + 1 # Deletion
|
||||
)
|
||||
|
||||
return D
|
||||
|
||||
|
||||
def levenshtein(seq1, seq2):
|
||||
"""Compute the Levenshtein edit distance between two sequences"""
|
||||
m = len(seq1)
|
||||
n = len(seq2)
|
||||
|
||||
D = levenshtein_matrix(seq1, seq2)
|
||||
return D[m, n]
|
||||
|
||||
|
||||
def distance(s1, s2):
|
||||
"""Compute the Levenshtein edit distance between two Unicode strings
|
||||
|
||||
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
|
||||
clusters. This should be the correct way to compare two Unicode strings.
|
||||
"""
|
||||
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
||||
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
||||
return levenshtein(s1, s2)
|
||||
|
||||
|
||||
def seq_editops(seq1, seq2):
|
||||
seq1 = list(seq1)
|
||||
seq2 = list(seq2)
|
||||
m = len(seq1)
|
||||
n = len(seq2)
|
||||
D = levenshtein_matrix(seq1, seq2)
|
||||
|
||||
def _tail_backtrace(i, j, accumulator):
|
||||
if i > 0 and D[i - 1, j] + 1 == D[i, j]:
|
||||
return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
|
||||
if j > 0 and D[i, j - 1] + 1 == D[i, j]:
|
||||
return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
|
||||
if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
|
||||
return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
|
||||
if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
|
||||
return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP
|
||||
return accumulator
|
||||
|
||||
def backtrace(i, j):
|
||||
result = partial(_tail_backtrace, i, j, [])
|
||||
while isinstance(result, partial):
|
||||
result = result()
|
||||
|
||||
return result
|
||||
|
||||
b = backtrace(m, n)
|
||||
return b
|
||||
|
||||
|
||||
def editops(word1, word2):
|
||||
# XXX Note that this returns indices to the _grapheme clusters_, not characters!
|
||||
word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
|
||||
word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
|
||||
return seq_editops(word1, word2)
|
1037
qurator/dinglehopper/notebooks/Levenshtein.ipynb
Normal file
1037
qurator/dinglehopper/notebooks/Levenshtein.ipynb
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,558 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import unicodedata"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def list_characters(s):\n",
|
||||
" \"\"\"List characters of string s, as seen by Python\"\"\"\n",
|
||||
" for c in s:\n",
|
||||
" print(c, end=' ')\n",
|
||||
" if unicodedata.combining(c):\n",
|
||||
" print(end=' ')\n",
|
||||
" print(unicodedata.name(c))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Comparing two Unicode strings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"̃ COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"words = [unicodedata.normalize('NFC', 'Schlyñ'), unicodedata.normalize('NFD', 'Schlyñ')]\n",
|
||||
"\n",
|
||||
"for s in words:\n",
|
||||
" list_characters(s)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"These two strings are different:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"words[0] == words[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And yet they are the canonically equivalent:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unicodedata.normalize('NFC', words[0]) == unicodedata.normalize('NFC', words[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"→ Normalize to NFC (Normalization Form Composed) to compare. NFC is also composed, which is what we want. But it doesn't matter because we're not interested in the characters as Python sees them, but in grapheme clusters (see below.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Grapheme clusters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For evaluation we're interesting in what is perceived as \"characters\". But is \"ñ\" 1 character (LATIN SMALL LETTER N WITH TILDE) or 2 (LATIN SMALL LETTER N + COMBINING TILDE)?\n",
|
||||
"\n",
|
||||
"What we're probably want are [grapheme clusters](https://uniseg-python.readthedocs.io/en/latest/graphemecluster.html):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['S', 'c', 'h', 'l', 'y', 'ñ']\n",
|
||||
"['S', 'c', 'h', 'l', 'y', 'ñ']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from uniseg.graphemecluster import grapheme_clusters\n",
|
||||
"\n",
|
||||
"for w in words:\n",
|
||||
" print(list(grapheme_clusters(w)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Just looking at the interesting character – the last one - from both words:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"̃ COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for w in words:\n",
|
||||
" list_characters(list(grapheme_clusters(w))[-1])\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"→ Work with grapheme clusters, not \"characters as Python sees them\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def unicode_name(c):\n",
|
||||
" if 0xE000 <= ord(c) <= 0xF8FF:\n",
|
||||
" return 'private use character 0x{:04X}'.format(ord(c))\n",
|
||||
" else:\n",
|
||||
" return unicodedata.name(c)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def list_grapheme_clusters(s):\n",
|
||||
" \"\"\"List grapheme clusters of string s\"\"\"\n",
|
||||
" for g in grapheme_clusters(s):\n",
|
||||
" print(g, end=' ')\n",
|
||||
" if len(g) > 1:\n",
|
||||
" print('(multiple)', end=' ')\n",
|
||||
" try:\n",
|
||||
" print(', '.join(unicode_name(c) for c in g))\n",
|
||||
" except ValueError:\n",
|
||||
" print('ValueError')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ (multiple) LATIN SMALL LETTER N, COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for w in words:\n",
|
||||
" list_grapheme_clusters(w)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"私 CJK UNIFIED IDEOGRAPH-79C1\n",
|
||||
"は HIRAGANA LETTER HA\n",
|
||||
"彼 CJK UNIFIED IDEOGRAPH-5F7C\n",
|
||||
"女 CJK UNIFIED IDEOGRAPH-5973\n",
|
||||
"が HIRAGANA LETTER GA\n",
|
||||
"お HIRAGANA LETTER O\n",
|
||||
"茶 CJK UNIFIED IDEOGRAPH-8336\n",
|
||||
"を HIRAGANA LETTER WO\n",
|
||||
"好 CJK UNIFIED IDEOGRAPH-597D\n",
|
||||
"き HIRAGANA LETTER KI\n",
|
||||
"な HIRAGANA LETTER NA\n",
|
||||
"事 CJK UNIFIED IDEOGRAPH-4E8B\n",
|
||||
"が HIRAGANA LETTER GA\n",
|
||||
"分 CJK UNIFIED IDEOGRAPH-5206\n",
|
||||
"か HIRAGANA LETTER KA\n",
|
||||
"っ HIRAGANA LETTER SMALL TU\n",
|
||||
"た HIRAGANA LETTER TA\n",
|
||||
"。 IDEOGRAPHIC FULL STOP\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('私は彼女がお茶を好きな事が分かった。')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
". FULL STOP\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
" SPACE\n",
|
||||
"چ ARABIC LETTER TCHEH\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
" SPACE\n",
|
||||
"ح ARABIC LETTER HAH\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"و ARABIC LETTER WAW\n",
|
||||
" SPACE\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
" SPACE\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ک ARABIC LETTER KEHEH\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"و ARABIC LETTER WAW\n",
|
||||
" SPACE\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
" SPACE\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ت ARABIC LETTER TEH\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('. اما چند تا حرف تو فارسی هست که تو عربی نیست')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
". FULL STOP\n",
|
||||
" SPACE\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
" SPACE\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
" SPACE\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ي ARABIC LETTER YEH\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
" SPACE\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
" SPACE\n",
|
||||
"أ ARABIC LETTER ALEF WITH HAMZA ABOVE\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"غ ARABIC LETTER GHAIN\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ي ARABIC LETTER YEH\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
"؟ ARABIC QUESTION MARK\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('. لكن كم عدد الكلمات بالفارسية هل أنت باللغة العربية؟')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"H LATIN CAPITAL LETTER H\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"😀 GRINNING FACE\n",
|
||||
" SPACE\n",
|
||||
"W LATIN CAPITAL LETTER W\n",
|
||||
"😀 GRINNING FACE\n",
|
||||
"r LATIN SMALL LETTER R\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"d LATIN SMALL LETTER D\n",
|
||||
"! EXCLAMATION MARK\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Hell😀 W😀rld!')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅ (multiple) LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING DOUBLE ACUTE ACCENT, COMBINING TILDE, COMBINING GRAVE ACCENT, COMBINING LEFT ANGLE ABOVE, COMBINING MACRON, COMBINING COMMA ABOVE, COMBINING DOUBLE OVERLINE, COMBINING NOT TILDE ABOVE, COMBINING DOUBLE MACRON BELOW, COMBINING GRAVE TONE MARK, COMBINING DOUBLE BREVE BELOW, COMBINING LONG STROKE OVERLAY, COMBINING DOUBLE MACRON BELOW, COMBINING LEFT HALF RING BELOW, COMBINING X BELOW, COMBINING CARON BELOW, COMBINING DOWN TACK BELOW, COMBINING DOUBLE RING BELOW, COMBINING ASTERISK BELOW, COMBINING BRIDGE BELOW, COMBINING TILDE BELOW, COMBINING X BELOW, COMBINING INVERTED BREVE BELOW, COMBINING LOW LINE, COMBINING UP TACK BELOW, COMBINING CARON BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE LOW LINE, COMBINING SEAGULL BELOW, COMBINING EQUALS SIGN BELOW, COMBINING GREEK YPOGEGRAMMENI\n",
|
||||
"ņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚ (multiple) LATIN SMALL LETTER N, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING OVERLINE, COMBINING GREEK DIALYTIKA TONOS, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER V, COMBINING LATIN SMALL LETTER A, COMBINING REVERSED COMMA ABOVE, COMBINING GRAVE ACCENT, COMBINING CARON, COMBINING GREEK PERISPOMENI, COMBINING MACRON, COMBINING BRIDGE ABOVE, COMBINING LEFT HALF RING ABOVE, COMBINING SHORT SOLIDUS OVERLAY, COMBINING CEDILLA, COMBINING LEFT ARROWHEAD BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING ACUTE ACCENT BELOW, COMBINING LEFT TACK BELOW, COMBINING MINUS SIGN BELOW, COMBINING COMMA BELOW, COMBINING COMMA BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING PLUS SIGN BELOW, COMBINING LEFT ANGLE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD BELOW, COMBINING CARON BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING RIGHT TACK BELOW, COMBINING LOW LINE, COMBINING LOW LINE\n",
|
||||
"i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟ (multiple) LATIN SMALL LETTER I, COMBINING VERTICAL LINE ABOVE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER U, COMBINING GRAVE ACCENT, COMBINING LATIN SMALL LETTER I, COMBINING LATIN SMALL LETTER T, COMBINING BREVE, COMBINING LATIN SMALL LETTER A, COMBINING HOOK ABOVE, COMBINING RIGHT ARROWHEAD ABOVE, COMBINING BRIDGE ABOVE, COMBINING RING ABOVE, COMBINING HOMOTHETIC ABOVE, COMBINING ZIGZAG ABOVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING FERMATA, COMBINING TILDE OVERLAY, COMBINING RETROFLEX HOOK BELOW, COMBINING DOUBLE MACRON BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING DOUBLE LOW LINE, COMBINING DOT BELOW, COMBINING RIGHT TACK BELOW, COMBINING RIGHT ARROWHEAD BELOW\n",
|
||||
"c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘ (multiple) LATIN SMALL LETTER C, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER V, COMBINING VERTICAL LINE ABOVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER H, COMBINING COMMA ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING CANDRABINDU, COMBINING GREEK DIALYTIKA TONOS, COMBINING OVERLINE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER E, COMBINING DOT ABOVE RIGHT, COMBINING TILDE BELOW, COMBINING PLUS SIGN BELOW, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LOW LINE, COMBINING EQUALS SIGN BELOW, COMBINING INVERTED BRIDGE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING SEAGULL BELOW, COMBINING COMMA BELOW\n",
|
||||
"o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ (multiple) LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER A, COMBINING INVERTED BREVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER M, COMBINING DIAERESIS, COMBINING MACRON, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER I, COMBINING GREEK KORONIS, COMBINING DOUBLE MACRON BELOW, COMBINING TILDE OVERLAY, COMBINING GRAPHEME JOINER, COMBINING DOT ABOVE RIGHT, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING MINUS SIGN BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING DIAERESIS BELOW, COMBINING RING BELOW\n",
|
||||
"ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕ (multiple) LATIN SMALL LETTER D, COMBINING LATIN SMALL LETTER U, COMBINING DOUBLE OVERLINE, COMBINING LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING DIAERESIS, COMBINING LEFT HALF RING ABOVE, COMBINING DOT ABOVE RIGHT, COMBINING COMMA ABOVE RIGHT, COMBINING HORN, COMBINING DOT BELOW, COMBINING RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING X BELOW, COMBINING BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING INVERTED BREVE BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING MACRON BELOW, COMBINING LEFT TACK BELOW, COMBINING ASTERISK BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW\n",
|
||||
"e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞ (multiple) LATIN SMALL LETTER E, COMBINING HOMOTHETIC ABOVE, COMBINING TURNED COMMA ABOVE, COMBINING BREVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING DOUBLE GRAVE ACCENT, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER R, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING INVERTED BREVE, COMBINING DOT ABOVE, COMBINING VERTICAL TILDE, COMBINING BREVE, COMBINING GREEK KORONIS, COMBINING LATIN SMALL LETTER R, COMBINING REVERSED COMMA ABOVE, COMBINING CANDRABINDU, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER T, COMBINING ACUTE TONE MARK, COMBINING HORN, COMBINING DOUBLE MACRON, COMBINING INVERTED BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING TILDE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE RING BELOW, COMBINING DOUBLE VERTICAL LINE BELOW\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Z LATIN CAPITAL LETTER Z\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"u LATIN SMALL LETTER U\n",
|
||||
"g LATIN SMALL LETTER G\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"uͤ (multiple) LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E\n",
|
||||
"ß LATIN SMALL LETTER SHARP S\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Zeugnuͤß')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Z LATIN CAPITAL LETTER Z\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"u LATIN SMALL LETTER U\n",
|
||||
"g LATIN SMALL LETTER G\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
" private use character 0xE72B\n",
|
||||
"ß LATIN SMALL LETTER SHARP S\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Zeugnß')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"hide_input": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
105
qurator/dinglehopper/ocr_files.py
Normal file
105
qurator/dinglehopper/ocr_files.py
Normal file
|
@ -0,0 +1,105 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
from lxml import etree as ET
|
||||
import sys
|
||||
|
||||
from lxml.etree import XMLSyntaxError
|
||||
|
||||
|
||||
def alto_namespace(tree):
|
||||
"""Return the ALTO namespace used in the given ElementTree.
|
||||
|
||||
This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
|
||||
check if the files uses any valid ALTO namespace.
|
||||
"""
|
||||
root_name = ET.QName(tree.getroot().tag)
|
||||
if root_name.localname == 'alto':
|
||||
return root_name.namespace
|
||||
else:
|
||||
raise ValueError('Not an ALTO tree')
|
||||
|
||||
|
||||
def alto_text(tree):
|
||||
"""Extract text from the given ALTO ElementTree."""
|
||||
|
||||
nsmap = {'alto': alto_namespace(tree)}
|
||||
|
||||
lines = (
|
||||
' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
|
||||
for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
|
||||
text_ = '\n'.join(lines)
|
||||
|
||||
return text_
|
||||
|
||||
|
||||
def page_namespace(tree):
|
||||
"""Return the PAGE content namespace used in the given ElementTree.
|
||||
|
||||
This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
|
||||
do not check if the files uses any valid PAGE namespace.
|
||||
"""
|
||||
root_name = ET.QName(tree.getroot().tag)
|
||||
if root_name.localname == 'PcGts':
|
||||
return root_name.namespace
|
||||
else:
|
||||
raise ValueError('Not a PAGE tree')
|
||||
|
||||
|
||||
def page_text(tree):
|
||||
"""Extract text from the given PAGE content ElementTree."""
|
||||
|
||||
nsmap = {'page': page_namespace(tree)}
|
||||
|
||||
def region_text(region):
|
||||
try:
|
||||
return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
||||
except AttributeError:
|
||||
return None
|
||||
|
||||
region_texts = []
|
||||
reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
|
||||
if reading_order is not None:
|
||||
for group in reading_order.iterfind('./*', namespaces=nsmap):
|
||||
if ET.QName(group.tag).localname == 'OrderedGroup':
|
||||
region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
|
||||
for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: r.attrib['index']):
|
||||
region_id = region_ref_indexed.attrib['regionRef']
|
||||
region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
|
||||
if region is not None:
|
||||
region_texts.append(region_text(region))
|
||||
else:
|
||||
raise ValueError('Invalid region id "%s" in file' % region_id)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
|
||||
region_texts.append(region_text(region))
|
||||
|
||||
# XXX Does a file have to have regions etc.? region vs lines etc.
|
||||
# Filter empty region texts
|
||||
region_texts = (t for t in region_texts if t)
|
||||
|
||||
text_ = '\n'.join(region_texts)
|
||||
|
||||
return text_
|
||||
|
||||
|
||||
def text(filename):
|
||||
"""Read the text from the given file.
|
||||
|
||||
Supports PAGE, ALTO and falls back to plain text.
|
||||
"""
|
||||
|
||||
try:
|
||||
tree = ET.parse(filename)
|
||||
except XMLSyntaxError:
|
||||
with open(filename, 'r') as f:
|
||||
return f.read()
|
||||
try:
|
||||
return page_text(tree)
|
||||
except ValueError:
|
||||
return alto_text(tree)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(text(sys.argv[1]))
|
4
qurator/dinglehopper/pytest.ini
Normal file
4
qurator/dinglehopper/pytest.ini
Normal file
|
@ -0,0 +1,4 @@
|
|||
[pytest]
|
||||
markers =
|
||||
integration: integration tests
|
||||
serial
|
32
qurator/dinglehopper/substitute_equivalences.py
Normal file
32
qurator/dinglehopper/substitute_equivalences.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
def substitute_equivalences(s):
|
||||
|
||||
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
|
||||
# It might make sense to use different rules for GT and for the different OCR
|
||||
equivalences = {
|
||||
'': 'ü',
|
||||
'': 'ſſ',
|
||||
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
|
||||
'': 'ä',
|
||||
'': 'ch',
|
||||
'==': '–', # → en-dash
|
||||
'—': '–', # em-dash → en-dash
|
||||
'': 'ck',
|
||||
'': 'll',
|
||||
'': 'ö',
|
||||
'': 'ſi',
|
||||
'': 'ſt',
|
||||
'fi': 'fi',
|
||||
'ff': 'ff',
|
||||
'fl': 'fl',
|
||||
'ffi': 'ffi',
|
||||
'': 'ct',
|
||||
'’': '\'',
|
||||
'⸗': '-',
|
||||
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
|
||||
'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
|
||||
'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
|
||||
'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
|
||||
}
|
||||
for fr, to in equivalences.items():
|
||||
s = s.replace(fr, to)
|
||||
return s
|
61
qurator/dinglehopper/templates/report.html.j2
Normal file
61
qurator/dinglehopper/templates/report.html.j2
Normal file
|
@ -0,0 +1,61 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
}
|
||||
.diff-highlight {
|
||||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
<div class="container">
|
||||
|
||||
{{ gt }}<br>
|
||||
{{ ocr }}
|
||||
|
||||
|
||||
<h2>Metrics</h2>
|
||||
<p>CER: {{ cer|round(4) }}</p>
|
||||
<p>WER: {{ wer|round(4) }}</p>
|
||||
<!-- FIXME <p>WER (unordered): {{ uwer|round(4) }}</p> -->
|
||||
|
||||
<h2>Character differences</h2>
|
||||
{{ char_diff_report }}
|
||||
|
||||
<h2>Word differences</h2>
|
||||
{{ word_diff_report }}
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
{% include 'report.html.js' %}
|
||||
</script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
14
qurator/dinglehopper/templates/report.html.js
Normal file
14
qurator/dinglehopper/templates/report.html.js
Normal file
|
@ -0,0 +1,14 @@
|
|||
function find_diff_class(classes) {
|
||||
return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
|
||||
}
|
||||
|
||||
$(document).ready(function() {
|
||||
$('.diff').mouseover(function() {
|
||||
let c = find_diff_class($(this).attr('class'))
|
||||
$('.' + c).addClass('diff-highlight')
|
||||
});
|
||||
$('.diff').mouseout(function() {
|
||||
let c = find_diff_class($(this).attr('class'))
|
||||
$('.' + c).removeClass('diff-highlight')
|
||||
});
|
||||
});
|
6
qurator/dinglehopper/templates/report.json.j2
Normal file
6
qurator/dinglehopper/templates/report.json.j2
Normal file
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"gt": "{{ gt }}",
|
||||
"ocr": "{{ ocr }}",
|
||||
"cer": {{ cer|round(6) }},
|
||||
"wer": {{ wer|round(6) }}
|
||||
}
|
0
qurator/dinglehopper/tests/__init__.py
Normal file
0
qurator/dinglehopper/tests/__init__.py
Normal file
BIN
qurator/dinglehopper/tests/data/00000119.tif
Normal file
BIN
qurator/dinglehopper/tests/data/00000119.tif
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,289 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
|
||||
<Metadata>
|
||||
<Creator>OCR-D/core 1.0.0b11</Creator>
|
||||
<Created>2019-08-01T15:03:17.741679</Created>
|
||||
<LastChange>2019-08-01T15:03:17.741679</LastChange>
|
||||
<MetadataItem type="processingStep" name="recognition/text-recognition" value="ocrd-tesserocr-recognize">
|
||||
<Labels>
|
||||
<Label value="frk" type="model"/>
|
||||
<Label value="line" type="textequiv_level"/>
|
||||
<Label value="False" type="overwrite_words"/>
|
||||
</Labels>
|
||||
</MetadataItem>
|
||||
</Metadata>
|
||||
<Page imageFilename="../OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002" imageWidth="1386" imageHeight="2372">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="reading-order">
|
||||
<RegionRefIndexed index="0" regionRef="region0000"/>
|
||||
<RegionRefIndexed index="1" regionRef="region0001"/>
|
||||
<RegionRefIndexed index="2" regionRef="region0002"/>
|
||||
<RegionRefIndexed index="3" regionRef="region0003"/>
|
||||
<RegionRefIndexed index="4" regionRef="region0004"/>
|
||||
<RegionRefIndexed index="5" regionRef="region0005"/>
|
||||
<RegionRefIndexed index="6" regionRef="region0006"/>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="region0000">
|
||||
<Coords points="488,133 1197,133 1197,193 488,193"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0001">
|
||||
<Coords points="40,221 1198,221 1198,626 40,626"/>
|
||||
<TextLine id="region0001_line0000">
|
||||
<Coords points="40,221 1198,221 1198,281 40,281"/>
|
||||
<TextEquiv conf="0.86">
|
||||
<Unicode>Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0001">
|
||||
<Coords points="768,290 879,290 879,325 768,325"/>
|
||||
<TextEquiv conf="0.62">
|
||||
<Unicode>„Bellen</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0002">
|
||||
<Coords points="86,337 1174,337 1174,396 86,396"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0003">
|
||||
<Coords points="88,397 841,397 841,455 88,455"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Sieht man ein ſolch gemähtes Feld - von oben,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0004">
|
||||
<Coords points="87,455 1142,455 1142,510 87,510"/>
|
||||
<TextEquiv conf="0.92">
|
||||
<Unicode>Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0005">
|
||||
<Coords points="87,510 1153,510 1153,570 87,570"/>
|
||||
<TextEquiv conf="0.85">
|
||||
<Unicode>Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0006">
|
||||
<Coords points="88,569 1161,569 1161,626 88,626"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald
|
||||
„Bellen
|
||||
Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen,
|
||||
Sieht man ein ſolch gemähtes Feld - von oben,
|
||||
Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny
|
||||
Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren:
|
||||
Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0002">
|
||||
<Coords points="517,670 745,670 745,716 517,716"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0003">
|
||||
<Coords points="243,739 1124,739 1124,1094 243,1094"/>
|
||||
<TextLine id="region0003_line0000">
|
||||
<Coords points="243,739 884,739 884,795 243,795"/>
|
||||
<TextEquiv conf="0.83">
|
||||
<Unicode>Was erhebt des Schöpfers Güte</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0001">
|
||||
<Coords points="244,792 972,792 972,859 244,859"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Mehr , als dieſes Seegens Meer?</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0002">
|
||||
<Coords points="243,855 931,855 931,913 243,913"/>
|
||||
<TextEquiv conf="0.83">
|
||||
<Unicode>Kommt dies wohl von ungefehv?</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0003">
|
||||
<Coords points="244,914 918,914 918,971 244,971"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Nein , rüſt mein erfreut Gemühte</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0004">
|
||||
<Coords points="245,972 1059,972 1059,1034 245,1034"/>
|
||||
<TextEquiv conf="0.86">
|
||||
<Unicode>Nur von GOTT komint alles hers</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0005">
|
||||
<Coords points="247,1029 1124,1029 1124,1094 247,1094"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>Ihm ſey Preiß und Dan und Ehr!</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Was erhebt des Schöpfers Güte
|
||||
Mehr , als dieſes Seegens Meer?
|
||||
Kommt dies wohl von ungefehv?
|
||||
Nein , rüſt mein erfreut Gemühte
|
||||
Nur von GOTT komint alles hers
|
||||
Ihm ſey Preiß und Dan und Ehr!</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0004">
|
||||
<Coords points="1043,1096 1204,1096 1204,1136 1043,1136"/>
|
||||
<TextLine id="region0004_line0000">
|
||||
<Coords points="1043,1096 1204,1096 1204,1136 1043,1136"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Da Capo,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Da Capo,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0005">
|
||||
<Coords points="68,1183 1236,1183 1236,2056 68,2056"/>
|
||||
<TextLine id="region0005_line0000">
|
||||
<Coords points="91,1183 1170,1183 1170,1235 91,1235"/>
|
||||
<TextEquiv conf="0.65">
|
||||
<Unicode>Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0001">
|
||||
<Coords points="89,1236 1182,1236 1182,1289 89,1289"/>
|
||||
<TextEquiv conf="0.73">
|
||||
<Unicode>Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0002">
|
||||
<Coords points="89,1294 1208,1294 1208,1346 89,1346"/>
|
||||
<TextEquiv conf="0.85">
|
||||
<Unicode>Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0003">
|
||||
<Coords points="90,1351 519,1351 519,1399 90,1399"/>
|
||||
<TextEquiv conf="0.92">
|
||||
<Unicode>Und eine neue Farben Zier</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0004">
|
||||
<Coords points="91,1405 561,1405 561,1457 91,1457"/>
|
||||
<TextEquiv conf="0.91">
|
||||
<Unicode>Den erſt gemähten Aker ein,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0005">
|
||||
<Coords points="92,1459 1208,1459 1208,1510 92,1510"/>
|
||||
<TextEquiv conf="0.88">
|
||||
<Unicode>Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son-</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0006">
|
||||
<Coords points="782,1514 1007,1514 1007,1555 782,1555"/>
|
||||
<TextEquiv conf="0.46">
|
||||
<Unicode>nen B;Of</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0007">
|
||||
<Coords points="68,1562 1177,1562 1177,1617 68,1617"/>
|
||||
<TextEquiv conf="0.82">
|
||||
<Unicode>Un ihre runde glatte Röhren , zumahlen früh und Abends bricht;</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0008">
|
||||
<Coords points="90,1618 1236,1618 1236,1670 90,1670"/>
|
||||
<TextEquiv conf="0.79">
|
||||
<Unicode>So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0009">
|
||||
<Coords points="777,1671 1159,1671 1159,1716 777,1716"/>
|
||||
<TextEquiv conf="0.76">
|
||||
<Unicode>Gemiſche, |</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0010">
|
||||
<Coords points="92,1722 1211,1722 1211,1783 92,1783"/>
|
||||
<TextEquiv conf="0.7">
|
||||
<Unicode>Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0011">
|
||||
<Coords points="91,1779 1210,1779 1210,1837 91,1837"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0012">
|
||||
<Coords points="93,1837 1210,1837 1210,1895 93,1895"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0013">
|
||||
<Coords points="800,1896 914,1896 914,1936 800,1936"/>
|
||||
<TextEquiv conf="0.52">
|
||||
<Unicode>Wieder;</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0014">
|
||||
<Coords points="92,1943 1212,1943 1212,2001 92,2001"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0015">
|
||||
<Coords points="90,1998 1125,1998 1125,2056 90,2056"/>
|
||||
<TextEquiv conf="0.76">
|
||||
<Unicode>Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny
|
||||
Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für,
|
||||
Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz
|
||||
Und eine neue Farben Zier
|
||||
Den erſt gemähten Aker ein,
|
||||
Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son-
|
||||
nen B;Of
|
||||
Un ihre runde glatte Röhren , zumahlen früh und Abends bricht;
|
||||
So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches
|
||||
Gemiſche, |
|
||||
Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche
|
||||
Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif,
|
||||
Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und
|
||||
Wieder;
|
||||
Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz
|
||||
Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0006">
|
||||
<Coords points="688,2060 1216,2060 1216,2120 688,2120"/>
|
||||
<TextLine id="region0006_line0000">
|
||||
<Coords points="688,2069 787,2069 787,2120 688,2120"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>5) 2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0006_line0001">
|
||||
<Coords points="1044,2060 1216,2060 1216,2105 1044,2105"/>
|
||||
<TextEquiv conf="0.89">
|
||||
<Unicode>ARIA.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5) 2
|
||||
ARIA.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
|
@ -0,0 +1,47 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator></Creator>
|
||||
<Created>2019-07-26T13:59:00</Created>
|
||||
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||
<TextRegion id="tempReg357564684568544579089">
|
||||
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||
<TextLine id="l0">
|
||||
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l1">
|
||||
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l2">
|
||||
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||
<TextRegion id="r7" type="paragraph">
|
||||
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||
<TextEquiv>
|
||||
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
|
@ -0,0 +1,139 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="437">
|
||||
<TextLine ID="line_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="103">
|
||||
<String ID="string_0" HPOS="209" VPOS="319" WIDTH="134" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="319" HPOS="343"/>
|
||||
<String ID="string_1" HPOS="356" VPOS="316" WIDTH="121" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="316" HPOS="477"/>
|
||||
<String ID="string_2" HPOS="491" VPOS="312" WIDTH="102" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="312" HPOS="593"/>
|
||||
<String ID="string_3" HPOS="608" VPOS="309" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="309" HPOS="654"/>
|
||||
<String ID="string_4" HPOS="668" VPOS="311" WIDTH="106" HEIGHT="37" WC="0.96" CONTENT="amet,"/><SP WIDTH="16" VPOS="311" HPOS="774"/>
|
||||
<String ID="string_5" HPOS="790" VPOS="307" WIDTH="201" HEIGHT="32" WC="0.88" CONTENT="consetetur"/><SP WIDTH="14" VPOS="307" HPOS="991"/>
|
||||
<String ID="string_6" HPOS="1005" VPOS="297" WIDTH="205" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="297" HPOS="1210"/>
|
||||
<String ID="string_7" HPOS="1225" VPOS="293" WIDTH="84" HEIGHT="42" WC="0.91" CONTENT="elitr,"/><SP WIDTH="16" VPOS="293" HPOS="1309"/>
|
||||
<String ID="string_8" HPOS="1325" VPOS="289" WIDTH="65" HEIGHT="38" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="289" HPOS="1390"/>
|
||||
<String ID="string_9" HPOS="1404" VPOS="286" WIDTH="97" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="286" HPOS="1501"/>
|
||||
<String ID="string_10" HPOS="1515" VPOS="291" WIDTH="100" HEIGHT="24" WC="0.69" CONTENT="nonu"/><SP WIDTH="32" VPOS="291" HPOS="1615"/>
|
||||
<String ID="string_11" HPOS="1647" VPOS="285" WIDTH="30" HEIGHT="36" WC="0.37" CONTENT="yy"/><SP WIDTH="17" VPOS="285" HPOS="1677"/>
|
||||
<String ID="string_12" HPOS="1694" VPOS="268" WIDTH="140" HEIGHT="42" WC="0.93" CONTENT="eirmod"/><SP WIDTH="11" VPOS="268" HPOS="1834"/>
|
||||
<String ID="string_13" HPOS="1845" VPOS="273" WIDTH="139" HEIGHT="37" WC="0.96" CONTENT="tempor"/><SP WIDTH="15" VPOS="273" HPOS="1984"/>
|
||||
<String ID="string_14" HPOS="1999" VPOS="258" WIDTH="164" HEIGHT="38" WC="0.95" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="211" VPOS="315" WIDTH="1904" HEIGHT="102">
|
||||
<String ID="string_15" HPOS="211" VPOS="380" WIDTH="39" HEIGHT="31" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="380" HPOS="250"/>
|
||||
<String ID="string_16" HPOS="263" VPOS="373" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="373" HPOS="386"/>
|
||||
<String ID="string_17" HPOS="402" VPOS="379" WIDTH="33" HEIGHT="27" WC="0.95" CONTENT="et"/><SP WIDTH="14" VPOS="379" HPOS="435"/>
|
||||
<String ID="string_18" HPOS="449" VPOS="370" WIDTH="123" HEIGHT="36" WC="0.95" CONTENT="dolore"/><SP WIDTH="15" VPOS="370" HPOS="572"/>
|
||||
<String ID="string_19" HPOS="587" VPOS="374" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="374" HPOS="720"/>
|
||||
<String ID="string_20" HPOS="734" VPOS="363" WIDTH="183" HEIGHT="43" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="363" HPOS="917"/>
|
||||
<String ID="string_21" HPOS="931" VPOS="360" WIDTH="82" HEIGHT="36" WC="0.95" CONTENT="erat,"/><SP WIDTH="17" VPOS="360" HPOS="1013"/>
|
||||
<String ID="string_22" HPOS="1030" VPOS="354" WIDTH="65" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="13" VPOS="354" HPOS="1095"/>
|
||||
<String ID="string_23" HPOS="1108" VPOS="352" WIDTH="96" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="352" HPOS="1204"/>
|
||||
<String ID="string_24" HPOS="1217" VPOS="350" WIDTH="181" HEIGHT="44" WC="0.95" CONTENT="voluptua."/><SP WIDTH="13" VPOS="350" HPOS="1398"/>
|
||||
<String ID="string_25" HPOS="1411" VPOS="345" WIDTH="49" HEIGHT="34" WC="0.95" CONTENT="At"/><SP WIDTH="11" VPOS="345" HPOS="1460"/>
|
||||
<String ID="string_26" HPOS="1471" VPOS="348" WIDTH="88" HEIGHT="26" WC="0.93" CONTENT="Vero"/><SP WIDTH="16" VPOS="348" HPOS="1559"/>
|
||||
<String ID="string_27" HPOS="1575" VPOS="345" WIDTH="65" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="345" HPOS="1640"/>
|
||||
<String ID="string_28" HPOS="1655" VPOS="339" WIDTH="36" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="339" HPOS="1691"/>
|
||||
<String ID="string_29" HPOS="1705" VPOS="336" WIDTH="168" HEIGHT="31" WC="0.87" CONTENT="accusam"/><SP WIDTH="15" VPOS="336" HPOS="1873"/>
|
||||
<String ID="string_30" HPOS="1888" VPOS="329" WIDTH="34" HEIGHT="28" WC="0.96" CONTENT="et"/><SP WIDTH="11" VPOS="329" HPOS="1922"/>
|
||||
<String ID="string_31" HPOS="1933" VPOS="322" WIDTH="96" HEIGHT="44" WC="0.96" CONTENT="justo"/><SP WIDTH="15" VPOS="322" HPOS="2029"/>
|
||||
<String ID="string_32" HPOS="2044" VPOS="315" WIDTH="71" HEIGHT="63" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="214" VPOS="375" WIDTH="1919" HEIGHT="93">
|
||||
<String ID="string_33" HPOS="214" VPOS="431" WIDTH="144" HEIGHT="37" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="431" HPOS="358"/>
|
||||
<String ID="string_34" HPOS="374" VPOS="433" WIDTH="34" HEIGHT="31" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="433" HPOS="408"/>
|
||||
<String ID="string_35" HPOS="422" VPOS="437" WIDTH="42" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="437" HPOS="464"/>
|
||||
<String ID="string_36" HPOS="477" VPOS="426" WIDTH="136" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="426" HPOS="613"/>
|
||||
<String ID="string_37" HPOS="631" VPOS="424" WIDTH="75" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="424" HPOS="706"/>
|
||||
<String ID="string_38" HPOS="720" VPOS="419" WIDTH="85" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="419" HPOS="805"/>
|
||||
<String ID="string_39" HPOS="818" VPOS="415" WIDTH="90" HEIGHT="35" WC="0.97" CONTENT="kasd"/><SP WIDTH="14" VPOS="415" HPOS="908"/>
|
||||
<String ID="string_40" HPOS="922" VPOS="412" WIDTH="206" HEIGHT="48" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="412" HPOS="1128"/>
|
||||
<String ID="string_41" HPOS="1144" VPOS="417" WIDTH="47" HEIGHT="26" WC="0.97" CONTENT="no"/><SP WIDTH="16" VPOS="417" HPOS="1191"/>
|
||||
<String ID="string_42" HPOS="1207" VPOS="415" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="415" HPOS="1268"/>
|
||||
<String ID="string_43" HPOS="1281" VPOS="405" WIDTH="169" HEIGHT="36" WC="0.91" CONTENT="iakimata"/><SP WIDTH="14" VPOS="405" HPOS="1450"/>
|
||||
<String ID="string_44" HPOS="1464" VPOS="400" WIDTH="144" HEIGHT="33" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="400" HPOS="1608"/>
|
||||
<String ID="string_45" HPOS="1624" VPOS="397" WIDTH="54" HEIGHT="29" WC="0.97" CONTENT="est"/><SP WIDTH="13" VPOS="397" HPOS="1678"/>
|
||||
<String ID="string_46" HPOS="1691" VPOS="390" WIDTH="132" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="390" HPOS="1823"/>
|
||||
<String ID="string_47" HPOS="1837" VPOS="383" WIDTH="120" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="383" HPOS="1957"/>
|
||||
<String ID="string_48" HPOS="1971" VPOS="375" WIDTH="102" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="375" HPOS="2073"/>
|
||||
<String ID="string_49" HPOS="2088" VPOS="377" WIDTH="45" HEIGHT="31" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="215" VPOS="435" WIDTH="1896" HEIGHT="93">
|
||||
<String ID="string_50" HPOS="215" VPOS="494" WIDTH="106" HEIGHT="32" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="494" HPOS="321"/>
|
||||
<String ID="string_51" HPOS="337" VPOS="488" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="488" HPOS="467"/>
|
||||
<String ID="string_52" HPOS="481" VPOS="484" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="484" HPOS="602"/>
|
||||
<String ID="string_53" HPOS="616" VPOS="479" WIDTH="104" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="479" HPOS="720"/>
|
||||
<String ID="string_54" HPOS="734" VPOS="476" WIDTH="46" HEIGHT="36" WC="0.93" CONTENT="sit"/><SP WIDTH="14" VPOS="476" HPOS="780"/>
|
||||
<String ID="string_55" HPOS="794" VPOS="477" WIDTH="104" HEIGHT="36" WC="0.75" CONTENT="armet,"/><SP WIDTH="17" VPOS="477" HPOS="898"/>
|
||||
<String ID="string_56" HPOS="915" VPOS="474" WIDTH="200" HEIGHT="30" WC="0.97" CONTENT="consetetur"/><SP WIDTH="14" VPOS="474" HPOS="1115"/>
|
||||
<String ID="string_57" HPOS="1129" VPOS="463" WIDTH="205" HEIGHT="45" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="463" HPOS="1334"/>
|
||||
<String ID="string_58" HPOS="1349" VPOS="457" WIDTH="86" HEIGHT="41" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="457" HPOS="1435"/>
|
||||
<String ID="string_59" HPOS="1451" VPOS="452" WIDTH="65" HEIGHT="39" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="452" HPOS="1516"/>
|
||||
<String ID="string_60" HPOS="1530" VPOS="449" WIDTH="99" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="449" HPOS="1629"/>
|
||||
<String ID="string_61" HPOS="1643" VPOS="451" WIDTH="162" HEIGHT="36" WC="0.59" CONTENT="nonurny"/><SP WIDTH="16" VPOS="451" HPOS="1805"/>
|
||||
<String ID="string_62" HPOS="1821" VPOS="435" WIDTH="138" HEIGHT="39" WC="0.96" CONTENT="eirmod"/><SP WIDTH="12" VPOS="435" HPOS="1959"/>
|
||||
<String ID="string_63" HPOS="1971" VPOS="440" WIDTH="140" HEIGHT="37" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="216" VPOS="483" WIDTH="1888" HEIGHT="97">
|
||||
<String ID="string_64" HPOS="216" VPOS="543" WIDTH="165" HEIGHT="37" WC="0.97" CONTENT="invidunt"/><SP WIDTH="13" VPOS="543" HPOS="381"/>
|
||||
<String ID="string_65" HPOS="394" VPOS="546" WIDTH="39" HEIGHT="30" WC="0.97" CONTENT="ut"/><SP WIDTH="12" VPOS="546" HPOS="433"/>
|
||||
<String ID="string_66" HPOS="445" VPOS="539" WIDTH="122" HEIGHT="36" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="539" HPOS="567"/>
|
||||
<String ID="string_67" HPOS="583" VPOS="543" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="543" HPOS="618"/>
|
||||
<String ID="string_68" HPOS="632" VPOS="536" WIDTH="125" HEIGHT="34" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="536" HPOS="757"/>
|
||||
<String ID="string_69" HPOS="771" VPOS="539" WIDTH="131" HEIGHT="37" WC="0.46" CONTENT="magna"/><SP WIDTH="14" VPOS="539" HPOS="902"/>
|
||||
<String ID="string_70" HPOS="916" VPOS="526" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="526" HPOS="1098"/>
|
||||
<String ID="string_71" HPOS="1112" VPOS="527" WIDTH="82" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="527" HPOS="1194"/>
|
||||
<String ID="string_72" HPOS="1211" VPOS="519" WIDTH="63" HEIGHT="36" WC="0.97" CONTENT="sed"/><SP WIDTH="14" VPOS="519" HPOS="1274"/>
|
||||
<String ID="string_73" HPOS="1288" VPOS="517" WIDTH="97" HEIGHT="37" WC="0.96" CONTENT="diam"/><SP WIDTH="11" VPOS="517" HPOS="1385"/>
|
||||
<String ID="string_74" HPOS="1396" VPOS="513" WIDTH="185" HEIGHT="44" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="513" HPOS="1581"/>
|
||||
<String ID="string_75" HPOS="1595" VPOS="505" WIDTH="50" HEIGHT="35" WC="0.96" CONTENT="At"/><SP WIDTH="11" VPOS="505" HPOS="1645"/>
|
||||
<String ID="string_76" HPOS="1656" VPOS="511" WIDTH="89" HEIGHT="27" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="511" HPOS="1745"/>
|
||||
<String ID="string_77" HPOS="1761" VPOS="508" WIDTH="63" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="508" HPOS="1824"/>
|
||||
<String ID="string_78" HPOS="1839" VPOS="501" WIDTH="35" HEIGHT="30" WC="0.97" CONTENT="et"/><SP WIDTH="13" VPOS="501" HPOS="1874"/>
|
||||
<String ID="string_79" HPOS="1887" VPOS="499" WIDTH="168" HEIGHT="53" WC="0.80" CONTENT="accusam"/><SP WIDTH="-3" VPOS="499" HPOS="2055"/>
|
||||
<String ID="string_80" HPOS="2052" VPOS="483" WIDTH="52" HEIGHT="55" WC="0.97" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="215" VPOS="552" WIDTH="1941" HEIGHT="97">
|
||||
<String ID="string_81" HPOS="215" VPOS="604" WIDTH="97" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="604" HPOS="312"/>
|
||||
<String ID="string_82" HPOS="328" VPOS="600" WIDTH="71" HEIGHT="35" WC="0.97" CONTENT="duo"/><SP WIDTH="16" VPOS="600" HPOS="399"/>
|
||||
<String ID="string_83" HPOS="415" VPOS="597" WIDTH="143" HEIGHT="36" WC="0.93" CONTENT="dolores"/><SP WIDTH="16" VPOS="597" HPOS="558"/>
|
||||
<String ID="string_84" HPOS="574" VPOS="600" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="600" HPOS="608"/>
|
||||
<String ID="string_85" HPOS="622" VPOS="602" WIDTH="43" HEIGHT="26" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="602" HPOS="665"/>
|
||||
<String ID="string_86" HPOS="678" VPOS="590" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="590" HPOS="814"/>
|
||||
<String ID="string_87" HPOS="833" VPOS="588" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="588" HPOS="907"/>
|
||||
<String ID="string_88" HPOS="921" VPOS="584" WIDTH="83" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="12" VPOS="584" HPOS="1004"/>
|
||||
<String ID="string_89" HPOS="1016" VPOS="580" WIDTH="90" HEIGHT="36" WC="0.97" CONTENT="kasd"/><SP WIDTH="15" VPOS="580" HPOS="1106"/>
|
||||
<String ID="string_90" HPOS="1121" VPOS="578" WIDTH="205" HEIGHT="47" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="578" HPOS="1326"/>
|
||||
<String ID="string_91" HPOS="1342" VPOS="582" WIDTH="47" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="582" HPOS="1389"/>
|
||||
<String ID="string_92" HPOS="1405" VPOS="581" WIDTH="62" HEIGHT="26" WC="0.97" CONTENT="sea"/><SP WIDTH="13" VPOS="581" HPOS="1467"/>
|
||||
<String ID="string_93" HPOS="1480" VPOS="566" WIDTH="172" HEIGHT="38" WC="0.96" CONTENT="takimata"/><SP WIDTH="14" VPOS="566" HPOS="1652"/>
|
||||
<String ID="string_94" HPOS="1666" VPOS="563" WIDTH="145" HEIGHT="33" WC="0.97" CONTENT="sanctus"/><SP WIDTH="15" VPOS="563" HPOS="1811"/>
|
||||
<String ID="string_95" HPOS="1826" VPOS="558" WIDTH="54" HEIGHT="30" WC="0.97" CONTENT="est"/><SP WIDTH="12" VPOS="558" HPOS="1880"/>
|
||||
<String ID="string_96" HPOS="1892" VPOS="552" WIDTH="130" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="552" HPOS="2022"/>
|
||||
<String ID="string_97" HPOS="2037" VPOS="553" WIDTH="119" HEIGHT="37" WC="0.51" CONTENT="Ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="219" VPOS="657" WIDTH="282" HEIGHT="38">
|
||||
<String ID="string_98" HPOS="219" VPOS="658" WIDTH="104" HEIGHT="37" WC="0.97" CONTENT="dolor"/><SP WIDTH="15" VPOS="658" HPOS="323"/>
|
||||
<String ID="string_99" HPOS="338" VPOS="657" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="14" VPOS="657" HPOS="383"/>
|
||||
<String ID="string_100" HPOS="397" VPOS="660" WIDTH="104" HEIGHT="35" WC="0.94" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,47 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator></Creator>
|
||||
<Created>2019-07-26T13:59:00</Created>
|
||||
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||
<TextRegion id="tempReg357564684568544579089">
|
||||
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||
<TextLine id="l0">
|
||||
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l1">
|
||||
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l2">
|
||||
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||
<TextRegion id="r7" type="paragraph">
|
||||
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||
<TextEquiv>
|
||||
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
|
@ -0,0 +1,138 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="234" VPOS="244" WIDTH="1966" HEIGHT="387">
|
||||
<TextLine ID="line_0" HPOS="237" VPOS="244" WIDTH="1963" HEIGHT="48">
|
||||
<String ID="string_0" HPOS="237" VPOS="248" WIDTH="133" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="248" HPOS="370"/>
|
||||
<String ID="string_1" HPOS="384" VPOS="247" WIDTH="120" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="247" HPOS="504"/>
|
||||
<String ID="string_2" HPOS="519" VPOS="246" WIDTH="103" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="246" HPOS="622"/>
|
||||
<String ID="string_3" HPOS="636" VPOS="247" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="247" HPOS="682"/>
|
||||
<String ID="string_4" HPOS="696" VPOS="252" WIDTH="105" HEIGHT="36" WC="0.97" CONTENT="amet,"/><SP WIDTH="17" VPOS="252" HPOS="801"/>
|
||||
<String ID="string_5" HPOS="818" VPOS="251" WIDTH="202" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="14" VPOS="251" HPOS="1020"/>
|
||||
<String ID="string_6" HPOS="1034" VPOS="244" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="244" HPOS="1241"/>
|
||||
<String ID="string_7" HPOS="1256" VPOS="244" WIDTH="86" HEIGHT="43" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="244" HPOS="1342"/>
|
||||
<String ID="string_8" HPOS="1358" VPOS="244" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="244" HPOS="1423"/>
|
||||
<String ID="string_9" HPOS="1438" VPOS="244" WIDTH="99" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="244" HPOS="1537"/>
|
||||
<String ID="string_10" HPOS="1551" VPOS="255" WIDTH="164" HEIGHT="35" WC="0.97" CONTENT="nonumy"/><SP WIDTH="15" VPOS="255" HPOS="1715"/>
|
||||
<String ID="string_11" HPOS="1730" VPOS="244" WIDTH="139" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="244" HPOS="1869"/>
|
||||
<String ID="string_12" HPOS="1882" VPOS="250" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/><SP WIDTH="13" VPOS="250" HPOS="2022"/>
|
||||
<String ID="string_13" HPOS="2035" VPOS="244" WIDTH="165" HEIGHT="35" WC="0.96" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="237" VPOS="301" WIDTH="1913" HEIGHT="49">
|
||||
<String ID="string_14" HPOS="237" VPOS="310" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="310" HPOS="276"/>
|
||||
<String ID="string_15" HPOS="289" VPOS="304" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="304" HPOS="412"/>
|
||||
<String ID="string_16" HPOS="428" VPOS="310" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="14" VPOS="310" HPOS="462"/>
|
||||
<String ID="string_17" HPOS="476" VPOS="304" WIDTH="123" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="15" VPOS="304" HPOS="599"/>
|
||||
<String ID="string_18" HPOS="614" VPOS="313" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="313" HPOS="747"/>
|
||||
<String ID="string_19" HPOS="761" VPOS="302" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="302" HPOS="944"/>
|
||||
<String ID="string_20" HPOS="959" VPOS="308" WIDTH="81" HEIGHT="36" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="308" HPOS="1040"/>
|
||||
<String ID="string_21" HPOS="1057" VPOS="301" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="301" HPOS="1122"/>
|
||||
<String ID="string_22" HPOS="1136" VPOS="301" WIDTH="97" HEIGHT="36" WC="0.95" CONTENT="diam"/><SP WIDTH="13" VPOS="301" HPOS="1233"/>
|
||||
<String ID="string_23" HPOS="1246" VPOS="301" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="13" VPOS="301" HPOS="1429"/>
|
||||
<String ID="string_24" HPOS="1442" VPOS="303" WIDTH="51" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="303" HPOS="1493"/>
|
||||
<String ID="string_25" HPOS="1505" VPOS="312" WIDTH="88" HEIGHT="25" WC="0.96" CONTENT="vero"/><SP WIDTH="17" VPOS="312" HPOS="1593"/>
|
||||
<String ID="string_26" HPOS="1610" VPOS="312" WIDTH="64" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="16" VPOS="312" HPOS="1674"/>
|
||||
<String ID="string_27" HPOS="1690" VPOS="308" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="308" HPOS="1725"/>
|
||||
<String ID="string_28" HPOS="1739" VPOS="312" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="15" VPOS="312" HPOS="1907"/>
|
||||
<String ID="string_29" HPOS="1922" VPOS="308" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="11" VPOS="308" HPOS="1956"/>
|
||||
<String ID="string_30" HPOS="1967" VPOS="302" WIDTH="96" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="302" HPOS="2063"/>
|
||||
<String ID="string_31" HPOS="2079" VPOS="301" WIDTH="71" HEIGHT="36" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="238" VPOS="359" WIDTH="1928" HEIGHT="46">
|
||||
<String ID="string_32" HPOS="238" VPOS="361" WIDTH="144" HEIGHT="36" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="361" HPOS="382"/>
|
||||
<String ID="string_33" HPOS="398" VPOS="368" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="368" HPOS="432"/>
|
||||
<String ID="string_34" HPOS="447" VPOS="372" WIDTH="41" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="14" VPOS="372" HPOS="488"/>
|
||||
<String ID="string_35" HPOS="502" VPOS="361" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="361" HPOS="638"/>
|
||||
<String ID="string_36" HPOS="657" VPOS="363" WIDTH="75" HEIGHT="33" WC="0.97" CONTENT="Stet"/><SP WIDTH="14" VPOS="363" HPOS="732"/>
|
||||
<String ID="string_37" HPOS="746" VPOS="360" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="360" HPOS="830"/>
|
||||
<String ID="string_38" HPOS="843" VPOS="359" WIDTH="91" HEIGHT="36" WC="0.96" CONTENT="kasd"/><SP WIDTH="13" VPOS="359" HPOS="934"/>
|
||||
<String ID="string_39" HPOS="947" VPOS="359" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="359" HPOS="1155"/>
|
||||
<String ID="string_40" HPOS="1171" VPOS="370" WIDTH="47" HEIGHT="24" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="370" HPOS="1218"/>
|
||||
<String ID="string_41" HPOS="1234" VPOS="370" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="370" HPOS="1295"/>
|
||||
<String ID="string_42" HPOS="1308" VPOS="359" WIDTH="172" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="359" HPOS="1480"/>
|
||||
<String ID="string_43" HPOS="1495" VPOS="365" WIDTH="145" HEIGHT="30" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="365" HPOS="1640"/>
|
||||
<String ID="string_44" HPOS="1656" VPOS="365" WIDTH="55" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="13" VPOS="365" HPOS="1711"/>
|
||||
<String ID="string_45" HPOS="1724" VPOS="361" WIDTH="131" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="361" HPOS="1855"/>
|
||||
<String ID="string_46" HPOS="1870" VPOS="360" WIDTH="119" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="360" HPOS="1989"/>
|
||||
<String ID="string_47" HPOS="2004" VPOS="359" WIDTH="103" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="359" HPOS="2107"/>
|
||||
<String ID="string_48" HPOS="2121" VPOS="360" WIDTH="45" HEIGHT="34" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="238" VPOS="416" WIDTH="1905" HEIGHT="48">
|
||||
<String ID="string_49" HPOS="238" VPOS="425" WIDTH="105" HEIGHT="29" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="425" HPOS="343"/>
|
||||
<String ID="string_50" HPOS="359" VPOS="421" WIDTH="132" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="421" HPOS="491"/>
|
||||
<String ID="string_51" HPOS="504" VPOS="420" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="420" HPOS="625"/>
|
||||
<String ID="string_52" HPOS="640" VPOS="418" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="418" HPOS="744"/>
|
||||
<String ID="string_53" HPOS="758" VPOS="419" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="15" VPOS="419" HPOS="803"/>
|
||||
<String ID="string_54" HPOS="818" VPOS="424" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="amet,"/><SP WIDTH="17" VPOS="424" HPOS="922"/>
|
||||
<String ID="string_55" HPOS="939" VPOS="422" WIDTH="201" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="15" VPOS="422" HPOS="1140"/>
|
||||
<String ID="string_56" HPOS="1155" VPOS="416" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="416" HPOS="1362"/>
|
||||
<String ID="string_57" HPOS="1377" VPOS="417" WIDTH="86" HEIGHT="42" WC="0.96" CONTENT="elitr,"/><SP WIDTH="17" VPOS="417" HPOS="1463"/>
|
||||
<String ID="string_58" HPOS="1480" VPOS="416" WIDTH="66" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="416" HPOS="1546"/>
|
||||
<String ID="string_59" HPOS="1561" VPOS="416" WIDTH="98" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="416" HPOS="1659"/>
|
||||
<String ID="string_60" HPOS="1673" VPOS="427" WIDTH="163" HEIGHT="35" WC="0.96" CONTENT="nonumy"/><SP WIDTH="16" VPOS="427" HPOS="1836"/>
|
||||
<String ID="string_61" HPOS="1852" VPOS="416" WIDTH="138" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="416" HPOS="1990"/>
|
||||
<String ID="string_62" HPOS="2003" VPOS="422" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="236" VPOS="474" WIDTH="1897" HEIGHT="47">
|
||||
<String ID="string_63" HPOS="236" VPOS="476" WIDTH="166" HEIGHT="35" WC="0.96" CONTENT="invidunt"/><SP WIDTH="14" VPOS="476" HPOS="402"/>
|
||||
<String ID="string_64" HPOS="416" VPOS="482" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="12" VPOS="482" HPOS="455"/>
|
||||
<String ID="string_65" HPOS="467" VPOS="476" WIDTH="122" HEIGHT="35" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="476" HPOS="589"/>
|
||||
<String ID="string_66" HPOS="605" VPOS="482" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="482" HPOS="639"/>
|
||||
<String ID="string_67" HPOS="654" VPOS="475" WIDTH="125" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="475" HPOS="779"/>
|
||||
<String ID="string_68" HPOS="793" VPOS="484" WIDTH="131" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="15" VPOS="484" HPOS="924"/>
|
||||
<String ID="string_69" HPOS="939" VPOS="474" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="474" HPOS="1121"/>
|
||||
<String ID="string_70" HPOS="1136" VPOS="480" WIDTH="81" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="18" VPOS="480" HPOS="1217"/>
|
||||
<String ID="string_71" HPOS="1235" VPOS="474" WIDTH="63" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="474" HPOS="1298"/>
|
||||
<String ID="string_72" HPOS="1313" VPOS="474" WIDTH="97" HEIGHT="35" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="474" HPOS="1410"/>
|
||||
<String ID="string_73" HPOS="1423" VPOS="474" WIDTH="186" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="474" HPOS="1609"/>
|
||||
<String ID="string_74" HPOS="1623" VPOS="475" WIDTH="50" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="475" HPOS="1673"/>
|
||||
<String ID="string_75" HPOS="1685" VPOS="485" WIDTH="89" HEIGHT="24" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="485" HPOS="1774"/>
|
||||
<String ID="string_76" HPOS="1790" VPOS="484" WIDTH="63" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="484" HPOS="1853"/>
|
||||
<String ID="string_77" HPOS="1868" VPOS="480" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="480" HPOS="1902"/>
|
||||
<String ID="string_78" HPOS="1916" VPOS="484" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="16" VPOS="484" HPOS="2084"/>
|
||||
<String ID="string_79" HPOS="2100" VPOS="480" WIDTH="33" HEIGHT="29" WC="0.96" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="234" VPOS="531" WIDTH="1950" HEIGHT="47">
|
||||
<String ID="string_80" HPOS="234" VPOS="534" WIDTH="98" HEIGHT="44" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="534" HPOS="332"/>
|
||||
<String ID="string_81" HPOS="348" VPOS="533" WIDTH="71" HEIGHT="35" WC="0.96" CONTENT="duo"/><SP WIDTH="16" VPOS="533" HPOS="419"/>
|
||||
<String ID="string_82" HPOS="435" VPOS="533" WIDTH="143" HEIGHT="35" WC="0.96" CONTENT="dolores"/><SP WIDTH="15" VPOS="533" HPOS="578"/>
|
||||
<String ID="string_83" HPOS="593" VPOS="539" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="539" HPOS="628"/>
|
||||
<String ID="string_84" HPOS="642" VPOS="543" WIDTH="42" HEIGHT="25" WC="0.97" CONTENT="ea"/><SP WIDTH="14" VPOS="543" HPOS="684"/>
|
||||
<String ID="string_85" HPOS="698" VPOS="533" WIDTH="137" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="533" HPOS="835"/>
|
||||
<String ID="string_86" HPOS="853" VPOS="534" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="534" HPOS="927"/>
|
||||
<String ID="string_87" HPOS="941" VPOS="531" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="531" HPOS="1025"/>
|
||||
<String ID="string_88" HPOS="1038" VPOS="531" WIDTH="89" HEIGHT="35" WC="0.96" CONTENT="kasd"/><SP WIDTH="15" VPOS="531" HPOS="1127"/>
|
||||
<String ID="string_89" HPOS="1142" VPOS="531" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="531" HPOS="1350"/>
|
||||
<String ID="string_90" HPOS="1366" VPOS="542" WIDTH="48" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="542" HPOS="1414"/>
|
||||
<String ID="string_91" HPOS="1430" VPOS="542" WIDTH="62" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="542" HPOS="1492"/>
|
||||
<String ID="string_92" HPOS="1505" VPOS="531" WIDTH="173" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="531" HPOS="1678"/>
|
||||
<String ID="string_93" HPOS="1693" VPOS="538" WIDTH="144" HEIGHT="29" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="538" HPOS="1837"/>
|
||||
<String ID="string_94" HPOS="1853" VPOS="537" WIDTH="53" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="14" VPOS="537" HPOS="1906"/>
|
||||
<String ID="string_95" HPOS="1920" VPOS="533" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="533" HPOS="2050"/>
|
||||
<String ID="string_96" HPOS="2064" VPOS="532" WIDTH="120" HEIGHT="44" WC="0.95" CONTENT="ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="237" VPOS="590" WIDTH="282" HEIGHT="41">
|
||||
<String ID="string_97" HPOS="237" VPOS="590" WIDTH="104" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="590" HPOS="341"/>
|
||||
<String ID="string_98" HPOS="356" VPOS="591" WIDTH="45" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="591" HPOS="401"/>
|
||||
<String ID="string_99" HPOS="415" VPOS="597" WIDTH="104" HEIGHT="34" WC="0.96" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
Normal file
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
Normal file
Binary file not shown.
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
Normal file
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
Normal file
Binary file not shown.
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
Normal file
BIN
qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
Normal file
Binary file not shown.
4204
qurator/dinglehopper/tests/data/order.page.xml
Normal file
4204
qurator/dinglehopper/tests/data/order.page.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
3394
qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
qurator/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
3394
qurator/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
20186
qurator/dinglehopper/tests/data/test.alto1.xml
Normal file
20186
qurator/dinglehopper/tests/data/test.alto1.xml
Normal file
File diff suppressed because it is too large
Load diff
64
qurator/dinglehopper/tests/data/test.alto2.xml
Normal file
64
qurator/dinglehopper/tests/data/test.alto2.xml
Normal file
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v2# http://www.loc.gov/standards/alto/alto-v2.0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<OCRProcessing ID="IdOcr"><ocrProcessingStep><processingDateTime>2017-03-27</processingDateTime><processingSoftware><softwareCreator>ABBYY</softwareCreator><softwareName>ABBYY FineReader Engine</softwareName><softwareVersion>11</softwareVersion></processingSoftware></ocrProcessingStep></OCRProcessing>
|
||||
</Description>
|
||||
<Styles><TextStyle ID="font0" FONTFAMILY="Times New Roman" FONTSIZE="7"/><TextStyle ID="font1" FONTFAMILY="Times New Roman" FONTSIZE="11"/>
|
||||
</Styles>
|
||||
<Layout>
|
||||
<Page ID="Page1" PHYSICAL_IMG_NR="1" HEIGHT="2500" WIDTH="1720">
|
||||
<TopMargin HEIGHT="172" WIDTH="1720" VPOS="0" HPOS="0">
|
||||
</TopMargin>
|
||||
<LeftMargin HEIGHT="2016" WIDTH="341" VPOS="172" HPOS="0">
|
||||
</LeftMargin>
|
||||
<RightMargin HEIGHT="2016" WIDTH="111" VPOS="172" HPOS="1609">
|
||||
</RightMargin>
|
||||
<BottomMargin HEIGHT="312" WIDTH="1720" VPOS="2188" HPOS="0">
|
||||
</BottomMargin>
|
||||
<PrintSpace HEIGHT="2016" WIDTH="1268" VPOS="172" HPOS="341">
|
||||
<TextBlock ID="Page1_Block1" HEIGHT="43" WIDTH="72" VPOS="174" HPOS="936" language="de" STYLEREFS="font1">
|
||||
<TextLine HEIGHT="31" WIDTH="60" VPOS="180" HPOS="942"><String STYLE="bold" WC="0.676666677" CONTENT="142" HEIGHT="31" WIDTH="60" VPOS="180" HPOS="942"/></TextLine>
|
||||
</TextBlock>
|
||||
<ComposedBlock ID="Page1_Block2" HEIGHT="1306" WIDTH="1266" VPOS="257" HPOS="341" TYPE="container"><Shape><Polygon POINTS="348,262 1610,262 1610,1564 348,1564 348,262"/></Shape>
|
||||
<TextBlock ID="Page1_Block3" HEIGHT="776" WIDTH="1261" VPOS="257" HPOS="343" language="de" STYLEREFS="font1"><Shape><Polygon POINTS="350,262 1610,262 1610,708 992,708 992,1034 350,1034 350,262"/></Shape>
|
||||
<TextLine HEIGHT="50" WIDTH="1223" VPOS="267" HPOS="363"><String WC="0.6899999976" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="271" HPOS="363"/><SP WIDTH="16" VPOS="272" HPOS="410"/><String WC="0.7875000238" CONTENT="Zugtiere" HEIGHT="44" WIDTH="142" VPOS="270" HPOS="427"/><SP WIDTH="20" VPOS="281" HPOS="570"/><String WC="0.9499999881" CONTENT="eines" HEIGHT="34" WIDTH="82" VPOS="271" HPOS="591"/><SP WIDTH="10" VPOS="272" HPOS="674"/><String WC="0.6349999905" CONTENT="Joches" HEIGHT="42" WIDTH="113" VPOS="272" HPOS="685"/><SP WIDTH="15" VPOS="271" HPOS="799"/><String WC="0.6009091139" CONTENT="(griechisch" HEIGHT="45" WIDTH="161" VPOS="270" HPOS="815"/><SP WIDTH="19" VPOS="271" HPOS="977"/><String WC="0.7699999809" CONTENT="zygos)," HEIGHT="44" WIDTH="126" VPOS="269" HPOS="997"/><SP WIDTH="21" VPOS="272" HPOS="1124"/><String WC="0.7099999785" CONTENT="so" HEIGHT="42" WIDTH="27" VPOS="271" HPOS="1146"/><SP WIDTH="19" VPOS="280" HPOS="1174"/><String WC="0.6679999828" CONTENT="nennt" HEIGHT="32" WIDTH="94" VPOS="272" HPOS="1194"/><SP WIDTH="19" VPOS="272" HPOS="1289"/><String WC="0.4133333266" CONTENT="man" HEIGHT="23" WIDTH="72" VPOS="281" HPOS="1309"/><SP WIDTH="21" VPOS="271" HPOS="1382"/><String WC="0.5099999905" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="271" HPOS="1404"/><SP WIDTH="15" VPOS="272" HPOS="1451"/><String WC="0.8700000048" CONTENT="Zporen" HEIGHT="43" WIDTH="119" VPOS="271" HPOS="1467"/></TextLine>
|
||||
<TextLine HEIGHT="51" WIDTH="1224" VPOS="321" HPOS="363"><String WC="0.8133333325" CONTENT="der" HEIGHT="34" WIDTH="50" VPOS="325" HPOS="363"/><SP WIDTH="24" VPOS="327" HPOS="414"/><String WC="0.8700000048" CONTENT="Tonjugaten" HEIGHT="43" WIDTH="197" VPOS="326" HPOS="439"/><SP WIDTH="32" VPOS="337" HPOS="637"/><String WC="0.6499999762" CONTENT="auch" HEIGHT="43" WIDTH="70" VPOS="326" HPOS="670"/><SP WIDTH="31" VPOS="326" HPOS="741"/><String WC="0.7120000124" CONTENT="Jochsporen" HEIGHT="43" WIDTH="185" VPOS="326" HPOS="773"/><SP WIDTH="37" VPOS="336" HPOS="959"/><String WC="0.9200000167" CONTENT="oder" HEIGHT="32" WIDTH="71" VPOS="327" HPOS="997"/><SP WIDTH="31" VPOS="326" HPOS="1069"/><String WC="0.7072727084" CONTENT="Zpgosporen." HEIGHT="44" WIDTH="203" VPOS="325" HPOS="1101"/><SP WIDTH="53" VPOS="326" HPOS="1305"/><String WC="0.5320000052" CONTENT="Daher" HEIGHT="43" WIDTH="107" VPOS="326" HPOS="1359"/><SP WIDTH="36" VPOS="325" HPOS="1467"/><String WC="0.5720000267" CONTENT="heißt" HEIGHT="43" WIDTH="83" VPOS="325" HPOS="1504"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="655" VPOS="379" HPOS="363"><String WC="0.8650000095" CONTENT="auch" HEIGHT="43" WIDTH="70" VPOS="381" HPOS="363"/><SP WIDTH="29" VPOS="381" HPOS="434"/><String WC="0.6299999952" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="381" HPOS="464"/><SP WIDTH="24" VPOS="392" HPOS="511"/><String WC="0.7699999809" CONTENT="ganze" HEIGHT="33" WIDTH="94" VPOS="391" HPOS="536"/><SP WIDTH="24" VPOS="381" HPOS="631"/><String WC="0.7371428609" CONTENT="Ordnung" HEIGHT="43" WIDTH="154" VPOS="381" HPOS="656"/><SP WIDTH="24" VPOS="382" HPOS="811"/><String WC="0.800999999" CONTENT="Jochalgen." HEIGHT="43" WIDTH="182" VPOS="381" HPOS="836"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1182" VPOS="432" HPOS="406"><String WC="0.3966666758" CONTENT="Wir" HEIGHT="33" WIDTH="69" VPOS="436" HPOS="406"/><SP WIDTH="24" VPOS="446" HPOS="475"/><String WC="0.6949999928" CONTENT="wollen" HEIGHT="33" WIDTH="112" VPOS="436" HPOS="499"/><SP WIDTH="24" VPOS="445" HPOS="611"/><String WC="0.5166666508" CONTENT="nun" HEIGHT="23" WIDTH="65" VPOS="446" HPOS="635"/><SP WIDTH="24" VPOS="446" HPOS="700"/><String WC="0.7570000291" CONTENT="versuchen," HEIGHT="44" WIDTH="166" VPOS="435" HPOS="724"/><SP WIDTH="27" VPOS="446" HPOS="890"/><String WC="0.6733333468" CONTENT="uns" HEIGHT="23" WIDTH="59" VPOS="446" HPOS="917"/><SP WIDTH="25" VPOS="446" HPOS="976"/><String WC="0.6725000143" CONTENT="eine" HEIGHT="33" WIDTH="66" VPOS="436" HPOS="1001"/><SP WIDTH="25" VPOS="436" HPOS="1067"/><String WC="0.6690909266" CONTENT="Vorstellung" HEIGHT="44" WIDTH="192" VPOS="435" HPOS="1092"/><SP WIDTH="25" VPOS="446" HPOS="1284"/><String WC="0.8466666937" CONTENT="von" HEIGHT="23" WIDTH="62" VPOS="446" HPOS="1309"/><SP WIDTH="25" VPOS="436" HPOS="1371"/><String WC="0.5866666436" CONTENT="den" HEIGHT="32" WIDTH="56" VPOS="436" HPOS="1396"/><SP WIDTH="25" VPOS="436" HPOS="1452"/><String WC="0.7366666794" CONTENT="Zchon-" HEIGHT="44" WIDTH="111" VPOS="435" HPOS="1477"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1224" VPOS="486" HPOS="363"><String WC="0.7181817889" CONTENT="heitsformen" HEIGHT="45" WIDTH="199" VPOS="489" HPOS="363"/><SP WIDTH="32" VPOS="490" HPOS="563"/><String WC="0.8633333445" CONTENT="der" HEIGHT="33" WIDTH="50" VPOS="490" HPOS="596"/><SP WIDTH="31" VPOS="491" HPOS="647"/><String WC="0.7749999762" CONTENT="in" HEIGHT="33" WIDTH="30" VPOS="491" HPOS="679"/><SP WIDTH="31" VPOS="501" HPOS="710"/><String WC="0.5479999781" CONTENT="viele" HEIGHT="33" WIDTH="75" VPOS="491" HPOS="742"/><SP WIDTH="32" VPOS="502" HPOS="818"/><String WC="0.7345454693" CONTENT="artenreiche" HEIGHT="44" WIDTH="181" VPOS="490" HPOS="851"/><SP WIDTH="31" VPOS="491" HPOS="1033"/><String WC="0.7277777791" CONTENT="Gattungen" HEIGHT="43" WIDTH="181" VPOS="490" HPOS="1065"/><SP WIDTH="32" VPOS="501" HPOS="1247"/><String WC="0.7766666412" CONTENT="geteilten" HEIGHT="43" WIDTH="140" VPOS="490" HPOS="1280"/><SP WIDTH="32" VPOS="491" HPOS="1421"/><String WC="0.7514285445" CONTENT="Familie" HEIGHT="44" WIDTH="133" VPOS="489" HPOS="1454"/></TextLine>
|
||||
<TextLine HEIGHT="51" WIDTH="1225" VPOS="540" HPOS="362"><String WC="0.7633333206" CONTENT="der" HEIGHT="32" WIDTH="51" VPOS="546" HPOS="362"/><SP WIDTH="24" VPOS="544" HPOS="414"/><String WC="0.4366666675" CONTENT="OesmiäiLLeen" HEIGHT="35" WIDTH="254" VPOS="543" HPOS="439"/><SP WIDTH="29" VPOS="555" HPOS="694"/><String WC="0.8199999928" CONTENT="zu" HEIGHT="31" WIDTH="35" VPOS="556" HPOS="724"/><SP WIDTH="24" VPOS="556" HPOS="760"/><String WC="0.5699999928" CONTENT="machen." HEIGHT="44" WIDTH="131" VPOS="545" HPOS="785"/><SP WIDTH="47" VPOS="546" HPOS="917"/><String WC="0.7466666698" CONTENT="Vas" HEIGHT="33" WIDTH="68" VPOS="546" HPOS="965"/><SP WIDTH="25" VPOS="556" HPOS="1034"/><String WC="0.6685714126" CONTENT="gelingt" HEIGHT="43" WIDTH="116" VPOS="545" HPOS="1060"/><SP WIDTH="24" VPOS="545" HPOS="1177"/><String WC="0.5785714388" CONTENT="leicht," HEIGHT="43" WIDTH="95" VPOS="545" HPOS="1202"/><SP WIDTH="31" VPOS="556" HPOS="1298"/><String WC="0.6675000191" CONTENT="wenn" HEIGHT="23" WIDTH="90" VPOS="556" HPOS="1330"/><SP WIDTH="23" VPOS="556" HPOS="1421"/><String WC="0.5666666627" CONTENT="wir" HEIGHT="35" WIDTH="58" VPOS="544" HPOS="1445"/><SP WIDTH="23" VPOS="555" HPOS="1504"/><String WC="0.8000000119" CONTENT="uns" HEIGHT="23" WIDTH="59" VPOS="555" HPOS="1528"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1225" VPOS="596" HPOS="362"><String WC="0.6399999857" CONTENT="selbst" HEIGHT="42" WIDTH="84" VPOS="600" HPOS="362"/><SP WIDTH="23" VPOS="603" HPOS="447"/><String WC="0.80400002" CONTENT="etwas" HEIGHT="33" WIDTH="98" VPOS="601" HPOS="471"/><SP WIDTH="23" VPOS="601" HPOS="570"/><String WC="0.6587499976" CONTENT="Material" HEIGHT="34" WIDTH="156" VPOS="600" HPOS="594"/><SP WIDTH="24" VPOS="601" HPOS="751"/><String WC="0.7300000191" CONTENT="holen," HEIGHT="44" WIDTH="99" VPOS="600" HPOS="776"/><SP WIDTH="25" VPOS="600" HPOS="876"/><String WC="0.7516666651" CONTENT="höchst" HEIGHT="43" WIDTH="95" VPOS="600" HPOS="902"/><SP WIDTH="22" VPOS="603" HPOS="998"/><String WC="0.5454545617" CONTENT="mangelhaft," HEIGHT="44" WIDTH="206" VPOS="600" HPOS="1021"/><SP WIDTH="25" VPOS="610" HPOS="1228"/><String WC="0.7599999905" CONTENT="wenn" HEIGHT="23" WIDTH="90" VPOS="610" HPOS="1254"/><SP WIDTH="23" VPOS="610" HPOS="1345"/><String WC="0.6299999952" CONTENT="wir" HEIGHT="34" WIDTH="58" VPOS="600" HPOS="1369"/><SP WIDTH="23" VPOS="611" HPOS="1428"/><String WC="0.8100000024" CONTENT="uns" HEIGHT="24" WIDTH="59" VPOS="610" HPOS="1452"/><SP WIDTH="20" VPOS="610" HPOS="1512"/><String WC="0.5966666937" CONTENT="auf" HEIGHT="42" WIDTH="54" VPOS="600" HPOS="1533"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1224" VPOS="651" HPOS="362"><String WC="0.7933333516" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="655" HPOS="362"/><SP WIDTH="23" VPOS="655" HPOS="409"/><String WC="0.8428571224" CONTENT="Lektüre" HEIGHT="35" WIDTH="129" VPOS="654" HPOS="433"/><SP WIDTH="24" VPOS="655" HPOS="563"/><String WC="0.6150000095" CONTENT="dieses" HEIGHT="42" WIDTH="92" VPOS="655" HPOS="588"/><SP WIDTH="23" VPOS="656" HPOS="681"/><String WC="0.8766666651" CONTENT="Buches" HEIGHT="43" WIDTH="115" VPOS="655" HPOS="705"/><SP WIDTH="30" VPOS="655" HPOS="821"/><String WC="0.6575000286" CONTENT="beschränken." HEIGHT="45" WIDTH="211" VPOS="654" HPOS="852"/><SP WIDTH="46" VPOS="656" HPOS="1064"/><String WC="0.5699999928" CONTENT="Das" HEIGHT="34" WIDTH="68" VPOS="655" HPOS="1111"/><SP WIDTH="23" VPOS="656" HPOS="1180"/><String WC="0.7912499905" CONTENT="Material" HEIGHT="33" WIDTH="156" VPOS="655" HPOS="1204"/><SP WIDTH="24" VPOS="655" HPOS="1361"/><String WC="0.8199999928" CONTENT="ist" HEIGHT="42" WIDTH="33" VPOS="655" HPOS="1386"/><SP WIDTH="23" VPOS="655" HPOS="1420"/><String WC="0.6716666818" CONTENT="leicht" HEIGHT="44" WIDTH="83" VPOS="654" HPOS="1444"/><SP WIDTH="22" VPOS="657" HPOS="1528"/><String WC="0.6999999881" CONTENT="zu" HEIGHT="31" WIDTH="35" VPOS="665" HPOS="1551"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="608" VPOS="707" HPOS="361"><String WC="0.6736363769" CONTENT="beschaffen." HEIGHT="43" WIDTH="175" VPOS="709" HPOS="361"/><SP WIDTH="30" VPOS="710" HPOS="537"/><String WC="0.6533333063" CONTENT="Man" HEIGHT="33" WIDTH="84" VPOS="710" HPOS="568"/><SP WIDTH="22" VPOS="710" HPOS="653"/><String WC="0.6228571534" CONTENT="sammelt" HEIGHT="42" WIDTH="137" VPOS="710" HPOS="676"/><SP WIDTH="20" VPOS="712" HPOS="814"/><String WC="0.7666666508" CONTENT="aus" HEIGHT="24" WIDTH="57" VPOS="720" HPOS="835"/><SP WIDTH="20" VPOS="710" HPOS="893"/><String WC="0.5966666937" CONTENT="den" HEIGHT="33" WIDTH="55" VPOS="710" HPOS="914"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="607" VPOS="762" HPOS="364"><String WC="0.7990909219" CONTENT="Torflöchern" HEIGHT="44" WIDTH="195" VPOS="763" HPOS="364"/><SP WIDTH="16" VPOS="764" HPOS="559"/><String WC="0.9300000072" CONTENT="der" HEIGHT="33" WIDTH="52" VPOS="764" HPOS="575"/><SP WIDTH="8" VPOS="764" HPOS="627"/><String WC="0.7636363506" CONTENT="Niedermoore" HEIGHT="34" WIDTH="217" VPOS="764" HPOS="635"/><SP WIDTH="11" VPOS="765" HPOS="852"/><String WC="0.7620000243" CONTENT="Moose" HEIGHT="42" WIDTH="108" VPOS="765" HPOS="863"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="608" VPOS="817" HPOS="363"><String WC="1." CONTENT="oder" HEIGHT="33" WIDTH="70" VPOS="819" HPOS="363"/><SP WIDTH="28" VPOS="819" HPOS="434"/><String WC="0.6233333349" CONTENT="höhere" HEIGHT="45" WIDTH="111" VPOS="818" HPOS="463"/><SP WIDTH="28" VPOS="820" HPOS="575"/><String WC="0.6035714149" CONTENT="Wasserpflanzen" HEIGHT="44" WIDTH="260" VPOS="818" HPOS="604"/><SP WIDTH="29" VPOS="818" HPOS="865"/><String WC="0.7839999795" CONTENT="(sehr" HEIGHT="45" WIDTH="76" VPOS="818" HPOS="895"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="609" VPOS="872" HPOS="362"><String WC="0.6299999952" CONTENT="ist" HEIGHT="42" WIDTH="35" VPOS="874" HPOS="362"/><SP WIDTH="25" VPOS="875" HPOS="398"/><String WC="0.9666666389" CONTENT="der" HEIGHT="33" WIDTH="51" VPOS="875" HPOS="424"/><SP WIDTH="25" VPOS="875" HPOS="476"/><String WC="0.5278571248" CONTENT="Wasserschlauch" HEIGHT="44" WIDTH="245" VPOS="874" HPOS="502"/><SP WIDTH="25" VPOS="874" HPOS="748"/><String WC="0.8245454431" CONTENT="Utricularia" HEIGHT="36" WIDTH="197" VPOS="873" HPOS="774"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="608" VPOS="927" HPOS="361"><String WC="0.7950000167" CONTENT="zu" HEIGHT="32" WIDTH="36" VPOS="939" HPOS="361"/><SP WIDTH="24" VPOS="939" HPOS="398"/><String WC="0.7300000191" CONTENT="empfehlen)," HEIGHT="44" WIDTH="194" VPOS="928" HPOS="423"/><SP WIDTH="32" VPOS="930" HPOS="618"/><String WC="0.9433333278" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="929" HPOS="651"/><SP WIDTH="29" VPOS="940" HPOS="698"/><String WC="0.5666666627" CONTENT="mit" HEIGHT="33" WIDTH="56" VPOS="930" HPOS="728"/><SP WIDTH="23" VPOS="930" HPOS="785"/><String WC="0.7674999833" CONTENT="braunem," HEIGHT="44" WIDTH="160" VPOS="929" HPOS="809"/></TextLine>
|
||||
<TextLine HEIGHT="49" WIDTH="606" VPOS="980" HPOS="362"><String WC="0.6863636374" CONTENT="schlickigem" HEIGHT="43" WIDTH="176" VPOS="984" HPOS="362"/><SP WIDTH="32" VPOS="981" HPOS="539"/><String WC="0.6887500286" CONTENT="Überzüge" HEIGHT="45" WIDTH="157" VPOS="981" HPOS="572"/><SP WIDTH="31" VPOS="984" HPOS="730"/><String WC="0.5857142806" CONTENT="besetzt" HEIGHT="45" WIDTH="101" VPOS="983" HPOS="762"/><SP WIDTH="32" VPOS="985" HPOS="864"/><String WC="0.8379999995" CONTENT="sind." HEIGHT="42" WIDTH="71" VPOS="984" HPOS="897"/></TextLine>
|
||||
</TextBlock>
|
||||
<Illustration ID="Page1_Block4" HEIGHT="232" WIDTH="604" VPOS="1131" HPOS="374"><Shape><Polygon POINTS="378,1134 982,1134 982,1364 378,1364 378,1134"/></Shape></Illustration>
|
||||
<Illustration ID="Page1_Block5" HEIGHT="664" WIDTH="539" VPOS="732" HPOS="1013"><Shape><Polygon POINTS="1019,737 1556,737 1556,1399 1019,1399 1019,737"/></Shape></Illustration>
|
||||
<TextBlock ID="Page1_Block6" HEIGHT="140" WIDTH="1258" VPOS="1423" HPOS="345" language="de" STYLEREFS="font0"><Shape><Polygon POINTS="348,1428 1606,1428 1606,1564 348,1564 348,1428"/></Shape>
|
||||
<TextLine HEIGHT="32" WIDTH="1225" VPOS="1429" HPOS="362"><String WC="0.4325000048" CONTENT="Fig." HEIGHT="26" WIDTH="46" VPOS="1435" HPOS="362"/><SP WIDTH="22" VPOS="1438" HPOS="409"/><String WC="0.3540000021" CONTENT="J54;." HEIGHT="22" WIDTH="44" VPOS="1438" HPOS="432"/><SP WIDTH="33" VPOS="1434" HPOS="477"/><String WC="0.7620000243" CONTENT="Cosmarium." HEIGHT="22" WIDTH="139" VPOS="1433" HPOS="511"/><SP WIDTH="32" VPOS="1432" HPOS="651"/><String WC="0.4550000131" CONTENT="A." HEIGHT="21" WIDTH="30" VPOS="1432" HPOS="684"/><SP WIDTH="19" VPOS="1432" HPOS="715"/><String WC="0.7699999809" CONTENT="C." HEIGHT="21" WIDTH="25" VPOS="1432" HPOS="735"/><SP WIDTH="23" VPOS="1439" HPOS="761"/><String WC="0.6628571153" CONTENT="margaritaceum," HEIGHT="28" WIDTH="184" VPOS="1431" HPOS="785"/><SP WIDTH="30" VPOS="1432" HPOS="970"/><String WC="0.4524999857" CONTENT="Fig." HEIGHT="27" WIDTH="46" VPOS="1432" HPOS="1001"/><SP WIDTH="15" VPOS="1435" HPOS="1048"/><String WC="0.5400000215" CONTENT="J35." HEIGHT="23" WIDTH="44" VPOS="1435" HPOS="1064"/><SP WIDTH="31" VPOS="1432" HPOS="1109"/><String WC="0.7572727203" CONTENT="Clostcrium." HEIGHT="23" WIDTH="134" VPOS="1430" HPOS="1141"/><SP WIDTH="27" VPOS="1431" HPOS="1276"/><String WC="0.5199999809" CONTENT="A" HEIGHT="19" WIDTH="22" VPOS="1431" HPOS="1304"/><SP WIDTH="18" VPOS="1430" HPOS="1327"/><String WC="0.6366666555" CONTENT="CI." HEIGHT="21" WIDTH="33" VPOS="1430" HPOS="1346"/><SP WIDTH="16" VPOS="1430" HPOS="1380"/><String WC="0.6342856884" CONTENT="lunula," HEIGHT="25" WIDTH="86" VPOS="1430" HPOS="1397"/><SP WIDTH="21" VPOS="1429" HPOS="1484"/><String WC="0.6314285994" CONTENT="Linzel-" HEIGHT="26" WIDTH="81" VPOS="1429" HPOS="1506"/></TextLine>
|
||||
<TextLine HEIGHT="32" WIDTH="1225" VPOS="1461" HPOS="361"><String WC="0.5600000024" CONTENT="a" HEIGHT="13" WIDTH="13" VPOS="1474" HPOS="361"/><SP WIDTH="14" VPOS="1468" HPOS="375"/><String WC="0.5083333254" CONTENT="Lnizelzellp," HEIGHT="26" WIDTH="128" VPOS="1467" HPOS="390"/><SP WIDTH="15" VPOS="1467" HPOS="519"/><String WC="0.25" CONTENT="b" HEIGHT="20" WIDTH="13" VPOS="1466" HPOS="535"/><SP WIDTH="14" VPOS="1466" HPOS="549"/><String WC="0.5822222233" CONTENT="Iochspore" HEIGHT="26" WIDTH="112" VPOS="1465" HPOS="564"/><SP WIDTH="14" VPOS="1471" HPOS="677"/><String WC="0.3700000048" CONTENT="mit" HEIGHT="20" WIDTH="39" VPOS="1465" HPOS="692"/><SP WIDTH="10" VPOS="1465" HPOS="732"/><String WC="0.3100000024" CONTENT="den" HEIGHT="20" WIDTH="37" VPOS="1465" HPOS="743"/><SP WIDTH="13" VPOS="1471" HPOS="781"/><String WC="0.4350000024" CONTENT="entleerten" HEIGHT="21" WIDTH="111" VPOS="1464" HPOS="795"/><SP WIDTH="8" VPOS="1464" HPOS="907"/><String WC="0.7940000296" CONTENT="Zell-" HEIGHT="27" WIDTH="55" VPOS="1464" HPOS="916"/><SP WIDTH="28" VPOS="1471" HPOS="972"/><String WC="0.6333333254" CONTENT="zelle," HEIGHT="25" WIDTH="54" VPOS="1465" HPOS="1001"/><SP WIDTH="15" VPOS="1464" HPOS="1056"/><String WC="0.2800000012" CONTENT="B" HEIGHT="20" WIDTH="18" VPOS="1464" HPOS="1072"/><SP WIDTH="14" VPOS="1464" HPOS="1091"/><String WC="0.9233333468" CONTENT="CI." HEIGHT="21" WIDTH="32" VPOS="1464" HPOS="1106"/><SP WIDTH="15" VPOS="1471" HPOS="1139"/><String WC="0.8188889027" CONTENT="rostratum" HEIGHT="19" WIDTH="111" VPOS="1465" HPOS="1155"/><SP WIDTH="12" VPOS="1463" HPOS="1267"/><String WC="0.2399999946" CONTENT="(nad?" HEIGHT="25" WIDTH="62" VPOS="1463" HPOS="1280"/><SP WIDTH="8" VPOS="1464" HPOS="1343"/><String WC="0.2949999869" CONTENT="Präparat" HEIGHT="26" WIDTH="110" VPOS="1463" HPOS="1352"/><SP WIDTH="10" VPOS="1465" HPOS="1463"/><String WC="0.1566666663" CONTENT="uon" HEIGHT="16" WIDTH="41" VPOS="1467" HPOS="1474"/><SP WIDTH="8" VPOS="1463" HPOS="1516"/><String WC="0.3420000076" CONTENT="pvof." HEIGHT="27" WIDTH="61" VPOS="1461" HPOS="1525"/></TextLine>
|
||||
<TextLine HEIGHT="33" WIDTH="1224" VPOS="1493" HPOS="362"><String WC="0.6571428776" CONTENT="häuten." HEIGHT="27" WIDTH="88" VPOS="1499" HPOS="362"/><SP WIDTH="27" VPOS="1499" HPOS="451"/><String WC="0.400000006" CONTENT="B" HEIGHT="20" WIDTH="18" VPOS="1499" HPOS="479"/><SP WIDTH="15" VPOS="1499" HPOS="498"/><String WC="0.6918181777" CONTENT="Linzelzelle" HEIGHT="27" WIDTH="120" VPOS="1497" HPOS="514"/><SP WIDTH="22" VPOS="1503" HPOS="635"/><String WC="0.453333348" CONTENT="von" HEIGHT="14" WIDTH="42" VPOS="1503" HPOS="658"/><SP WIDTH="21" VPOS="1497" HPOS="701"/><String WC="0.9250000119" CONTENT="C." HEIGHT="20" WIDTH="24" VPOS="1497" HPOS="723"/><SP WIDTH="15" VPOS="1497" HPOS="748"/><String WC="0.8562499881" CONTENT="botrytis" HEIGHT="26" WIDTH="89" VPOS="1497" HPOS="764"/><SP WIDTH="18" VPOS="1502" HPOS="854"/><String WC="0.4499999881" CONTENT="mit" HEIGHT="21" WIDTH="40" VPOS="1496" HPOS="873"/><SP WIDTH="19" VPOS="1498" HPOS="914"/><String WC="0.6700000167" CONTENT="un-" HEIGHT="15" WIDTH="38" VPOS="1502" HPOS="934"/><SP WIDTH="29" VPOS="1496" HPOS="973"/><String WC="0.5155555606" CONTENT="Homfeld)," HEIGHT="27" WIDTH="115" VPOS="1496" HPOS="1003"/><SP WIDTH="20" VPOS="1497" HPOS="1119"/><String WC="0.3355555534" CONTENT=")ochspore" HEIGHT="28" WIDTH="112" VPOS="1495" HPOS="1140"/><SP WIDTH="14" VPOS="1501" HPOS="1253"/><String WC="0.853333354" CONTENT="mit" HEIGHT="20" WIDTH="39" VPOS="1495" HPOS="1268"/><SP WIDTH="13" VPOS="1495" HPOS="1308"/><String WC="0.5233333111" CONTENT="den" HEIGHT="20" WIDTH="37" VPOS="1495" HPOS="1322"/><SP WIDTH="13" VPOS="1494" HPOS="1360"/><String WC="0.4783333242" CONTENT="leeren" HEIGHT="22" WIDTH="65" VPOS="1494" HPOS="1374"/><SP WIDTH="10" VPOS="1494" HPOS="1440"/><String WC="0.6600000262" CONTENT="Zellhäuten," HEIGHT="28" WIDTH="135" VPOS="1493" HPOS="1451"/></TextLine>
|
||||
<TextLine HEIGHT="29" WIDTH="839" VPOS="1527" HPOS="568"><String WC="0.4187499881" CONTENT="gleichen" HEIGHT="27" WIDTH="90" VPOS="1529" HPOS="568"/><SP WIDTH="14" VPOS="1529" HPOS="659"/><String WC="0.6687499881" CONTENT="Hälften." HEIGHT="27" WIDTH="97" VPOS="1529" HPOS="674"/><SP WIDTH="411" VPOS="1527" HPOS="772"/><String WC="0.7599999905" CONTENT="in" HEIGHT="21" WIDTH="22" VPOS="1527" HPOS="1184"/><SP WIDTH="13" VPOS="1534" HPOS="1207"/><String WC="0.4300000072" CONTENT="zwei" HEIGHT="26" WIDTH="50" VPOS="1527" HPOS="1221"/><SP WIDTH="15" VPOS="1527" HPOS="1272"/><String WC="0.6629999876" CONTENT="Ansichten." HEIGHT="26" WIDTH="119" VPOS="1527" HPOS="1288"/></TextLine>
|
||||
</TextBlock></ComposedBlock>
|
||||
<TextBlock ID="Page1_Block7" HEIGHT="610" WIDTH="1241" VPOS="1578" HPOS="354" language="de" STYLEREFS="font1"><Shape><Polygon POINTS="357,1583 1596,1583 1596,2189 357,2189 357,1583"/></Shape>
|
||||
<TextLine HEIGHT="49" WIDTH="1224" VPOS="1583" HPOS="363"><String WC="0.6650000215" CONTENT="Zu" HEIGHT="34" WIDTH="45" VPOS="1589" HPOS="363"/><SP WIDTH="37" VPOS="1590" HPOS="409"/><String WC="0.7360000014" CONTENT="hause" HEIGHT="43" WIDTH="97" VPOS="1589" HPOS="447"/><SP WIDTH="37" VPOS="1588" HPOS="545"/><String WC="0.7419999838" CONTENT="spült" HEIGHT="43" WIDTH="77" VPOS="1587" HPOS="583"/><SP WIDTH="32" VPOS="1589" HPOS="661"/><String WC="0.6266666651" CONTENT="man" HEIGHT="24" WIDTH="75" VPOS="1597" HPOS="694"/><SP WIDTH="37" VPOS="1587" HPOS="770"/><String WC="0.9300000072" CONTENT="die" HEIGHT="34" WIDTH="46" VPOS="1587" HPOS="808"/><SP WIDTH="36" VPOS="1596" HPOS="855"/><String WC="0.8169230819" CONTENT="mitgenommenen" HEIGHT="43" WIDTH="280" VPOS="1586" HPOS="892"/><SP WIDTH="38" VPOS="1586" HPOS="1173"/><String WC="0.7077777982" CONTENT="Pröbchen," HEIGHT="43" WIDTH="172" VPOS="1585" HPOS="1212"/><SP WIDTH="39" VPOS="1584" HPOS="1385"/><String WC="0.5366666913" CONTENT="die" HEIGHT="35" WIDTH="46" VPOS="1584" HPOS="1425"/><SP WIDTH="40" VPOS="1594" HPOS="1472"/><String WC="0.6233333349" CONTENT="man" HEIGHT="24" WIDTH="74" VPOS="1594" HPOS="1513"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1639" HPOS="363"><String WC="0.6377778053" CONTENT="natürlich" HEIGHT="43" WIDTH="148" VPOS="1644" HPOS="363"/><SP WIDTH="43" VPOS="1643" HPOS="512"/><String WC="0.5960000157" CONTENT="nicht" HEIGHT="43" WIDTH="75" VPOS="1642" HPOS="556"/><SP WIDTH="41" VPOS="1642" HPOS="632"/><String WC="0.7549999952" CONTENT="literweise" HEIGHT="43" WIDTH="157" VPOS="1642" HPOS="674"/><SP WIDTH="42" VPOS="1642" HPOS="832"/><String WC="0.6299999952" CONTENT="sammelt," HEIGHT="43" WIDTH="156" VPOS="1641" HPOS="875"/><SP WIDTH="43" VPOS="1641" HPOS="1032"/><String WC="1." CONTENT="in" HEIGHT="34" WIDTH="30" VPOS="1641" HPOS="1076"/><SP WIDTH="41" VPOS="1651" HPOS="1107"/><String WC="0.6600000262" CONTENT="wenig" HEIGHT="44" WIDTH="102" VPOS="1640" HPOS="1149"/><SP WIDTH="37" VPOS="1641" HPOS="1252"/><String WC="0.6949999928" CONTENT="Wasser" HEIGHT="42" WIDTH="118" VPOS="1640" HPOS="1290"/><SP WIDTH="37" VPOS="1650" HPOS="1409"/><String WC="0.8700000048" CONTENT="ab" HEIGHT="33" WIDTH="39" VPOS="1640" HPOS="1447"/><SP WIDTH="38" VPOS="1639" HPOS="1487"/><String WC="0.3733333349" CONTENT="und" HEIGHT="33" WIDTH="61" VPOS="1639" HPOS="1526"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1226" VPOS="1693" HPOS="362"><String WC="0.7250000238" CONTENT="bringt" HEIGHT="42" WIDTH="107" VPOS="1699" HPOS="362"/><SP WIDTH="43" VPOS="1700" HPOS="469"/><String WC="0.6857143044" CONTENT="winzige" HEIGHT="44" WIDTH="131" VPOS="1697" HPOS="512"/><SP WIDTH="36" VPOS="1698" HPOS="643"/><String WC="0.7214285731" CONTENT="Partien" HEIGHT="43" WIDTH="129" VPOS="1697" HPOS="679"/><SP WIDTH="46" VPOS="1697" HPOS="808"/><String WC="0.7133333087" CONTENT="des" HEIGHT="35" WIDTH="53" VPOS="1696" HPOS="854"/><SP WIDTH="46" VPOS="1706" HPOS="907"/><String WC="0.7216666937" CONTENT="abgeklopften" HEIGHT="43" WIDTH="222" VPOS="1696" HPOS="953"/><SP WIDTH="38" VPOS="1696" HPOS="1175"/><String WC="0.5181818008" CONTENT="Scf]lid?es-" HEIGHT="43" WIDTH="151" VPOS="1695" HPOS="1213"/><SP WIDTH="32" VPOS="1705" HPOS="1364"/><String WC="0.7933333516" CONTENT="mit" HEIGHT="35" WIDTH="57" VPOS="1694" HPOS="1396"/><SP WIDTH="37" VPOS="1696" HPOS="1453"/><String WC="0.7400000095" CONTENT="einem" HEIGHT="35" WIDTH="98" VPOS="1694" HPOS="1490"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="1224" VPOS="1749" HPOS="363"><String WC="0.7430769205" CONTENT="Wassertropfen" HEIGHT="43" WIDTH="240" VPOS="1753" HPOS="363"/><SP WIDTH="32" VPOS="1763" HPOS="604"/><String WC="0.6000000238" CONTENT="auf" HEIGHT="42" WIDTH="55" VPOS="1752" HPOS="637"/><SP WIDTH="29" VPOS="1752" HPOS="693"/><String WC="0.6359999776" CONTENT="einen" HEIGHT="34" WIDTH="87" VPOS="1752" HPOS="723"/><SP WIDTH="31" VPOS="1753" HPOS="811"/><String WC="0.7069230676" CONTENT="Objektträger." HEIGHT="44" WIDTH="233" VPOS="1751" HPOS="843"/><SP WIDTH="51" VPOS="1752" HPOS="1077"/><String WC="0.6866666675" CONTENT="Mit" HEIGHT="35" WIDTH="65" VPOS="1750" HPOS="1129"/><SP WIDTH="29" VPOS="1752" HPOS="1195"/><String WC="0.6750000119" CONTENT="zwei" HEIGHT="42" WIDTH="75" VPOS="1750" HPOS="1225"/><SP WIDTH="30" VPOS="1750" HPOS="1301"/><String WC="0.7866666913" CONTENT="feinen" HEIGHT="42" WIDTH="101" VPOS="1750" HPOS="1332"/><SP WIDTH="30" VPOS="1751" HPOS="1434"/><String WC="0.6683333516" CONTENT="Nadeln" HEIGHT="35" WIDTH="122" VPOS="1749" HPOS="1465"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1804" HPOS="363"><String WC="0.7785714269" CONTENT="breitet" HEIGHT="33" WIDTH="109" VPOS="1809" HPOS="363"/><SP WIDTH="23" VPOS="1810" HPOS="473"/><String WC="0.4099999964" CONTENT="man" HEIGHT="24" WIDTH="74" VPOS="1818" HPOS="497"/><SP WIDTH="24" VPOS="1808" HPOS="572"/><String WC="0.8100000024" CONTENT="das" HEIGHT="33" WIDTH="56" VPOS="1808" HPOS="597"/><SP WIDTH="19" VPOS="1808" HPOS="654"/><String WC="0.7633333206" CONTENT="Klümpchen" HEIGHT="43" WIDTH="186" VPOS="1807" HPOS="674"/><SP WIDTH="24" VPOS="1817" HPOS="861"/><String WC="0.678888917" CONTENT="möglichst" HEIGHT="44" WIDTH="151" VPOS="1806" HPOS="886"/><SP WIDTH="23" VPOS="1809" HPOS="1038"/><String WC="0.6850000024" CONTENT="weit" HEIGHT="34" WIDTH="71" VPOS="1806" HPOS="1062"/><SP WIDTH="23" VPOS="1809" HPOS="1134"/><String WC="0.6025000215" CONTENT="aus," HEIGHT="33" WIDTH="68" VPOS="1816" HPOS="1158"/><SP WIDTH="25" VPOS="1805" HPOS="1227"/><String WC="0.7080000043" CONTENT="damit" HEIGHT="34" WIDTH="98" VPOS="1805" HPOS="1253"/><SP WIDTH="23" VPOS="1807" HPOS="1352"/><String WC="1." CONTENT="es" HEIGHT="24" WIDTH="31" VPOS="1815" HPOS="1376"/><SP WIDTH="25" VPOS="1807" HPOS="1408"/><String WC="0.8366666436" CONTENT="übersicht" HEIGHT="44" WIDTH="140" VPOS="1804" HPOS="1434" SUBS_TYPE="HypPart1" SUBS_CONTENT="übersichtlich"/><HYP CONTENT=""/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1859" HPOS="363"><String WC="0.6650000215" CONTENT="lich" HEIGHT="43" WIDTH="52" VPOS="1864" HPOS="363" SUBS_TYPE="HypPart2" SUBS_CONTENT="übersichtlich"/><SP WIDTH="31" VPOS="1864" HPOS="416"/><String WC="0.5849999785" CONTENT="wird" HEIGHT="33" WIDTH="76" VPOS="1864" HPOS="448"/><SP WIDTH="31" VPOS="1863" HPOS="525"/><String WC="0.9066666961" CONTENT="und" HEIGHT="34" WIDTH="61" VPOS="1862" HPOS="557"/><SP WIDTH="31" VPOS="1862" HPOS="619"/><String WC="0.8728571534" CONTENT="bedeckt" HEIGHT="34" WIDTH="119" VPOS="1862" HPOS="651"/><SP WIDTH="30" VPOS="1863" HPOS="771"/><String WC="0.7833333611" CONTENT="das" HEIGHT="33" WIDTH="57" VPOS="1863" HPOS="802"/><SP WIDTH="24" VPOS="1862" HPOS="860"/><String WC="0.7537500262" CONTENT="Präparat" HEIGHT="43" WIDTH="161" VPOS="1862" HPOS="885"/><SP WIDTH="27" VPOS="1863" HPOS="1047"/><String WC="0.7566666603" CONTENT="mit" HEIGHT="34" WIDTH="56" VPOS="1861" HPOS="1075"/><SP WIDTH="24" VPOS="1863" HPOS="1132"/><String WC="0.7179999948" CONTENT="einem" HEIGHT="34" WIDTH="96" VPOS="1861" HPOS="1157"/><SP WIDTH="24" VPOS="1861" HPOS="1254"/><String WC="0.6629999876" CONTENT="veckglase." HEIGHT="42" WIDTH="171" VPOS="1861" HPOS="1279"/><SP WIDTH="47" VPOS="1860" HPOS="1451"/><String WC="1." CONTENT="Beim" HEIGHT="34" WIDTH="88" VPOS="1859" HPOS="1499"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1223" VPOS="1914" HPOS="364"><String WC="0.5649999976" CONTENT="Züchen" HEIGHT="43" WIDTH="115" VPOS="1919" HPOS="364"/><SP WIDTH="24" VPOS="1929" HPOS="480"/><String WC="0.8666666746" CONTENT="mit" HEIGHT="35" WIDTH="55" VPOS="1918" HPOS="505"/><SP WIDTH="22" VPOS="1919" HPOS="561"/><String WC="0.8566666842" CONTENT="mittlerer" HEIGHT="33" WIDTH="148" VPOS="1918" HPOS="584"/><SP WIDTH="24" VPOS="1918" HPOS="733"/><String WC="0.6583333611" CONTENT="Vergrößerung" HEIGHT="44" WIDTH="238" VPOS="1917" HPOS="758"/><SP WIDTH="31" VPOS="1927" HPOS="997"/><String WC="0.4524999857" CONTENT="wird" HEIGHT="34" WIDTH="78" VPOS="1916" HPOS="1029"/><SP WIDTH="24" VPOS="1917" HPOS="1108"/><String WC="0.6800000072" CONTENT="man" HEIGHT="25" WIDTH="73" VPOS="1926" HPOS="1133"/><SP WIDTH="25" VPOS="1916" HPOS="1207"/><String WC="0.6316666603" CONTENT="Formen" HEIGHT="42" WIDTH="132" VPOS="1916" HPOS="1233"/><SP WIDTH="24" VPOS="1915" HPOS="1366"/><String WC="0.7300000191" CONTENT="finden," HEIGHT="43" WIDTH="116" VPOS="1914" HPOS="1391"/><SP WIDTH="32" VPOS="1915" HPOS="1508"/><String WC="0.8633333445" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="1915" HPOS="1541"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1222" VPOS="1969" HPOS="365"><String WC="0.6333333254" CONTENT="aus" HEIGHT="23" WIDTH="58" VPOS="1984" HPOS="365"/><SP WIDTH="29" VPOS="1984" HPOS="424"/><String WC="0.7825000286" CONTENT="zwei" HEIGHT="41" WIDTH="74" VPOS="1974" HPOS="454"/><SP WIDTH="31" VPOS="1973" HPOS="529"/><String WC="0.6437500119" CONTENT="einander" HEIGHT="34" WIDTH="147" VPOS="1972" HPOS="561"/><SP WIDTH="30" VPOS="1983" HPOS="709"/><String WC="0.6938889027" CONTENT="gegenüberstehenden" HEIGHT="44" WIDTH="336" VPOS="1972" HPOS="740"/><SP WIDTH="32" VPOS="1972" HPOS="1077"/><String WC="0.5190908909" CONTENT="Halbkreisen" HEIGHT="45" WIDTH="198" VPOS="1970" HPOS="1110"/><SP WIDTH="33" VPOS="1970" HPOS="1309"/><String WC="0.2849999964" CONTENT="in" HEIGHT="34" WIDTH="31" VPOS="1970" HPOS="1343"/><SP WIDTH="33" VPOS="1970" HPOS="1375"/><String WC="0.8033333421" CONTENT="der" HEIGHT="33" WIDTH="51" VPOS="1970" HPOS="1409"/><SP WIDTH="30" VPOS="1971" HPOS="1461"/><String WC="0.8820000291" CONTENT="Mitte" HEIGHT="35" WIDTH="95" VPOS="1969" HPOS="1492"/></TextLine>
|
||||
<TextLine HEIGHT="49" WIDTH="1227" VPOS="2023" HPOS="361"><String WC="0.6323529482" CONTENT="zusammengewachsen" HEIGHT="43" WIDTH="350" VPOS="2028" HPOS="361"/><SP WIDTH="32" VPOS="2038" HPOS="711"/><String WC="0.7599999905" CONTENT="erscheinen" HEIGHT="44" WIDTH="163" VPOS="2027" HPOS="743"/><SP WIDTH="26" VPOS="2025" HPOS="906"/><String WC="0.8854545355" CONTENT="(Cosmarium," HEIGHT="44" WIDTH="238" VPOS="2025" HPOS="932"/><SP WIDTH="31" VPOS="2026" HPOS="1170"/><String WC="0.7774999738" CONTENT="Fig." HEIGHT="42" WIDTH="68" VPOS="2026" HPOS="1201"/><SP WIDTH="30" VPOS="2029" HPOS="1269"/><String WC="0.6140000224" CONTENT="134)," HEIGHT="43" WIDTH="84" VPOS="2024" HPOS="1299"/><SP WIDTH="34" VPOS="2035" HPOS="1383"/><String WC="0.6825000048" CONTENT="oder" HEIGHT="33" WIDTH="72" VPOS="2025" HPOS="1417"/><SP WIDTH="24" VPOS="2034" HPOS="1489"/><String WC="0.6833333373" CONTENT="man" HEIGHT="24" WIDTH="75" VPOS="2034" HPOS="1513"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="1223" VPOS="2079" HPOS="365"><String WC="0.7799999714" CONTENT="findet" HEIGHT="41" WIDTH="94" VPOS="2083" HPOS="365"/><SP WIDTH="18" VPOS="2083" HPOS="460"/><String WC="0.8355555534" CONTENT="türkische" HEIGHT="44" WIDTH="142" VPOS="2082" HPOS="479"/><SP WIDTH="15" VPOS="2083" HPOS="622"/><String WC="0.6140000224" CONTENT="Halbmonde," HEIGHT="43" WIDTH="203" VPOS="2082" HPOS="638"/><SP WIDTH="20" VPOS="2083" HPOS="842"/><String WC="0.7233333588" CONTENT="die" HEIGHT="34" WIDTH="46" VPOS="2082" HPOS="863"/><SP WIDTH="21" VPOS="2092" HPOS="910"/><String WC="0.5899999738" CONTENT="genau" HEIGHT="33" WIDTH="101" VPOS="2091" HPOS="932"/><SP WIDTH="17" VPOS="2081" HPOS="1034"/><String WC="0.6620000005" CONTENT="durch" HEIGHT="43" WIDTH="86" VPOS="2081" HPOS="1052"/><SP WIDTH="20" VPOS="2081" HPOS="1139"/><String WC="0.6340000033" CONTENT="einen" HEIGHT="35" WIDTH="87" VPOS="2080" HPOS="1160"/><SP WIDTH="15" VPOS="2081" HPOS="1248"/><String WC="0.7910000086" CONTENT="Ouerstrich" HEIGHT="43" WIDTH="168" VPOS="2080" HPOS="1264"/><SP WIDTH="21" VPOS="2080" HPOS="1433"/><String WC="0.5950000286" CONTENT="halbiert" HEIGHT="44" WIDTH="133" VPOS="2079" HPOS="1455"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1222" VPOS="2133" HPOS="365"><String WC="0.5674999952" CONTENT="sind" HEIGHT="43" WIDTH="62" VPOS="2137" HPOS="365"/><SP WIDTH="37" VPOS="2137" HPOS="428"/><String WC="0.8000000119" CONTENT="und" HEIGHT="34" WIDTH="61" VPOS="2137" HPOS="466"/><SP WIDTH="38" VPOS="2136" HPOS="528"/><String WC="0.6499999762" CONTENT="an" HEIGHT="24" WIDTH="40" VPOS="2147" HPOS="567"/><SP WIDTH="33" VPOS="2137" HPOS="608"/><String WC="0.8183333278" CONTENT="beiden" HEIGHT="35" WIDTH="107" VPOS="2137" HPOS="642"/><SP WIDTH="34" VPOS="2138" HPOS="750"/><String WC="0.4499999881" CONTENT="Enden" HEIGHT="34" WIDTH="106" VPOS="2137" HPOS="785"/><SP WIDTH="36" VPOS="2137" HPOS="892"/><String WC="0.8600000143" CONTENT="je" HEIGHT="44" WIDTH="27" VPOS="2137" HPOS="929"/><SP WIDTH="34" VPOS="2146" HPOS="957"/><String WC="0.7225000262" CONTENT="eine" HEIGHT="34" WIDTH="64" VPOS="2136" HPOS="992"/><SP WIDTH="33" VPOS="2136" HPOS="1057"/><String WC="0.9139999747" CONTENT="kreisrunde" HEIGHT="35" WIDTH="180" VPOS="2135" HPOS="1091"/><SP WIDTH="33" VPOS="2135" HPOS="1272"/><String WC="0.6079999804" CONTENT="Blase" HEIGHT="43" WIDTH="89" VPOS="2135" HPOS="1306"/><SP WIDTH="33" VPOS="2145" HPOS="1396"/><String WC="0.7266666889" CONTENT="enthalten" HEIGHT="46" WIDTH="157" VPOS="2133" HPOS="1430"/></TextLine>
|
||||
</TextBlock><GraphicalElement ID="Page1_Block8" HEIGHT="184" WIDTH="8" VPOS="900" HPOS="1258"/><GraphicalElement ID="Page1_Block9" HEIGHT="90" WIDTH="3" VPOS="896" HPOS="1427"/><GraphicalElement ID="Page1_Block10" HEIGHT="146" WIDTH="7" VPOS="885" HPOS="1544"/>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
37
qurator/dinglehopper/tests/data/test.alto3.xml
Normal file
37
qurator/dinglehopper/tests/data/test.alto3.xml
Normal file
|
@ -0,0 +1,37 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#">
|
||||
<Layout>
|
||||
<Page WIDTH="1148" HEIGHT="1852" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="1148" HEIGHT="1852">
|
||||
<TextBlock ID="block_3" HPOS="135" VPOS="251" WIDTH="741" HEIGHT="47">
|
||||
<TextLine ID="line_3" HPOS="135" VPOS="251" WIDTH="741" HEIGHT="47">
|
||||
<String ID="string_5" HPOS="135" VPOS="251" WIDTH="65" HEIGHT="34" WC="0.89" CONTENT="über"/><SP WIDTH="19" VPOS="251" HPOS="200"/>
|
||||
<String ID="string_6" HPOS="219" VPOS="256" WIDTH="41" HEIGHT="31" WC="0.96" CONTENT="die"/><SP WIDTH="23" VPOS="256" HPOS="260"/>
|
||||
<String ID="string_7" HPOS="283" VPOS="258" WIDTH="87" HEIGHT="30" WC="0.87" CONTENT="vielen"/><SP WIDTH="16" VPOS="258" HPOS="370"/>
|
||||
<String ID="string_8" HPOS="386" VPOS="259" WIDTH="118" HEIGHT="37" WC="0.96" CONTENT="Sorgen"/><SP WIDTH="14" VPOS="259" HPOS="504"/>
|
||||
<String ID="string_9" HPOS="518" VPOS="265" WIDTH="90" HEIGHT="32" WC="0.21" CONTENT="wegen"/><SP WIDTH="12" VPOS="265" HPOS="608"/>
|
||||
<String ID="string_10" HPOS="620" VPOS="254" WIDTH="130" HEIGHT="42" WC="0.21" CONTENT="deſſelben"/><SP WIDTH="24" VPOS="254" HPOS="750"/>
|
||||
<String ID="string_11" HPOS="774" VPOS="255" WIDTH="102" HEIGHT="43" WC="0.74" CONTENT="vergaß"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
<TextBlock ID="block_4" HPOS="134" VPOS="304" WIDTH="740" HEIGHT="40">
|
||||
<TextLine ID="line_4" HPOS="134" VPOS="304" WIDTH="740" HEIGHT="40">
|
||||
<String ID="string_12" HPOS="134" VPOS="304" WIDTH="203" HEIGHT="40" WC="0.75" CONTENT="Hartkopf,"/><SP WIDTH="30" VPOS="304" HPOS="337"/>
|
||||
<String ID="string_13" HPOS="367" VPOS="310" WIDTH="45" HEIGHT="27" WC="0.93" CONTENT="der"/><SP WIDTH="24" VPOS="310" HPOS="412"/>
|
||||
<String ID="string_14" HPOS="436" VPOS="309" WIDTH="74" HEIGHT="35" WC="0.59" CONTENT="Frau"/><SP WIDTH="22" VPOS="309" HPOS="510"/>
|
||||
<String ID="string_15" HPOS="532" VPOS="306" WIDTH="189" HEIGHT="36" WC="0.23" CONTENT="Amtmännin"/><SP WIDTH="16" VPOS="306" HPOS="721"/>
|
||||
<String ID="string_16" HPOS="737" VPOS="307" WIDTH="66" HEIGHT="34" WC="0.52" CONTENT="das"/><SP WIDTH="16" VPOS="307" HPOS="803"/>
|
||||
<String ID="string_17" HPOS="819" VPOS="318" WIDTH="55" HEIGHT="24" WC="0.0" CONTENT="ver-"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
<TextBlock ID="block_5" HPOS="134" VPOS="356" WIDTH="761" HEIGHT="46">
|
||||
<TextLine ID="line_5" HPOS="134" VPOS="356" WIDTH="761" HEIGHT="46">
|
||||
<String ID="string_18" HPOS="134" VPOS="356" WIDTH="137" HEIGHT="37" WC="0.92" CONTENT="ſprochene"/><SP WIDTH="31" VPOS="356" HPOS="271"/>
|
||||
<String ID="string_19" HPOS="302" VPOS="365" WIDTH="32" HEIGHT="30" WC="0.73" CONTENT="zu"/><SP WIDTH="29" VPOS="365" HPOS="334"/>
|
||||
<String ID="string_20" HPOS="363" VPOS="356" WIDTH="170" HEIGHT="39" WC="0.52" CONTENT="überliefern."/><SP WIDTH="28" VPOS="356" HPOS="533"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
3394
qurator/dinglehopper/tests/data/test.page2018.xml
Normal file
3394
qurator/dinglehopper/tests/data/test.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
1
qurator/dinglehopper/tests/data/test.txt
Normal file
1
qurator/dinglehopper/tests/data/test.txt
Normal file
|
@ -0,0 +1 @@
|
|||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
63
qurator/dinglehopper/tests/test_align.py
Normal file
63
qurator/dinglehopper/tests/test_align.py
Normal file
|
@ -0,0 +1,63 @@
|
|||
from .util import unzip
|
||||
from .. import align
|
||||
|
||||
|
||||
def test_left_empty():
|
||||
result = list(align('', 'foo'))
|
||||
expected = [(None, 'f'), (None, 'o'), (None, 'o')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_right_empty():
|
||||
result = list(align('foo', ''))
|
||||
expected = [('f', None), ('o', None), ('o', None)]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_left_longer():
|
||||
result = list(align('food', 'foo'))
|
||||
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_right_longer():
|
||||
result = list(align('foo', 'food'))
|
||||
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_some_diff():
|
||||
result = list(align('abcde', 'aaadef'))
|
||||
left, right = unzip(result)
|
||||
assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
|
||||
assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
|
||||
|
||||
|
||||
def test_longer():
|
||||
s1 = 'Dies ist eine Tst!'
|
||||
s2 = 'Dies ist ein Test.'
|
||||
|
||||
result = list(align(s1, s2)) # ; diffprint(*unzip(result))
|
||||
expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
|
||||
('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
|
||||
('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
|
||||
('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_completely_different():
|
||||
assert len(list(align('abcde', 'fghij'))) == 5
|
||||
|
||||
|
||||
def test_with_some_fake_ocr_errors():
|
||||
result = list(align('Über die vielen Sorgen wegen desselben vergaß',
|
||||
'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
|
||||
left, right = unzip(result)
|
||||
|
||||
# Beginning
|
||||
assert list(left[:18]) == [None]*18
|
||||
assert list(right[:18]) == list('SomeJunk MoreJunk ')
|
||||
|
||||
# End
|
||||
assert list(left[-1:]) == ['ß']
|
||||
assert list(right[-1:]) == ['b']
|
37
qurator/dinglehopper/tests/test_character_error_rate.py
Normal file
37
qurator/dinglehopper/tests/test_character_error_rate.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import math
|
||||
import unicodedata
|
||||
|
||||
from .. import character_error_rate
|
||||
|
||||
|
||||
def test_character_error_rate():
|
||||
assert character_error_rate('a', 'a') == 0
|
||||
assert character_error_rate('a', 'b') == 1/1
|
||||
assert character_error_rate('Foo', 'Bar') == 3/3
|
||||
|
||||
assert character_error_rate('Foo', '') == 3/3
|
||||
|
||||
assert character_error_rate('', '') == 0
|
||||
assert math.isinf(character_error_rate('', 'Foo'))
|
||||
|
||||
assert character_error_rate('Foo', 'Food') == 1/3
|
||||
assert character_error_rate('Fnord', 'Food') == 2/5
|
||||
assert character_error_rate('Müll', 'Mull') == 1/4
|
||||
assert character_error_rate('Abstand', 'Sand') == 4/7
|
||||
|
||||
|
||||
def test_character_error_rate_hard():
|
||||
s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
|
||||
s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed!
|
||||
assert character_error_rate(s1, s2) == 1/19
|
||||
|
||||
s1 = 'Schlyñ'
|
||||
assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
|
||||
s2 = 'Schlym̃'
|
||||
assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
|
||||
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
|
||||
assert character_error_rate(s2, s1) == 1/6
|
||||
assert character_error_rate(s1, s2) == 1/6
|
40
qurator/dinglehopper/tests/test_edit_distance.py
Normal file
40
qurator/dinglehopper/tests/test_edit_distance.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
|
||||
from .. import levenshtein, distance
|
||||
|
||||
|
||||
def test_levenshtein():
|
||||
assert levenshtein('a', 'a') == 0
|
||||
assert levenshtein('a', 'b') == 1
|
||||
assert levenshtein('Foo', 'Bar') == 3
|
||||
|
||||
assert levenshtein('', '') == 0
|
||||
assert levenshtein('Foo', '') == 3
|
||||
assert levenshtein('', 'Foo') == 3
|
||||
|
||||
assert levenshtein('Foo', 'Food') == 1
|
||||
assert levenshtein('Fnord', 'Food') == 2
|
||||
assert levenshtein('Müll', 'Mull') == 1
|
||||
assert levenshtein('Abstand', 'Sand') == 4
|
||||
|
||||
|
||||
def test_levenshtein_other_sequences():
|
||||
assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
|
||||
assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
|
||||
|
||||
|
||||
def test_distance():
|
||||
assert distance('Fnord', 'Food') == 2
|
||||
assert distance('Müll', 'Mull') == 1
|
||||
|
||||
word1 = unicodedata.normalize('NFC', 'Schlyñ')
|
||||
word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!
|
||||
assert distance(word1, word2) == 0
|
||||
|
||||
word1 = 'Schlyñ'
|
||||
assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
|
||||
word2 = 'Schlym̃'
|
||||
assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
assert distance(word1, word2) == 1
|
38
qurator/dinglehopper/tests/test_editops.py
Normal file
38
qurator/dinglehopper/tests/test_editops.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
from .. import seq_editops, editops
|
||||
|
||||
|
||||
def test_trivial():
|
||||
assert seq_editops('abc', 'abc') == []
|
||||
assert seq_editops('', '') == []
|
||||
|
||||
|
||||
def test_insert():
|
||||
assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
|
||||
assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
|
||||
assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
|
||||
assert seq_editops('', 'a') == [('insert', 0, 0)]
|
||||
|
||||
|
||||
def test_multiple():
|
||||
assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
|
||||
|
||||
|
||||
def test_delete():
|
||||
assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
|
||||
assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
|
||||
assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
|
||||
assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
|
||||
assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
|
||||
assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
|
||||
|
||||
|
||||
def test_ambiguous():
|
||||
assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
|
||||
|
||||
|
||||
def test_editops():
|
||||
"""Test editops() in cases where dealing with grapheme clusters matters"""
|
||||
|
||||
# In these cases, one of the words has a composed form, the other one does not.
|
||||
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
|
||||
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
|
23
qurator/dinglehopper/tests/test_integ_align.py
Normal file
23
qurator/dinglehopper/tests/test_integ_align.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import align, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_align_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||
# → 4 elements in the alignment should be different.
|
||||
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
|
||||
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
|
||||
result = list(align(gt, ocr))
|
||||
assert sum(left != right for left, right in result) == 4
|
|
@ -0,0 +1,35 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert character_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert character_error_rate(gt, ocr) == 8/591 # Manually verified
|
35
qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
Normal file
35
qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import distance, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert distance(gt, ocr) == 4
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert distance(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert distance(gt, ocr) == 8 # Manually verified
|
43
qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
Normal file
43
qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import word_error_rate, words, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
|
||||
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
|
||||
assert len(list(words(gt))) == gt_word_count
|
||||
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert word_error_rate(gt, ocr) == 3/gt_word_count
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert word_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
|
||||
gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
|
||||
assert len(list(words(gt))) == gt_word_count
|
||||
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
|
99
qurator/dinglehopper/tests/test_ocr_files.py
Normal file
99
qurator/dinglehopper/tests/test_ocr_files.py
Normal file
|
@ -0,0 +1,99 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
import lxml.etree as ET
|
||||
import textwrap
|
||||
|
||||
from .. import alto_namespace, alto_text, page_namespace, page_text, text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
def test_alto_namespace():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
|
||||
assert alto_namespace(tree) == 'http://www.loc.gov/standards/alto/ns-v3#'
|
||||
|
||||
|
||||
def test_alto_text():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
|
||||
result = alto_text(tree)
|
||||
expected = textwrap.dedent("""\
|
||||
über die vielen Sorgen wegen deſſelben vergaß
|
||||
Hartkopf, der Frau Amtmännin das ver-
|
||||
ſprochene zu überliefern.""")
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_alto_text_ALTO1():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.alto1.xml'))
|
||||
assert "being erected at the Broadway stock" in alto_text(tree)
|
||||
|
||||
|
||||
def test_alto_text_ALTO2():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.alto2.xml'))
|
||||
assert "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden" in alto_text(tree)
|
||||
|
||||
|
||||
def test_alto_text_ALTO3():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.alto3.xml'))
|
||||
assert "über die vielen Sorgen wegen deſſelben vergaß" in alto_text(tree)
|
||||
|
||||
|
||||
def test_page_namespace():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
|
||||
assert page_namespace(tree) == 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15'
|
||||
|
||||
|
||||
def test_page_test():
|
||||
tree = ET.parse(os.path.join(data_dir, 'test.page2018.xml'))
|
||||
result = page_text(tree)
|
||||
expected = textwrap.dedent("""\
|
||||
ber die vielen Sorgen wegen deelben vergaß
|
||||
Hartkopf, der Frau Amtmnnin das ver⸗
|
||||
ſproene zu berliefern. — Ein Erpreer
|
||||
wurde an ihn abgeſit, um ihn ums Him⸗
|
||||
melswien zu ſagen, daß er das Verſproene
|
||||
glei den Augenbli berbringen mte, die
|
||||
Frau Amtmnnin htte auf ihn verlaen,
|
||||
und nun wßte e nit, was e anfangen
|
||||
ſote. Den Augenbli ſote er kommen,
|
||||
ſon vergieng e in ihrer Ang. — Die
|
||||
Ge wren ſon angekommen, und es fehlte
|
||||
ihr do no an aem. —
|
||||
Hartkopf mußte er bennen, und
|
||||
endli na langem Nadenken fiel es ihm er
|
||||
wieder ein. — Er langte den Zettel aus dem
|
||||
Accisbue heraus, und ſagte ſeiner Frau, daß
|
||||
e das, was da wre, herbeyſaffen mte.
|
||||
Jndeß mangelten do einige Generalia, die
|
||||
alſo wegfielen. — Hartkopf gieng ſelb
|
||||
mit und berbrate es. —""")
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_page_with_empty_region():
|
||||
# This file contains an empty TextRegion:
|
||||
#
|
||||
# <TextRegion id="region0000">
|
||||
# <Coords points="488,133 1197,133 1197,193 488,193"/>
|
||||
# <TextEquiv>
|
||||
# <Unicode></Unicode>
|
||||
# </TextEquiv>
|
||||
# </TextRegion>
|
||||
tree = ET.parse(os.path.join(data_dir, 'brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml'))
|
||||
result = page_text(tree)
|
||||
assert result
|
||||
|
||||
|
||||
def test_page_order():
|
||||
# This file contains TextRegions where file order is not the same as reading order.
|
||||
tree = ET.parse(os.path.join(data_dir, 'order.page.xml'))
|
||||
result = page_text(tree)
|
||||
|
||||
assert re.search(r'Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.er Lord.*76\. Die', result, re.DOTALL)
|
||||
|
||||
|
||||
def test_text():
|
||||
assert "being erected at the Broadway stock" in text(os.path.join(data_dir, 'test.alto1.xml'))
|
||||
assert "wieder ein. — Er langte den Zettel aus dem" in text(os.path.join(data_dir, 'test.page2018.xml'))
|
||||
assert "Lorem ipsum" in text(os.path.join(data_dir, 'test.txt'))
|
45
qurator/dinglehopper/tests/test_word_error_rate.py
Normal file
45
qurator/dinglehopper/tests/test_word_error_rate.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import math
|
||||
|
||||
from .. import word_error_rate, words, unordered_word_error_rate
|
||||
|
||||
|
||||
def test_words():
|
||||
result = list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))
|
||||
expected = ['Der', 'schnelle', 'braune', 'Fuchs', 'kann', 'keine', '3,14', 'Meter', 'springen', 'oder']
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_words_private_use_area():
|
||||
result = list(words(
|
||||
'ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n'
|
||||
'ſproene zu berliefern.'))
|
||||
expected = [
|
||||
'ber', 'die', 'vielen', 'Sorgen', 'wegen', 'deelben', 'vergaß', 'Hartkopf',
|
||||
'der', 'Frau', 'Amtmnnin', 'das', 'ver',
|
||||
'ſproene', 'zu', 'berliefern']
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_word_error_rate():
|
||||
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
|
||||
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz!') == 0
|
||||
assert word_error_rate('Dies. ist ein Beispielsatz!', 'Dies ist ein Beispielsatz.') == 0
|
||||
|
||||
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ist ein Beispielsarz:') == 1/4
|
||||
assert word_error_rate('Dies ist ein Beispielsatz!', 'Dies ein ist Beispielsatz!') == 2/4
|
||||
|
||||
assert word_error_rate('Dies ist ein Beispielsatz!', '') == 4/4
|
||||
assert math.isinf(word_error_rate('', 'Dies ist ein Beispielsatz!'))
|
||||
assert word_error_rate('', '') == 0
|
||||
|
||||
assert word_error_rate('Schlyñ lorem ipsum dolor sit amet,', 'Schlym̃ lorem ipsum dolor sit amet.') == 1/6
|
||||
|
||||
|
||||
def test_unordered_word_error_rate():
|
||||
assert unordered_word_error_rate('abc def ghi', 'ghi abc def') == 0
|
||||
assert unordered_word_error_rate('abc def ghi', 'ghi abcX def') == 1/3
|
||||
assert unordered_word_error_rate('abc def ghi jkl', 'abc ghi def jkl') == 0
|
||||
assert unordered_word_error_rate('abc def ghi jkl', 'abc ghi defX jkl') == 1/4
|
||||
# XXX There seem to be some cases where this does not work
|
24
qurator/dinglehopper/tests/util.py
Normal file
24
qurator/dinglehopper/tests/util.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from itertools import zip_longest
|
||||
from typing import Iterable
|
||||
|
||||
import colorama
|
||||
|
||||
|
||||
def diffprint(x, y):
|
||||
"""Print elements or lists x and y, with differences in red"""
|
||||
|
||||
def _diffprint(x, y):
|
||||
if x != y:
|
||||
print(colorama.Fore.RED, x, y, colorama.Fore.RESET)
|
||||
else:
|
||||
print(x, y)
|
||||
|
||||
if isinstance(x, Iterable):
|
||||
for xe, ye in zip_longest(x, y):
|
||||
_diffprint(xe, ye)
|
||||
else:
|
||||
_diffprint(x, y)
|
||||
|
||||
|
||||
def unzip(l):
|
||||
return zip(*l)
|
69
qurator/dinglehopper/word_error_rate.py
Normal file
69
qurator/dinglehopper/word_error_rate.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
|
||||
import uniseg.wordbreak
|
||||
|
||||
from .edit_distance import levenshtein
|
||||
|
||||
|
||||
def words(s):
|
||||
# Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
|
||||
# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
|
||||
old_word_break = uniseg.wordbreak.word_break
|
||||
|
||||
def new_word_break(c, index=0):
|
||||
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
|
||||
return 'ALetter'
|
||||
else:
|
||||
return old_word_break(c, index)
|
||||
uniseg.wordbreak.word_break = new_word_break
|
||||
|
||||
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
|
||||
def unwanted(c):
|
||||
|
||||
# See https://www.fileformat.info/info/unicode/category/index.htm
|
||||
# and https://unicodebook.readthedocs.io/unicode.html#categories
|
||||
unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
|
||||
unwanted_subcategories = 'Cc', 'Cf'
|
||||
|
||||
subcat = unicodedata.category(c)
|
||||
cat = subcat[0]
|
||||
return cat in unwanted_categories or subcat in unwanted_subcategories
|
||||
|
||||
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
|
||||
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
|
||||
for word in uniseg.wordbreak.words(s):
|
||||
if all(unwanted(c) for c in word):
|
||||
pass
|
||||
else:
|
||||
yield word
|
||||
|
||||
|
||||
def words_normalized(s):
|
||||
return words(unicodedata.normalize('NFC', s))
|
||||
|
||||
|
||||
def word_error_rate(reference, compared):
|
||||
if isinstance(reference, str):
|
||||
reference_seq = list(words_normalized(reference))
|
||||
compared_seq = list(words_normalized(compared))
|
||||
else:
|
||||
reference_seq = list(reference)
|
||||
compared_seq = list(compared)
|
||||
|
||||
d = levenshtein(reference_seq, compared_seq)
|
||||
if d == 0:
|
||||
return 0
|
||||
|
||||
n = len(reference_seq)
|
||||
if n == 0:
|
||||
return float('inf')
|
||||
|
||||
return d / n
|
||||
|
||||
|
||||
def unordered_word_error_rate(reference, compared):
|
||||
reference_seq = sorted(words_normalized(reference))
|
||||
compared_seq = sorted(words_normalized(compared))
|
||||
return word_error_rate(reference_seq, compared_seq)
|
Loading…
Add table
Add a link
Reference in a new issue