➡ Move dinglehopper into its own directory
commit
89048bf55d
@ -0,0 +1,4 @@
|
|||||||
|
dinglehopper
|
||||||
|
============
|
||||||
|
|
||||||
|
dinglehopper is an OCR evaluation tool and reads ALTO, PAGE and text files.
|
@ -0,0 +1,2 @@
|
|||||||
|
__import__('pkg_resources').declare_namespace(__name__)
|
||||||
|
|
@ -0,0 +1,6 @@
|
|||||||
|
# User-specific stuff
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
@ -0,0 +1,12 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.7 (dinglehopper)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="TestRunnerService">
|
||||||
|
<option name="projectConfiguration" value="pytest" />
|
||||||
|
<option name="PROJECT_TEST_RUNNER" value="pytest" />
|
||||||
|
</component>
|
||||||
|
</module>
|
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
|
||||||
|
<component name="PyCharmProfessionalAdvertiser">
|
||||||
|
<option name="shown" value="true" />
|
||||||
|
</component>
|
||||||
|
</project>
|
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/dinglehopper.iml" filepath="$PROJECT_DIR$/.idea/dinglehopper.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
@ -0,0 +1,5 @@
|
|||||||
|
from .ocr_files import *
|
||||||
|
from .substitute_equivalences import *
|
||||||
|
from .character_error_rate import *
|
||||||
|
from .word_error_rate import *
|
||||||
|
from .align import *
|
@ -0,0 +1,34 @@
|
|||||||
|
from .edit_distance import *
|
||||||
|
|
||||||
|
def align(s1, s2):
    """Align two sequences and yield one pair of elements per alignment step.

    The edit operations returned by ``seq_editops(s1, s2)`` are walked in
    order. Each yielded tuple is ``(element_of_s1, element_of_s2)``; for an
    insertion the first item is ``None``, for a deletion the second item is
    ``None``. Positions without an edit operation yield the two elements
    found at the current positions of both sequences.

    :param s1: first iterable (materialised into a list)
    :param s2: second iterable (materialised into a list)
    :raises ValueError: if an edit operation has an unknown name
    """
    s1 = list(s1)
    s2 = list(s2)
    ops = seq_editops(s1, s2)

    # Cursor into ops instead of re-slicing the list (ops = ops[1:]) after
    # every consumed operation, which copied the remaining list each time.
    next_op = 0
    i = 0
    j = 0

    while i < len(s1) or j < len(s2):
        kind = None
        if next_op < len(ops):
            candidate = ops[next_op]
            # An operation only applies when it refers to the current positions
            if candidate[1] == i and candidate[2] == j:
                kind = candidate[0]
                next_op += 1

        if kind is None:
            # No edit at this position: both elements are paired as they are
            yield (s1[i], s2[j])
            i += 1
            j += 1
        elif kind == 'insert':
            yield (None, s2[j])
            j += 1
        elif kind == 'delete':
            yield (s1[i], None)
            i += 1
        elif kind == 'replace':
            yield (s1[i], s2[j])
            i += 1
            j += 1
        else:
            # Previously an unknown name advanced neither index and looped forever
            raise ValueError('Unknown edit operation "%s"' % (kind,))
|
@ -0,0 +1,21 @@
|
|||||||
|
from __future__ import division
|
||||||
|
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
|
from qurator.dinglehopper.edit_distance import distance
|
||||||
|
|
||||||
|
|
||||||
|
def character_error_rate(reference, compared):
    """Return the edit distance of the two strings divided by the reference length.

    The reference length is the number of grapheme clusters of the
    NFC-normalized reference. Returns 0 when the distance is 0 (also for an
    empty reference) and ``float('inf')`` when the reference is empty but the
    distance is not 0.
    """
    edits = distance(reference, compared)
    if edits == 0:
        return 0

    normalized_reference = unicodedata.normalize('NFC', reference)
    reference_length = sum(1 for _ in grapheme_clusters(normalized_reference))
    if reference_length == 0:
        # Nothing to divide by, but there are errors
        return float('inf')

    return edits / reference_length
|
||||||
|
|
||||||
|
# XXX Should we really count newlines here?
|
@ -0,0 +1,83 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
|
||||||
|
from qurator.dinglehopper import *
|
||||||
|
|
||||||
|
|
||||||
|
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
    """Render the alignment of two sequences as an HTML row with two columns.

    The sequences are aligned with ``align()``. Every aligned element is
    written to the GT column and the OCR column respectively; elements that
    differ are wrapped in a ``<span>`` carrying the classes
    ``{css_prefix}diff{k}`` (``k`` being the alignment position) and ``diff``.

    :param gt_things: ground truth sequence (e.g. a string or a list of words)
    :param ocr_things: OCR sequence of the same kind
    :param css_prefix: prefix for the per-position CSS class
    :param joiner: string put in front of every rendered element
    :param none: placeholder rendered where one side has no element
    :return: HTML fragment as a string
    """
    # Function-scope import so the block does not depend on the file's import list
    from html import escape

    def format_thing(t, css_classes=None):
        if t is None:
            rendered = none
            # css_classes may be None here; '+=' on None used to raise TypeError
            css_classes = css_classes + ' ellipsis' if css_classes else 'ellipsis'
        else:
            # The text comes from the compared files and must not be
            # interpreted as markup in the report
            rendered = escape(str(t))

        if rendered == '\n':
            rendered = '<br>'

        if css_classes:
            return '<span class="{css_classes}">{t}</span>'.format(css_classes=css_classes, t=rendered)
        else:
            return '{t}'.format(t=rendered)

    gt_parts = []
    ocr_parts = []
    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
        if g == o:
            css_classes = None
        else:
            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)

        gt_parts.append(joiner + format_thing(g, css_classes))
        ocr_parts.append(joiner + format_thing(o, css_classes))

    # Join once instead of growing two strings inside the loop
    gtx = ''.join(gt_parts)
    ocrx = ''.join(ocr_parts)

    return \
        '''
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        '''.format(gtx, ocrx)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True))
def process(gt, ocr):
    """Check OCR result against GT"""

    # Read both files and apply the same equivalence substitutions to each
    gt_text = substitute_equivalences(text(gt))
    ocr_text = substitute_equivalences(text(ocr))

    # Metrics
    cer = character_error_rate(gt_text, ocr_text)
    wer = word_error_rate(gt_text, ocr_text)
    uwer = unordered_word_error_rate(gt_text, ocr_text)

    # Diff views on character level and on word level
    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
    word_diff_report = gen_diff_report(words(gt_text), words(ocr_text), css_prefix='w', joiner=' ', none='⋯')

    # Templates live in the "templates" directory next to this module
    module_dir = os.path.dirname(os.path.realpath(__file__))
    env = Environment(loader=FileSystemLoader(os.path.join(module_dir, 'templates')))

    context = dict(
        gt=gt, ocr=ocr,
        cer=cer, wer=wer, uwer=uwer,
        char_diff_report=char_diff_report,
        word_diff_report=word_diff_report
    )
    # Each report is written to the current working directory
    for out_fn in ('report.html', 'report.json'):
        env.get_template(out_fn + '.j2').stream(**context).dump(out_fn)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: run the ``process`` click command.

    ``process`` is called without arguments here; it is decorated with
    ``click.command()`` and declares the ``gt`` and ``ocr`` arguments.
    """
    process()


# Allow running this module directly as a script
if __name__ == '__main__':
    main()
|
@ -0,0 +1,95 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import unicodedata
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
|
|
||||||
|
def levenshtein_matrix(seq1, seq2):
    """Compute the matrix commonly computed to produce the Levenshtein distance.

    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
    edit distance.

    This algorithm is implemented here because we need an implementation that can work with sequences other than
    strings, e.g. lists of grapheme clusters or lists of word strings.

    :param seq1: first sequence (needs ``len()`` and indexing)
    :param seq2: second sequence (needs ``len()`` and indexing)
    :return: integer array of shape ``(len(seq1) + 1, len(seq2) + 1)``
    """
    m = len(seq1)
    n = len(seq2)

    # np.int was only an alias for the builtin int and has been removed from
    # NumPy; the builtin selects the same dtype on every NumPy version.
    D = np.zeros((m + 1, n + 1), int)

    # First column/row: cost of deleting i elements / inserting j elements
    D[:, 0] = np.arange(m + 1)
    D[0, :] = np.arange(n + 1)

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            D[i, j] = min(
                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                D[i, j - 1] + 1,  # Insertion
                D[i - 1, j] + 1   # Deletion
            )

    return D
|
||||||
|
|
||||||
|
|
||||||
|
def levenshtein(seq1, seq2):
    """Compute the Levenshtein edit distance between two sequences"""
    matrix = levenshtein_matrix(seq1, seq2)
    # The matrix has len(seq1) + 1 rows and len(seq2) + 1 columns, so the
    # last cell is the one holding the distance of the complete sequences.
    return matrix[-1, -1]
|
||||||
|
|
||||||
|
|
||||||
|
def distance(s1, s2):
    """Compute the Levenshtein edit distance between two Unicode strings

    Unlike levenshtein(), both strings are first NFC-normalized and split into grapheme clusters; the distance is
    then computed on these clusters. This should be the correct way to compare two Unicode strings.
    """
    clusters = [
        list(grapheme_clusters(unicodedata.normalize('NFC', s)))
        for s in (s1, s2)
    ]
    return levenshtein(*clusters)
|
||||||
|
|
||||||
|
|
||||||
|
def seq_editops(seq1, seq2):
    """Return the edit operations that transform seq1 into seq2.

    The Levenshtein matrix is traced back from its bottom-right cell to the
    top-left one. Where several predecessors are possible, a deletion is
    preferred over an insertion, and an insertion over a replacement.

    :param seq1: source iterable (materialised into a list)
    :param seq2: target iterable (materialised into a list)
    :return: list of ``(name, i, j)`` tuples in ascending position order;
        ``name`` is ``'delete'``, ``'insert'`` or ``'replace'``, ``i`` is an
        index into seq1 and ``j`` an index into seq2. Matching positions
        produce no entry.
    """
    seq1 = list(seq1)
    seq2 = list(seq2)
    D = levenshtein_matrix(seq1, seq2)

    # Operations are collected back to front and reversed once at the end,
    # instead of building a new list for every step ([op] + accumulator).
    ops = []
    i = len(seq1)
    j = len(seq2)
    while True:
        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
            ops.append(('delete', i - 1, j))
            i -= 1
        elif j > 0 and D[i, j - 1] + 1 == D[i, j]:
            ops.append(('insert', i, j - 1))
            j -= 1
        elif i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
            ops.append(('replace', i - 1, j - 1))
            i -= 1
            j -= 1
        elif i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
            # NOP: step diagonally without recording anything
            i -= 1
            j -= 1
        else:
            break

    ops.reverse()
    return ops
|
||||||
|
|
||||||
|
|
||||||
|
def editops(word1, word2):
    """Return the edit operations between two strings via seq_editops()."""
    # XXX Note that this returns indices to the _grapheme clusters_, not characters!

    def clusters(word):
        return list(grapheme_clusters(unicodedata.normalize('NFC', word)))

    return seq_editops(clusters(word1), clusters(word2))
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,105 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
from lxml import etree as ET
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from lxml.etree import XMLSyntaxError
|
||||||
|
|
||||||
|
|
||||||
|
def alto_namespace(tree):
    """Return the ALTO namespace used in the given ElementTree.

    Only the local name of the root element is checked: it has to be "alto". Whether the namespace itself is a valid
    ALTO namespace is not checked.

    Raises ValueError if the root element is not named "alto".
    """
    root_tag = ET.QName(tree.getroot().tag)
    if root_tag.localname != 'alto':
        raise ValueError('Not an ALTO tree')
    return root_tag.namespace
|
||||||
|
|
||||||
|
|
||||||
|
def alto_text(tree):
    """Extract text from the given ALTO ElementTree.

    The CONTENT attributes of the String elements of a TextLine are joined with spaces, the resulting lines are
    joined with newlines.
    """
    nsmap = {'alto': alto_namespace(tree)}

    def line_text(line):
        strings = line.iterfind('alto:String', namespaces=nsmap)
        return ' '.join(string.attrib.get('CONTENT') for string in strings)

    text_lines = tree.iterfind('.//alto:TextLine', namespaces=nsmap)
    return '\n'.join(line_text(line) for line in text_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def page_namespace(tree):
    """Return the PAGE content namespace used in the given ElementTree.

    Only the local name of the root element is checked: it has to be "PcGts". Whether the namespace itself is a valid
    PAGE namespace is not checked.

    Raises ValueError if the root element is not named "PcGts".
    """
    root_tag = ET.QName(tree.getroot().tag)
    if root_tag.localname != 'PcGts':
        raise ValueError('Not a PAGE tree')
    return root_tag.namespace
|
||||||
|
|
||||||
|
|
||||||
|
def page_text(tree):
    """Extract text from the given PAGE content ElementTree.

    If the file has a ReadingOrder element, the text regions referenced by its OrderedGroup children are used in
    the order given by the numeric "index" attribute of the RegionRefIndexed elements. Otherwise all TextRegion
    elements are used in document order. Regions without text are skipped, the remaining region texts are joined
    with newlines.

    Raises ValueError if the tree is not a PAGE tree, if a referenced region id does not exist or if an index is
    not a number. Raises NotImplementedError for reading order groups other than OrderedGroup.
    """

    nsmap = {'page': page_namespace(tree)}

    def region_text(region):
        try:
            return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
        except AttributeError:
            # find() returned None: the region has no TextEquiv/Unicode
            return None

    region_texts = []
    reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
    if reading_order is not None:
        for group in reading_order.iterfind('./*', namespaces=nsmap):
            if ET.QName(group.tag).localname == 'OrderedGroup':
                region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
                # The attribute value is a string; compare as numbers so that
                # index "10" comes after index "2".
                for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
                    region_id = region_ref_indexed.attrib['regionRef']
                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
                    if region is not None:
                        region_texts.append(region_text(region))
                    else:
                        raise ValueError('Invalid region id "%s" in file' % region_id)
            else:
                raise NotImplementedError
    else:
        for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
            region_texts.append(region_text(region))

    # XXX Does a file have to have regions etc.? region vs lines etc.
    # Filter empty region texts
    region_texts = (t for t in region_texts if t)

    text_ = '\n'.join(region_texts)

    return text_
|
||||||
|
|
||||||
|
|
||||||
|
def text(filename):
    """Read the text from the given file.

    Supports PAGE, ALTO and falls back to plain text.

    A file that cannot be parsed as XML is returned as plain text. An XML file whose root element is "PcGts" is
    read as PAGE, any other XML file is handed to the ALTO reader (which raises ValueError if it is not ALTO).
    """

    try:
        tree = ET.parse(filename)
    except XMLSyntaxError:
        with open(filename, 'r') as f:
            return f.read()

    # Decide on the format first. Wrapping page_text() itself in the
    # try/except would also catch errors raised while reading a genuine PAGE
    # file and replace them with a misleading "Not an ALTO tree".
    try:
        page_namespace(tree)
    except ValueError:
        return alto_text(tree)
    return page_text(tree)
|
||||||
|
|
||||||
|
|
||||||
|
# When run as a script, print the text of the file named by the first argument
if __name__ == '__main__':
    print(text(sys.argv[1]))
|
@ -0,0 +1,4 @@
|
|||||||
|
[pytest]
|
||||||
|
markers =
|
||||||
|
integration: integration tests
|
||||||
|
serial
|
@ -0,0 +1,61 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||||
|
<style type="text/css">
|
||||||
|
.gt .diff {
|
||||||
|
color: green;
|
||||||
|
}
|
||||||
|
.ocr .diff {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
.ellipsis {
|
||||||
|
opacity: 0.5;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
.diff-highlight {
|
||||||
|
border: 2px solid;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
|
||||||
|
{{ gt }}<br>
|
||||||
|
{{ ocr }}
|
||||||
|
|
||||||
|
|
||||||
|
<h2>Metrics</h2>
|
||||||
|
<p>CER: {{ cer|round(4) }}</p>
|
||||||
|
<p>WER: {{ wer|round(4) }}</p>
|
||||||
|
<!-- FIXME <p>WER (unordered): {{ uwer|round(4) }}</p> -->
|
||||||
|
|
||||||
|
<h2>Character differences</h2>
|
||||||
|
{{ char_diff_report }}
|
||||||
|
|
||||||
|
<h2>Word differences</h2>
|
||||||
|
{{ word_diff_report }}
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||||
|
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
{% include 'report.html.js' %}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,14 @@
|
|||||||
|
// Return the first class name of a whitespace-separated class list that
// matches the per-position diff class pattern, or undefined if none does.
function find_diff_class(classes) {
    const pattern = /.diff\d.*/;
    for (const name of classes.split(/\s+/)) {
        if (name.match(pattern)) {
            return name;
        }
    }
    return undefined;
}
|
||||||
|
|
||||||
|
$(document).ready(function() {
    // Switch the highlight class on or off for every element that shares
    // the per-position diff class of the given element.
    function set_highlight(element, enabled) {
        const c = find_diff_class($(element).attr('class'));
        $('.' + c).toggleClass('diff-highlight', enabled);
    }

    const diffs = $('.diff');
    diffs.on('mouseover', function() {
        set_highlight(this, true);
    });
    diffs.on('mouseout', function() {
        set_highlight(this, false);
    });
});
|
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"gt": "{{ gt }}",
|
||||||
|
"ocr": "{{ ocr }}",
|
||||||
|
"cer": {{ cer|round(6) }},
|
||||||
|
"wer": {{ wer|round(6) }}
|
||||||
|
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,47 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||||
|
<Metadata>
|
||||||
|
<Creator></Creator>
|
||||||
|
<Created>2019-07-26T13:59:00</Created>
|
||||||
|
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||||
|
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||||
|
<TextRegion id="tempReg357564684568544579089">
|
||||||
|
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||||
|
<TextLine id="l0">
|
||||||
|
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l1">
|
||||||
|
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l2">
|
||||||
|
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l3">
|
||||||
|
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l4">
|
||||||
|
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l5">
|
||||||
|
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l6">
|
||||||
|
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||||
|
<TextRegion id="r7" type="paragraph">
|
||||||
|
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||||
|
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||||
|
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||||
|
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||||
|
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||||
|
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||||
|
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
@ -0,0 +1,139 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||||
|
<Description>
|
||||||
|
<MeasurementUnit>pixel</MeasurementUnit>
|
||||||
|
<sourceImageInformation>
|
||||||
|
<fileName> </fileName>
|
||||||
|
</sourceImageInformation>
|
||||||
|
<OCRProcessing ID="OCR_0">
|
||||||
|
<ocrProcessingStep>
|
||||||
|
<processingSoftware>
|
||||||
|
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||||
|
</processingSoftware>
|
||||||
|
</ocrProcessingStep>
|
||||||
|
</OCRProcessing>
|
||||||
|
</Description>
|
||||||
|
<Layout>
|
||||||
|
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||||
|
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||||
|
<TextBlock ID="block_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="437">
|
||||||
|
<TextLine ID="line_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="103">
|
||||||
|
<String ID="string_0" HPOS="209" VPOS="319" WIDTH="134" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="319" HPOS="343"/>
|
||||||
|
<String ID="string_1" HPOS="356" VPOS="316" WIDTH="121" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="316" HPOS="477"/>
|
||||||
|
<String ID="string_2" HPOS="491" VPOS="312" WIDTH="102" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="312" HPOS="593"/>
|
||||||
|
<String ID="string_3" HPOS="608" VPOS="309" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="309" HPOS="654"/>
|
||||||
|
<String ID="string_4" HPOS="668" VPOS="311" WIDTH="106" HEIGHT="37" WC="0.96" CONTENT="amet,"/><SP WIDTH="16" VPOS="311" HPOS="774"/>
|
||||||
|
<String ID="string_5" HPOS="790" VPOS="307" WIDTH="201" HEIGHT="32" WC="0.88" CONTENT="consetetur"/><SP WIDTH="14" VPOS="307" HPOS="991"/>
|
||||||
|
<String ID="string_6" HPOS="1005" VPOS="297" WIDTH="205" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="297" HPOS="1210"/>
|
||||||
|
<String ID="string_7" HPOS="1225" VPOS="293" WIDTH="84" HEIGHT="42" WC="0.91" CONTENT="elitr,"/><SP WIDTH="16" VPOS="293" HPOS="1309"/>
|
||||||
|
<String ID="string_8" HPOS="1325" VPOS="289" WIDTH="65" HEIGHT="38" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="289" HPOS="1390"/>
|
||||||
|
<String ID="string_9" HPOS="1404" VPOS="286" WIDTH="97" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="286" HPOS="1501"/>
|
||||||
|
<String ID="string_10" HPOS="1515" VPOS="291" WIDTH="100" HEIGHT="24" WC="0.69" CONTENT="nonu"/><SP WIDTH="32" VPOS="291" HPOS="1615"/>
|
||||||
|
<String ID="string_11" HPOS="1647" VPOS="285" WIDTH="30" HEIGHT="36" WC="0.37" CONTENT="yy"/><SP WIDTH="17" VPOS="285" HPOS="1677"/>
|
||||||
|
<String ID="string_12" HPOS="1694" VPOS="268" WIDTH="140" HEIGHT="42" WC="0.93" CONTENT="eirmod"/><SP WIDTH="11" VPOS="268" HPOS="1834"/>
|
||||||
|
<String ID="string_13" HPOS="1845" VPOS="273" WIDTH="139" HEIGHT="37" WC="0.96" CONTENT="tempor"/><SP WIDTH="15" VPOS="273" HPOS="1984"/>
|
||||||
|
<String ID="string_14" HPOS="1999" VPOS="258" WIDTH="164" HEIGHT="38" WC="0.95" CONTENT="invidunt"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_1" HPOS="211" VPOS="315" WIDTH="1904" HEIGHT="102">
|
||||||
|
<String ID="string_15" HPOS="211" VPOS="380" WIDTH="39" HEIGHT="31" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="380" HPOS="250"/>
|
||||||
|
<String ID="string_16" HPOS="263" VPOS="373" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="373" HPOS="386"/>
|
||||||
|
<String ID="string_17" HPOS="402" VPOS="379" WIDTH="33" HEIGHT="27" WC="0.95" CONTENT="et"/><SP WIDTH="14" VPOS="379" HPOS="435"/>
|
||||||
|
<String ID="string_18" HPOS="449" VPOS="370" WIDTH="123" HEIGHT="36" WC="0.95" CONTENT="dolore"/><SP WIDTH="15" VPOS="370" HPOS="572"/>
|
||||||
|
<String ID="string_19" HPOS="587" VPOS="374" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="374" HPOS="720"/>
|
||||||
|
<String ID="string_20" HPOS="734" VPOS="363" WIDTH="183" HEIGHT="43" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="363" HPOS="917"/>
|
||||||
|
<String ID="string_21" HPOS="931" VPOS="360" WIDTH="82" HEIGHT="36" WC="0.95" CONTENT="erat,"/><SP WIDTH="17" VPOS="360" HPOS="1013"/>
|
||||||
|
<String ID="string_22" HPOS="1030" VPOS="354" WIDTH="65" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="13" VPOS="354" HPOS="1095"/>
|
||||||
|
<String ID="string_23" HPOS="1108" VPOS="352" WIDTH="96" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="352" HPOS="1204"/>
|
||||||
|
<String ID="string_24" HPOS="1217" VPOS="350" WIDTH="181" HEIGHT="44" WC="0.95" CONTENT="voluptua."/><SP WIDTH="13" VPOS="350" HPOS="1398"/>
|
||||||
|
<String ID="string_25" HPOS="1411" VPOS="345" WIDTH="49" HEIGHT="34" WC="0.95" CONTENT="At"/><SP WIDTH="11" VPOS="345" HPOS="1460"/>
|
||||||
|
<String ID="string_26" HPOS="1471" VPOS="348" WIDTH="88" HEIGHT="26" WC="0.93" CONTENT="Vero"/><SP WIDTH="16" VPOS="348" HPOS="1559"/>
|
||||||
|
<String ID="string_27" HPOS="1575" VPOS="345" WIDTH="65" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="345" HPOS="1640"/>
|
||||||
|
<String ID="string_28" HPOS="1655" VPOS="339" WIDTH="36" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="339" HPOS="1691"/>
|
||||||
|
<String ID="string_29" HPOS="1705" VPOS="336" WIDTH="168" HEIGHT="31" WC="0.87" CONTENT="accusam"/><SP WIDTH="15" VPOS="336" HPOS="1873"/>
|
||||||
|
<String ID="string_30" HPOS="1888" VPOS="329" WIDTH="34" HEIGHT="28" WC="0.96" CONTENT="et"/><SP WIDTH="11" VPOS="329" HPOS="1922"/>
|
||||||
|
<String ID="string_31" HPOS="1933" VPOS="322" WIDTH="96" HEIGHT="44" WC="0.96" CONTENT="justo"/><SP WIDTH="15" VPOS="322" HPOS="2029"/>
|
||||||
|
<String ID="string_32" HPOS="2044" VPOS="315" WIDTH="71" HEIGHT="63" WC="0.96" CONTENT="duo"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_2" HPOS="214" VPOS="375" WIDTH="1919" HEIGHT="93">
|
||||||
|
<String ID="string_33" HPOS="214" VPOS="431" WIDTH="144" HEIGHT="37" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="431" HPOS="358"/>
|
||||||
|
<String ID="string_34" HPOS="374" VPOS="433" WIDTH="34" HEIGHT="31" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="433" HPOS="408"/>
|
||||||
|
<String ID="string_35" HPOS="422" VPOS="437" WIDTH="42" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="437" HPOS="464"/>
|
||||||
|
<String ID="string_36" HPOS="477" VPOS="426" WIDTH="136" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="426" HPOS="613"/>
|
||||||
|
<String ID="string_37" HPOS="631" VPOS="424" WIDTH="75" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="424" HPOS="706"/>
|
||||||
|
<String ID="string_38" HPOS="720" VPOS="419" WIDTH="85" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="419" HPOS="805"/>
|
||||||
|
<String ID="string_39" HPOS="818" VPOS="415" WIDTH="90" HEIGHT="35" WC="0.97" CONTENT="kasd"/><SP WIDTH="14" VPOS="415" HPOS="908"/>
|
||||||
|
<String ID="string_40" HPOS="922" VPOS="412" WIDTH="206" HEIGHT="48" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="412" HPOS="1128"/>
|
||||||
|
<String ID="string_41" HPOS="1144" VPOS="417" WIDTH="47" HEIGHT="26" WC="0.97" CONTENT="no"/><SP WIDTH="16" VPOS="417" HPOS="1191"/>
|
||||||
|
<String ID="string_42" HPOS="1207" VPOS="415" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="415" HPOS="1268"/>
|
||||||
|
<String ID="string_43" HPOS="1281" VPOS="405" WIDTH="169" HEIGHT="36" WC="0.91" CONTENT="iakimata"/><SP WIDTH="14" VPOS="405" HPOS="1450"/>
|
||||||
|
<String ID="string_44" HPOS="1464" VPOS="400" WIDTH="144" HEIGHT="33" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="400" HPOS="1608"/>
|
||||||
|
<String ID="string_45" HPOS="1624" VPOS="397" WIDTH="54" HEIGHT="29" WC="0.97" CONTENT="est"/><SP WIDTH="13" VPOS="397" HPOS="1678"/>
|
||||||
|
<String ID="string_46" HPOS="1691" VPOS="390" WIDTH="132" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="390" HPOS="1823"/>
|
||||||
|
<String ID="string_47" HPOS="1837" VPOS="383" WIDTH="120" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="383" HPOS="1957"/>
|
||||||
|
<String ID="string_48" HPOS="1971" VPOS="375" WIDTH="102" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="375" HPOS="2073"/>
|
||||||
|
<String ID="string_49" HPOS="2088" VPOS="377" WIDTH="45" HEIGHT="31" WC="0.96" CONTENT="sit"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_3" HPOS="215" VPOS="435" WIDTH="1896" HEIGHT="93">
|
||||||
|
<String ID="string_50" HPOS="215" VPOS="494" WIDTH="106" HEIGHT="32" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="494" HPOS="321"/>
|
||||||
|
<String ID="string_51" HPOS="337" VPOS="488" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="488" HPOS="467"/>
|
||||||
|
<String ID="string_52" HPOS="481" VPOS="484" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="484" HPOS="602"/>
|
||||||
|
<String ID="string_53" HPOS="616" VPOS="479" WIDTH="104" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="479" HPOS="720"/>
|
||||||
|
<String ID="string_54" HPOS="734" VPOS="476" WIDTH="46" HEIGHT="36" WC="0.93" CONTENT="sit"/><SP WIDTH="14" VPOS="476" HPOS="780"/>
|
||||||
|
<String ID="string_55" HPOS="794" VPOS="477" WIDTH="104" HEIGHT="36" WC="0.75" CONTENT="armet,"/><SP WIDTH="17" VPOS="477" HPOS="898"/>
|
||||||
|
<String ID="string_56" HPOS="915" VPOS="474" WIDTH="200" HEIGHT="30" WC="0.97" CONTENT="consetetur"/><SP WIDTH="14" VPOS="474" HPOS="1115"/>
|
||||||
|
<String ID="string_57" HPOS="1129" VPOS="463" WIDTH="205" HEIGHT="45" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="463" HPOS="1334"/>
|
||||||
|
<String ID="string_58" HPOS="1349" VPOS="457" WIDTH="86" HEIGHT="41" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="457" HPOS="1435"/>
|
||||||
|
<String ID="string_59" HPOS="1451" VPOS="452" WIDTH="65" HEIGHT="39" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="452" HPOS="1516"/>
|
||||||
|
<String ID="string_60" HPOS="1530" VPOS="449" WIDTH="99" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="449" HPOS="1629"/>
|
||||||
|
<String ID="string_61" HPOS="1643" VPOS="451" WIDTH="162" HEIGHT="36" WC="0.59" CONTENT="nonurny"/><SP WIDTH="16" VPOS="451" HPOS="1805"/>
|
||||||
|
<String ID="string_62" HPOS="1821" VPOS="435" WIDTH="138" HEIGHT="39" WC="0.96" CONTENT="eirmod"/><SP WIDTH="12" VPOS="435" HPOS="1959"/>
|
||||||
|
<String ID="string_63" HPOS="1971" VPOS="440" WIDTH="140" HEIGHT="37" WC="0.96" CONTENT="tempor"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_4" HPOS="216" VPOS="483" WIDTH="1888" HEIGHT="97">
|
||||||
|
<String ID="string_64" HPOS="216" VPOS="543" WIDTH="165" HEIGHT="37" WC="0.97" CONTENT="invidunt"/><SP WIDTH="13" VPOS="543" HPOS="381"/>
|
||||||
|
<String ID="string_65" HPOS="394" VPOS="546" WIDTH="39" HEIGHT="30" WC="0.97" CONTENT="ut"/><SP WIDTH="12" VPOS="546" HPOS="433"/>
|
||||||
|
<String ID="string_66" HPOS="445" VPOS="539" WIDTH="122" HEIGHT="36" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="539" HPOS="567"/>
|
||||||
|
<String ID="string_67" HPOS="583" VPOS="543" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="543" HPOS="618"/>
|
||||||
|
<String ID="string_68" HPOS="632" VPOS="536" WIDTH="125" HEIGHT="34" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="536" HPOS="757"/>
|
||||||
|
<String ID="string_69" HPOS="771" VPOS="539" WIDTH="131" HEIGHT="37" WC="0.46" CONTENT="magna"/><SP WIDTH="14" VPOS="539" HPOS="902"/>
|
||||||
|
<String ID="string_70" HPOS="916" VPOS="526" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="526" HPOS="1098"/>
|
||||||
|
<String ID="string_71" HPOS="1112" VPOS="527" WIDTH="82" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="527" HPOS="1194"/>
|
||||||
|
<String ID="string_72" HPOS="1211" VPOS="519" WIDTH="63" HEIGHT="36" WC="0.97" CONTENT="sed"/><SP WIDTH="14" VPOS="519" HPOS="1274"/>
|
||||||
|
<String ID="string_73" HPOS="1288" VPOS="517" WIDTH="97" HEIGHT="37" WC="0.96" CONTENT="diam"/><SP WIDTH="11" VPOS="517" HPOS="1385"/>
|
||||||
|
<String ID="string_74" HPOS="1396" VPOS="513" WIDTH="185" HEIGHT="44" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="513" HPOS="1581"/>
|
||||||
|
<String ID="string_75" HPOS="1595" VPOS="505" WIDTH="50" HEIGHT="35" WC="0.96" CONTENT="At"/><SP WIDTH="11" VPOS="505" HPOS="1645"/>
|
||||||
|
<String ID="string_76" HPOS="1656" VPOS="511" WIDTH="89" HEIGHT="27" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="511" HPOS="1745"/>
|
||||||
|
<String ID="string_77" HPOS="1761" VPOS="508" WIDTH="63" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="508" HPOS="1824"/>
|
||||||
|
<String ID="string_78" HPOS="1839" VPOS="501" WIDTH="35" HEIGHT="30" WC="0.97" CONTENT="et"/><SP WIDTH="13" VPOS="501" HPOS="1874"/>
|
||||||
|
<String ID="string_79" HPOS="1887" VPOS="499" WIDTH="168" HEIGHT="53" WC="0.80" CONTENT="accusam"/><SP WIDTH="-3" VPOS="499" HPOS="2055"/>
|
||||||
|
<String ID="string_80" HPOS="2052" VPOS="483" WIDTH="52" HEIGHT="55" WC="0.97" CONTENT="et"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_5" HPOS="215" VPOS="552" WIDTH="1941" HEIGHT="97">
|
||||||
|
<String ID="string_81" HPOS="215" VPOS="604" WIDTH="97" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="604" HPOS="312"/>
|
||||||
|
<String ID="string_82" HPOS="328" VPOS="600" WIDTH="71" HEIGHT="35" WC="0.97" CONTENT="duo"/><SP WIDTH="16" VPOS="600" HPOS="399"/>
|
||||||
|
<String ID="string_83" HPOS="415" VPOS="597" WIDTH="143" HEIGHT="36" WC="0.93" CONTENT="dolores"/><SP WIDTH="16" VPOS="597" HPOS="558"/>
|
||||||
|
<String ID="string_84" HPOS="574" VPOS="600" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="600" HPOS="608"/>
|
||||||
|
<String ID="string_85" HPOS="622" VPOS="602" WIDTH="43" HEIGHT="26" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="602" HPOS="665"/>
|
||||||
|
<String ID="string_86" HPOS="678" VPOS="590" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="590" HPOS="814"/>
|
||||||
|
<String ID="string_87" HPOS="833" VPOS="588" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="588" HPOS="907"/>
|
||||||
|
<String ID="string_88" HPOS="921" VPOS="584" WIDTH="83" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="12" VPOS="584" HPOS="1004"/>
|
||||||
|
<String ID="string_89" HPOS="1016" VPOS="580" WIDTH="90" HEIGHT="36" WC="0.97" CONTENT="kasd"/><SP WIDTH="15" VPOS="580" HPOS="1106"/>
|
||||||
|
<String ID="string_90" HPOS="1121" VPOS="578" WIDTH="205" HEIGHT="47" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="578" HPOS="1326"/>
|
||||||
|
<String ID="string_91" HPOS="1342" VPOS="582" WIDTH="47" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="582" HPOS="1389"/>
|
||||||
|
<String ID="string_92" HPOS="1405" VPOS="581" WIDTH="62" HEIGHT="26" WC="0.97" CONTENT="sea"/><SP WIDTH="13" VPOS="581" HPOS="1467"/>
|
||||||
|
<String ID="string_93" HPOS="1480" VPOS="566" WIDTH="172" HEIGHT="38" WC="0.96" CONTENT="takimata"/><SP WIDTH="14" VPOS="566" HPOS="1652"/>
|
||||||
|
<String ID="string_94" HPOS="1666" VPOS="563" WIDTH="145" HEIGHT="33" WC="0.97" CONTENT="sanctus"/><SP WIDTH="15" VPOS="563" HPOS="1811"/>
|
||||||
|
<String ID="string_95" HPOS="1826" VPOS="558" WIDTH="54" HEIGHT="30" WC="0.97" CONTENT="est"/><SP WIDTH="12" VPOS="558" HPOS="1880"/>
|
||||||
|
<String ID="string_96" HPOS="1892" VPOS="552" WIDTH="130" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="552" HPOS="2022"/>
|
||||||
|
<String ID="string_97" HPOS="2037" VPOS="553" WIDTH="119" HEIGHT="37" WC="0.51" CONTENT="Ipsum"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_6" HPOS="219" VPOS="657" WIDTH="282" HEIGHT="38">
|
||||||
|
<String ID="string_98" HPOS="219" VPOS="658" WIDTH="104" HEIGHT="37" WC="0.97" CONTENT="dolor"/><SP WIDTH="15" VPOS="658" HPOS="323"/>
|
||||||
|
<String ID="string_99" HPOS="338" VPOS="657" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="14" VPOS="657" HPOS="383"/>
|
||||||
|
<String ID="string_100" HPOS="397" VPOS="660" WIDTH="104" HEIGHT="35" WC="0.94" CONTENT="amet."/>
|
||||||
|
</TextLine>
|
||||||
|
</TextBlock>
|
||||||
|
</PrintSpace>
|
||||||
|
</Page>
|
||||||
|
</Layout>
|
||||||
|
</alto>
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,47 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||||
|
<Metadata>
|
||||||
|
<Creator></Creator>
|
||||||
|
<Created>2019-07-26T13:59:00</Created>
|
||||||
|
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||||
|
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||||
|
<TextRegion id="tempReg357564684568544579089">
|
||||||
|
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||||
|
<TextLine id="l0">
|
||||||
|
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l1">
|
||||||
|
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l2">
|
||||||
|
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l3">
|
||||||
|
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l4">
|
||||||
|
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l5">
|
||||||
|
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine>
|
||||||
|
<TextLine id="l6">
|
||||||
|
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||||
|
<TextRegion id="r7" type="paragraph">
|
||||||
|
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||||
|
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||||
|
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||||
|
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||||
|
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||||
|
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||||
|
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
@ -0,0 +1,138 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||||
|
<Description>
|
||||||
|
<MeasurementUnit>pixel</MeasurementUnit>
|
||||||
|
<sourceImageInformation>
|
||||||
|
<fileName> </fileName>
|
||||||
|
</sourceImageInformation>
|
||||||
|
<OCRProcessing ID="OCR_0">
|
||||||
|
<ocrProcessingStep>
|
||||||
|
<processingSoftware>
|
||||||
|
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||||
|
</processingSoftware>
|
||||||
|
</ocrProcessingStep>
|
||||||
|
</OCRProcessing>
|
||||||
|
</Description>
|
||||||
|
<Layout>
|
||||||
|
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||||
|
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||||
|
<TextBlock ID="block_0" HPOS="234" VPOS="244" WIDTH="1966" HEIGHT="387">
|
||||||
|
<TextLine ID="line_0" HPOS="237" VPOS="244" WIDTH="1963" HEIGHT="48">
|
||||||
|
<String ID="string_0" HPOS="237" VPOS="248" WIDTH="133" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="248" HPOS="370"/>
|
||||||
|
<String ID="string_1" HPOS="384" VPOS="247" WIDTH="120" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="247" HPOS="504"/>
|
||||||
|
<String ID="string_2" HPOS="519" VPOS="246" WIDTH="103" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="246" HPOS="622"/>
|
||||||
|
<String ID="string_3" HPOS="636" VPOS="247" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="247" HPOS="682"/>
|
||||||
|
<String ID="string_4" HPOS="696" VPOS="252" WIDTH="105" HEIGHT="36" WC="0.97" CONTENT="amet,"/><SP WIDTH="17" VPOS="252" HPOS="801"/>
|
||||||
|
<String ID="string_5" HPOS="818" VPOS="251" WIDTH="202" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="14" VPOS="251" HPOS="1020"/>
|
||||||
|
<String ID="string_6" HPOS="1034" VPOS="244" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="244" HPOS="1241"/>
|
||||||
|
<String ID="string_7" HPOS="1256" VPOS="244" WIDTH="86" HEIGHT="43" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="244" HPOS="1342"/>
|
||||||
|
<String ID="string_8" HPOS="1358" VPOS="244" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="244" HPOS="1423"/>
|
||||||
|
<String ID="string_9" HPOS="1438" VPOS="244" WIDTH="99" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="244" HPOS="1537"/>
|
||||||
|
<String ID="string_10" HPOS="1551" VPOS="255" WIDTH="164" HEIGHT="35" WC="0.97" CONTENT="nonumy"/><SP WIDTH="15" VPOS="255" HPOS="1715"/>
|
||||||
|
<String ID="string_11" HPOS="1730" VPOS="244" WIDTH="139" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="244" HPOS="1869"/>
|
||||||
|
<String ID="string_12" HPOS="1882" VPOS="250" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/><SP WIDTH="13" VPOS="250" HPOS="2022"/>
|
||||||
|
<String ID="string_13" HPOS="2035" VPOS="244" WIDTH="165" HEIGHT="35" WC="0.96" CONTENT="invidunt"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_1" HPOS="237" VPOS="301" WIDTH="1913" HEIGHT="49">
|
||||||
|
<String ID="string_14" HPOS="237" VPOS="310" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="310" HPOS="276"/>
|
||||||
|
<String ID="string_15" HPOS="289" VPOS="304" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="304" HPOS="412"/>
|
||||||
|
<String ID="string_16" HPOS="428" VPOS="310" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="14" VPOS="310" HPOS="462"/>
|
||||||
|
<String ID="string_17" HPOS="476" VPOS="304" WIDTH="123" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="15" VPOS="304" HPOS="599"/>
|
||||||
|
<String ID="string_18" HPOS="614" VPOS="313" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="313" HPOS="747"/>
|
||||||
|
<String ID="string_19" HPOS="761" VPOS="302" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="302" HPOS="944"/>
|
||||||
|
<String ID="string_20" HPOS="959" VPOS="308" WIDTH="81" HEIGHT="36" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="308" HPOS="1040"/>
|
||||||
|
<String ID="string_21" HPOS="1057" VPOS="301" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="301" HPOS="1122"/>
|
||||||
|
<String ID="string_22" HPOS="1136" VPOS="301" WIDTH="97" HEIGHT="36" WC="0.95" CONTENT="diam"/><SP WIDTH="13" VPOS="301" HPOS="1233"/>
|
||||||
|
<String ID="string_23" HPOS="1246" VPOS="301" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="13" VPOS="301" HPOS="1429"/>
|
||||||
|
<String ID="string_24" HPOS="1442" VPOS="303" WIDTH="51" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="303" HPOS="1493"/>
|
||||||
|
<String ID="string_25" HPOS="1505" VPOS="312" WIDTH="88" HEIGHT="25" WC="0.96" CONTENT="vero"/><SP WIDTH="17" VPOS="312" HPOS="1593"/>
|
||||||
|
<String ID="string_26" HPOS="1610" VPOS="312" WIDTH="64" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="16" VPOS="312" HPOS="1674"/>
|
||||||
|
<String ID="string_27" HPOS="1690" VPOS="308" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="308" HPOS="1725"/>
|
||||||
|
<String ID="string_28" HPOS="1739" VPOS="312" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="15" VPOS="312" HPOS="1907"/>
|
||||||
|
<String ID="string_29" HPOS="1922" VPOS="308" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="11" VPOS="308" HPOS="1956"/>
|
||||||
|
<String ID="string_30" HPOS="1967" VPOS="302" WIDTH="96" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="302" HPOS="2063"/>
|
||||||
|
<String ID="string_31" HPOS="2079" VPOS="301" WIDTH="71" HEIGHT="36" WC="0.96" CONTENT="duo"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_2" HPOS="238" VPOS="359" WIDTH="1928" HEIGHT="46">
|
||||||
|
<String ID="string_32" HPOS="238" VPOS="361" WIDTH="144" HEIGHT="36" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="361" HPOS="382"/>
|
||||||
|
<String ID="string_33" HPOS="398" VPOS="368" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="368" HPOS="432"/>
|
||||||
|
<String ID="string_34" HPOS="447" VPOS="372" WIDTH="41" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="14" VPOS="372" HPOS="488"/>
|
||||||
|
<String ID="string_35" HPOS="502" VPOS="361" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="361" HPOS="638"/>
|
||||||
|
<String ID="string_36" HPOS="657" VPOS="363" WIDTH="75" HEIGHT="33" WC="0.97" CONTENT="Stet"/><SP WIDTH="14" VPOS="363" HPOS="732"/>
|
||||||
|
<String ID="string_37" HPOS="746" VPOS="360" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="360" HPOS="830"/>
|
||||||
|
<String ID="string_38" HPOS="843" VPOS="359" WIDTH="91" HEIGHT="36" WC="0.96" CONTENT="kasd"/><SP WIDTH="13" VPOS="359" HPOS="934"/>
|
||||||
|
<String ID="string_39" HPOS="947" VPOS="359" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="359" HPOS="1155"/>
|
||||||
|
<String ID="string_40" HPOS="1171" VPOS="370" WIDTH="47" HEIGHT="24" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="370" HPOS="1218"/>
|
||||||
|
<String ID="string_41" HPOS="1234" VPOS="370" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="370" HPOS="1295"/>
|
||||||
|
<String ID="string_42" HPOS="1308" VPOS="359" WIDTH="172" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="359" HPOS="1480"/>
|
||||||
|
<String ID="string_43" HPOS="1495" VPOS="365" WIDTH="145" HEIGHT="30" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="365" HPOS="1640"/>
|
||||||
|
<String ID="string_44" HPOS="1656" VPOS="365" WIDTH="55" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="13" VPOS="365" HPOS="1711"/>
|
||||||
|
<String ID="string_45" HPOS="1724" VPOS="361" WIDTH="131" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="361" HPOS="1855"/>
|
||||||
|
<String ID="string_46" HPOS="1870" VPOS="360" WIDTH="119" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="360" HPOS="1989"/>
|
||||||
|
<String ID="string_47" HPOS="2004" VPOS="359" WIDTH="103" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="359" HPOS="2107"/>
|
||||||
|
<String ID="string_48" HPOS="2121" VPOS="360" WIDTH="45" HEIGHT="34" WC="0.96" CONTENT="sit"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_3" HPOS="238" VPOS="416" WIDTH="1905" HEIGHT="48">
|
||||||
|
<String ID="string_49" HPOS="238" VPOS="425" WIDTH="105" HEIGHT="29" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="425" HPOS="343"/>
|
||||||
|
<String ID="string_50" HPOS="359" VPOS="421" WIDTH="132" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="421" HPOS="491"/>
|
||||||
|
<String ID="string_51" HPOS="504" VPOS="420" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="420" HPOS="625"/>
|
||||||
|
<String ID="string_52" HPOS="640" VPOS="418" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="418" HPOS="744"/>
|
||||||
|
<String ID="string_53" HPOS="758" VPOS="419" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="15" VPOS="419" HPOS="803"/>
|
||||||
|
<String ID="string_54" HPOS="818" VPOS="424" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="amet,"/><SP WIDTH="17" VPOS="424" HPOS="922"/>
|
||||||
|
<String ID="string_55" HPOS="939" VPOS="422" WIDTH="201" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="15" VPOS="422" HPOS="1140"/>
|
||||||
|
<String ID="string_56" HPOS="1155" VPOS="416" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="416" HPOS="1362"/>
|
||||||
|
<String ID="string_57" HPOS="1377" VPOS="417" WIDTH="86" HEIGHT="42" WC="0.96" CONTENT="elitr,"/><SP WIDTH="17" VPOS="417" HPOS="1463"/>
|
||||||
|
<String ID="string_58" HPOS="1480" VPOS="416" WIDTH="66" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="416" HPOS="1546"/>
|
||||||
|
<String ID="string_59" HPOS="1561" VPOS="416" WIDTH="98" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="416" HPOS="1659"/>
|
||||||
|
<String ID="string_60" HPOS="1673" VPOS="427" WIDTH="163" HEIGHT="35" WC="0.96" CONTENT="nonumy"/><SP WIDTH="16" VPOS="427" HPOS="1836"/>
|
||||||
|
<String ID="string_61" HPOS="1852" VPOS="416" WIDTH="138" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="416" HPOS="1990"/>
|
||||||
|
<String ID="string_62" HPOS="2003" VPOS="422" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_4" HPOS="236" VPOS="474" WIDTH="1897" HEIGHT="47">
|
||||||
|
<String ID="string_63" HPOS="236" VPOS="476" WIDTH="166" HEIGHT="35" WC="0.96" CONTENT="invidunt"/><SP WIDTH="14" VPOS="476" HPOS="402"/>
|
||||||
|
<String ID="string_64" HPOS="416" VPOS="482" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="12" VPOS="482" HPOS="455"/>
|
||||||
|
<String ID="string_65" HPOS="467" VPOS="476" WIDTH="122" HEIGHT="35" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="476" HPOS="589"/>
|
||||||
|
<String ID="string_66" HPOS="605" VPOS="482" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="482" HPOS="639"/>
|
||||||
|
<String ID="string_67" HPOS="654" VPOS="475" WIDTH="125" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="475" HPOS="779"/>
|
||||||
|
<String ID="string_68" HPOS="793" VPOS="484" WIDTH="131" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="15" VPOS="484" HPOS="924"/>
|
||||||
|
<String ID="string_69" HPOS="939" VPOS="474" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="474" HPOS="1121"/>
|
||||||
|
<String ID="string_70" HPOS="1136" VPOS="480" WIDTH="81" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="18" VPOS="480" HPOS="1217"/>
|
||||||
|
<String ID="string_71" HPOS="1235" VPOS="474" WIDTH="63" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="474" HPOS="1298"/>
|
||||||
|
<String ID="string_72" HPOS="1313" VPOS="474" WIDTH="97" HEIGHT="35" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="474" HPOS="1410"/>
|
||||||
|
<String ID="string_73" HPOS="1423" VPOS="474" WIDTH="186" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="474" HPOS="1609"/>
|
||||||
|
<String ID="string_74" HPOS="1623" VPOS="475" WIDTH="50" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="475" HPOS="1673"/>
|
||||||
|
<String ID="string_75" HPOS="1685" VPOS="485" WIDTH="89" HEIGHT="24" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="485" HPOS="1774"/>
|
||||||
|
<String ID="string_76" HPOS="1790" VPOS="484" WIDTH="63" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="484" HPOS="1853"/>
|
||||||
|
<String ID="string_77" HPOS="1868" VPOS="480" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="480" HPOS="1902"/>
|
||||||
|
<String ID="string_78" HPOS="1916" VPOS="484" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="16" VPOS="484" HPOS="2084"/>
|
||||||
|
<String ID="string_79" HPOS="2100" VPOS="480" WIDTH="33" HEIGHT="29" WC="0.96" CONTENT="et"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_5" HPOS="234" VPOS="531" WIDTH="1950" HEIGHT="47">
|
||||||
|
<String ID="string_80" HPOS="234" VPOS="534" WIDTH="98" HEIGHT="44" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="534" HPOS="332"/>
|
||||||
|
<String ID="string_81" HPOS="348" VPOS="533" WIDTH="71" HEIGHT="35" WC="0.96" CONTENT="duo"/><SP WIDTH="16" VPOS="533" HPOS="419"/>
|
||||||
|
<String ID="string_82" HPOS="435" VPOS="533" WIDTH="143" HEIGHT="35" WC="0.96" CONTENT="dolores"/><SP WIDTH="15" VPOS="533" HPOS="578"/>
|
||||||
|
<String ID="string_83" HPOS="593" VPOS="539" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="539" HPOS="628"/>
|
||||||
|
<String ID="string_84" HPOS="642" VPOS="543" WIDTH="42" HEIGHT="25" WC="0.97" CONTENT="ea"/><SP WIDTH="14" VPOS="543" HPOS="684"/>
|
||||||
|
<String ID="string_85" HPOS="698" VPOS="533" WIDTH="137" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="533" HPOS="835"/>
|
||||||
|
<String ID="string_86" HPOS="853" VPOS="534" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="534" HPOS="927"/>
|
||||||
|
<String ID="string_87" HPOS="941" VPOS="531" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="531" HPOS="1025"/>
|
||||||
|
<String ID="string_88" HPOS="1038" VPOS="531" WIDTH="89" HEIGHT="35" WC="0.96" CONTENT="kasd"/><SP WIDTH="15" VPOS="531" HPOS="1127"/>
|
||||||
|
<String ID="string_89" HPOS="1142" VPOS="531" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="531" HPOS="1350"/>
|
||||||
|
<String ID="string_90" HPOS="1366" VPOS="542" WIDTH="48" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="542" HPOS="1414"/>
|
||||||
|
<String ID="string_91" HPOS="1430" VPOS="542" WIDTH="62" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="542" HPOS="1492"/>
|
||||||
|
<String ID="string_92" HPOS="1505" VPOS="531" WIDTH="173" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="531" HPOS="1678"/>
|
||||||
|
<String ID="string_93" HPOS="1693" VPOS="538" WIDTH="144" HEIGHT="29" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="538" HPOS="1837"/>
|
||||||
|
<String ID="string_94" HPOS="1853" VPOS="537" WIDTH="53" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="14" VPOS="537" HPOS="1906"/>
|
||||||
|
<String ID="string_95" HPOS="1920" VPOS="533" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="533" HPOS="2050"/>
|
||||||
|
<String ID="string_96" HPOS="2064" VPOS="532" WIDTH="120" HEIGHT="44" WC="0.95" CONTENT="ipsum"/>
|
||||||
|
</TextLine>
|
||||||
|
<TextLine ID="line_6" HPOS="237" VPOS="590" WIDTH="282" HEIGHT="41">
|
||||||
|
<String ID="string_97" HPOS="237" VPOS="590" WIDTH="104" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="590" HPOS="341"/>
|
||||||
|
<String ID="string_98" HPOS="356" VPOS="591" WIDTH="45" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="591" HPOS="401"/>
|
||||||
|
<String ID="string_99" HPOS="415" VPOS="597" WIDTH="104" HEIGHT="34" WC="0.96" CONTENT="amet."/>
|
||||||
|
</TextLine>
|
||||||
|
</TextBlock>
|
||||||
|
</PrintSpace>
|
||||||
|
</Page>
|
||||||
|
</Layout>
|
||||||
|
</alto>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1 @@
|
|||||||
|
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
@ -0,0 +1,63 @@
|
|||||||
|
from .util import unzip
|
||||||
|
from .. import align
|
||||||
|
|
||||||
|
|
||||||
|
def test_left_empty():
    """An empty left input pairs every right-hand character with None."""
    result = list(align('', 'foo'))
    expected = [(None, 'f'), (None, 'o'), (None, 'o')]
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_right_empty():
    """An empty right input pairs every left-hand character with None."""
    result = list(align('foo', ''))
    expected = [('f', None), ('o', None), ('o', None)]
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_left_longer():
    """A surplus trailing character on the left is paired with None."""
    result = list(align('food', 'foo'))
    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_right_longer():
    """A surplus trailing character on the right is paired with None."""
    result = list(align('foo', 'food'))
    expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_some_diff():
    """Check both sides of the alignment of 'abcde' against 'aaadef'."""
    result = list(align('abcde', 'aaadef'))
    # Split the list of pairs into the left-hand and right-hand sequences.
    left, right = unzip(result)
    assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
    assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
|
||||||
|
|
||||||
|
|
||||||
|
def test_longer():
    """Align two short sentences that differ by a deletion, an insertion and a replacement."""
    s1 = 'Dies ist eine Tst!'
    s2 = 'Dies ist ein Test.'

    result = list(align(s1, s2))  # ; diffprint(*unzip(result))
    expected = [
        ('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
        ('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
        # 'eine' vs. 'ein': the extra 'e' on the left has no counterpart.
        ('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
        # 'Tst!' vs. 'Test.': missing 'e' on the left, '!' replaced by '.'.
        ('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.'),
    ]
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_completely_different():
    """Two five-character strings sharing no characters align to five pairs."""
    assert len(list(align('abcde', 'fghij'))) == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_with_some_fake_ocr_errors():
    """Align a clean line against one with leading junk, inserted junk and misread characters."""
    result = list(align(
        'Über die vielen Sorgen wegen desselben vergaß',
        'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab',
    ))
    left, right = unzip(result)

    # Beginning: the 18 leading junk characters on the right have no
    # counterpart on the left.
    assert list(left[:18]) == [None]*18
    assert list(right[:18]) == list('SomeJunk MoreJunk ')

    # End: the final characters are aligned with each other.
    assert list(left[-1:]) == ['ß']
    assert list(right[-1:]) == ['b']
|
@ -0,0 +1,37 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import math
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
from .. import character_error_rate
|
||||||
|
|
||||||
|
|
||||||
|
def test_character_error_rate():
    """Check character_error_rate() on simple inputs, including empty strings."""
    assert character_error_rate('a', 'a') == 0
    assert character_error_rate('a', 'b') == 1/1
    assert character_error_rate('Foo', 'Bar') == 3/3

    assert character_error_rate('Foo', '') == 3/3

    # Empty first argument: 0 if the second is empty too, infinite otherwise.
    assert character_error_rate('', '') == 0
    assert math.isinf(character_error_rate('', 'Foo'))

    assert character_error_rate('Foo', 'Food') == 1/3
    assert character_error_rate('Fnord', 'Food') == 2/5
    assert character_error_rate('Müll', 'Mull') == 1/4
    assert character_error_rate('Abstand', 'Sand') == 4/7
|
||||||
|
|
||||||
|
|
||||||
|
def test_character_error_rate_hard():
    """Check character_error_rate() on inputs that differ in Unicode composition."""
    s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
    s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!')  # Different, decomposed!
    assert character_error_rate(s1, s2) == 1/19

    s1 = 'Schlyñ'
    assert len(s1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
    s2 = 'Schlym̃'
    assert len(s2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points

    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
    assert character_error_rate(s2, s1) == 1/6
    assert character_error_rate(s1, s2) == 1/6
|
@ -0,0 +1,40 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
from .. import levenshtein, distance
|
||||||
|
|
||||||
|
|
||||||
|
def test_levenshtein():
    """Check levenshtein() on plain strings, including empty ones."""
    assert levenshtein('a', 'a') == 0
    assert levenshtein('a', 'b') == 1
    assert levenshtein('Foo', 'Bar') == 3

    assert levenshtein('', '') == 0
    assert levenshtein('Foo', '') == 3
    assert levenshtein('', 'Foo') == 3

    assert levenshtein('Foo', 'Food') == 1
    assert levenshtein('Fnord', 'Food') == 2
    assert levenshtein('Müll', 'Mull') == 1
    assert levenshtein('Abstand', 'Sand') == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_levenshtein_other_sequences():
    """Check levenshtein() on lists of strings, where each list element counts as one item."""
    assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
    assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_distance():
    """Check distance() on plain strings and on strings that differ in Unicode composition."""
    assert distance('Fnord', 'Food') == 2
    assert distance('Müll', 'Mull') == 1

    # The same word in composed and decomposed form must have distance 0.
    word1 = unicodedata.normalize('NFC', 'Schlyñ')
    word2 = unicodedata.normalize('NFD', 'Schlyñ')  # Different, decomposed!
    assert distance(word1, word2) == 0

    word1 = 'Schlyñ'
    assert len(word1) == 6  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
    word2 = 'Schlym̃'
    assert len(word2) == 7  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
    assert distance(word1, word2) == 1
|
@ -0,0 +1,38 @@
|
|||||||
|
from .. import seq_editops, editops
|
||||||
|
|
||||||
|
|
||||||
|
def test_trivial():
    """Identical inputs, including two empty strings, yield no edit operations."""
    assert seq_editops('abc', 'abc') == []
    assert seq_editops('', '') == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert():
    """A single missing element is reported as one 'insert' at its position."""
    assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
    assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
    assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
    assert seq_editops('', 'a') == [('insert', 0, 0)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple():
    """An insert followed by a replace is reported as two operations, in order."""
    assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete():
    """Check 'delete' operations, alone and combined with inserts and replaces."""
    assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
    assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
    assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
    assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
    assert seq_editops('Foo', '') == [
        ('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0),
    ]
    assert seq_editops('Foolish', 'Foo') == [
        ('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_ambiguous():
    """Pin the edit operations reported for a pair that mixes insertions and a replacement."""
    expected_ops = [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
    actual_ops = seq_editops('bcd', 'abcef')
    assert actual_ops == expected_ops
|
||||||
|
|
||||||
|
|
||||||
|
def test_editops():
    """Check editops() on inputs where working with grapheme clusters makes a difference."""
    # In each pair, one word uses a composed form and the other one does not.
    cases = [
        ('Schlyñ', 'Schlym̃', [('replace', 5, 5)]),
        ('oͤde', 'öde', [('replace', 0, 0)]),
    ]
    for left, right, expected_ops in cases:
        assert editops(left, right) == expected_ops
|
@ -0,0 +1,23 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from .. import align, page_text
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_align_page_files():
    """Align the ground truth PAGE file with the fake OCR PAGE file and count the differences."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # → 4 elements in the alignment should be different.
    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    gt = page_text(ET.parse(gt_path))
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')
    ocr = page_text(ET.parse(ocr_path))

    aligned = list(align(gt, ocr))
    differing = sum(1 for left, right in aligned if left != right)
    assert differing == 4
|
@ -0,0 +1,35 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from .. import character_error_rate, page_text, alto_text
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_character_error_rate_between_page_files():
    """Character error rate between the ground truth and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    gt = page_text(ET.parse(gt_path))
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')
    ocr = page_text(ET.parse(ocr_path))

    reference_length = 470 + 1 + 311  # 2 TextRegions, 1 \n
    assert character_error_rate(gt, ocr) == 4/reference_length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_character_error_rate_between_page_alto():
    """A PAGE ground truth and an ALTO OCR result with equal text give a character error rate of 0."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))

    # Both files are expected to contain exactly the same text.
    assert gt == ocr
    assert character_error_rate(gt, ocr) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    """Character error rate between the PAGE ground truth and the ALTO OCR result of the bad scan."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))

    expected_rate = 8/591  # Manually verified
    assert character_error_rate(gt, ocr) == expected_rate
|
@ -0,0 +1,35 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from .. import distance, page_text, alto_text
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_distance_between_page_files():
    """Edit distance between the ground truth and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    gt = page_text(ET.parse(gt_path))
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')
    ocr = page_text(ET.parse(ocr_path))

    expected_distance = 4
    assert distance(gt, ocr) == expected_distance
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_distance_between_page_alto():
    """A PAGE ground truth and an ALTO OCR result with equal text have distance 0."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))

    # Both files are expected to contain exactly the same text.
    assert gt == ocr
    assert distance(gt, ocr) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_distance_between_page_alto_2():
    """Edit distance between the PAGE ground truth and the ALTO OCR result of the bad scan."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))

    expected_distance = 8  # Manually verified
    assert distance(gt, ocr) == expected_distance
|
@ -0,0 +1,43 @@
|
|||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from .. import word_error_rate, words, page_text, alto_text
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_word_error_rate_between_page_files():
    """Word error rate between the ground truth and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. → 3 changed words
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    gt = page_text(ET.parse(gt_path))

    # Manually verified word count per line
    words_per_line = (7, 6, 5, 8, 7, 6, 7, 8, 6, 7, 7, 5, 6, 8, 8, 7, 7, 6, 5, 4)
    gt_word_count = sum(words_per_line)
    assert len(list(words(gt))) == gt_word_count

    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')
    ocr = page_text(ET.parse(ocr_path))
    assert word_error_rate(gt, ocr) == 3/gt_word_count
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_word_error_rate_between_page_alto():
    """A PAGE ground truth and an ALTO OCR result with equal text give a word error rate of 0."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.gt.page.xml')))
    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))

    # Both files are expected to contain exactly the same text.
    assert gt == ocr
    assert word_error_rate(gt, ocr) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
    """Word error rate between the PAGE ground truth and the ALTO OCR result of the bad scan."""
    lorem_dir = os.path.join(data_dir, 'lorem-ipsum')
    gt = page_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.gt.page.xml')))

    # Manually verified word count per line
    words_per_line = (14, 18, 17, 14, 17, 17, 3)
    gt_word_count = sum(words_per_line)
    assert len(list(words(gt))) == gt_word_count

    ocr = alto_text(ET.parse(os.path.join(lorem_dir, 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))

    # Manually verified, 6 words are wrong, 1 got split (=2 errors)
    expected_rate = 7/gt_word_count
    assert word_error_rate(gt, ocr) == expected_rate
|
@ -0,0 +1,24 @@
|
|||||||
|
from itertools import zip_longest
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import colorama
|
||||||
|
|
||||||
|
|
||||||
|
def diffprint(x, y):
    """Print x and y next to each other, with differing pairs in red.

    If x is iterable, x and y are walked in parallel and every pair of elements is printed on its
    own line; a missing element of the shorter input is shown as None. Otherwise x and y are printed
    as one single pair.
    """

    def _print_pair(left, right):
        # Wrap the pair in colour codes only when the two values differ.
        if left != right:
            fields = (colorama.Fore.RED, left, right, colorama.Fore.RESET)
        else:
            fields = (left, right)
        print(*fields)

    if not isinstance(x, Iterable):
        _print_pair(x, y)
        return

    for left, right in zip_longest(x, y):
        _print_pair(left, right)
|
||||||
|
|
||||||
|
|
||||||
|
def unzip(l):
    """Undo a zip(): turn an iterable of tuples into an iterator over one tuple per position."""
    transposed = zip(*l)
    return transposed
|
@ -0,0 +1,69 @@
|
|||||||
|
from __future__ import division
|
||||||
|
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
import uniseg.wordbreak
|
||||||
|
|
||||||
|
from .edit_distance import levenshtein
|
||||||
|
|
||||||
|
|
||||||
|
def words(s):
    """Yield the words of s.

    We follow Unicode Standard Annex #29 on Unicode Text Segmentation: s is split on word boundaries
    using uniseg.wordbreak.words(), and all "words" that contain only whitespace, punctuation or
    similar characters are dropped. Characters from the Private Use Area count as letters.

    As a side effect, uniseg.wordbreak.word_break is replaced by a wrapper the first time the
    returned generator is advanced; the wrapper stays installed afterwards.
    """

    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    #
    # The patch is installed only once, recognised by a marker attribute on the wrapper. Wrapping
    # the current function unconditionally would wrap the previous wrapper again on every call, so
    # the call chain would grow by one level per call until the recursion limit is hit.
    old_word_break = uniseg.wordbreak.word_break
    if not getattr(old_word_break, '_dinglehopper_private_use_patch', False):

        def new_word_break(c, index=0):
            if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
                return 'ALetter'
            else:
                return old_word_break(c, index)

        new_word_break._dinglehopper_private_use_patch = True
        uniseg.wordbreak.word_break = new_word_break

    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
    def unwanted(c):

        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        # NOTE(review): no Unicode general category starts with 'O', so that entry never matches;
        # it is kept unchanged here.
        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
        unwanted_subcategories = 'Cc', 'Cf'

        subcat = unicodedata.category(c)
        cat = subcat[0]
        return cat in unwanted_categories or subcat in unwanted_subcategories

    for word in uniseg.wordbreak.words(s):
        if not all(unwanted(c) for c in word):
            yield word
|
||||||
|
|
||||||
|
|
||||||
|
def words_normalized(s):
    """Return the words of s after normalizing s to Unicode NFC."""
    nfc_text = unicodedata.normalize('NFC', s)
    return words(nfc_text)
|
||||||
|
|
||||||
|
|
||||||
|
def word_error_rate(reference, compared):
    """Return the word error rate of compared, measured against reference.

    If reference is a str, both arguments are split into NFC-normalized words first; otherwise both
    are taken as ready-made sequences of words. The rate is the Levenshtein distance between the two
    word sequences divided by the number of reference words. It is 0 if the sequences are equal and
    infinity if they differ while the reference has no words.
    """
    # Strings get tokenized, anything else is just materialized as a list.
    to_words = words_normalized if isinstance(reference, str) else iter
    reference_seq = list(to_words(reference))
    compared_seq = list(to_words(compared))

    edit_count = levenshtein(reference_seq, compared_seq)
    if edit_count == 0:
        return 0

    reference_length = len(reference_seq)
    # Differences against an empty reference cannot be expressed as a finite rate.
    return edit_count / reference_length if reference_length else float('inf')
|
||||||
|
|
||||||
|
|
||||||
|
def unordered_word_error_rate(reference, compared):
    """Return the word error rate of the two texts after sorting their words.

    Both inputs are split into NFC-normalized words and each word list is sorted before the
    comparison, so the original word order does not influence the result.
    """
    sorted_reference, sorted_compared = (
        sorted(words_normalized(text)) for text in (reference, compared)
    )
    return word_error_rate(sorted_reference, sorted_compared)
|
@ -0,0 +1,6 @@
|
|||||||
|
click
|
||||||
|
jinja2
|
||||||
|
lxml
|
||||||
|
uniseg
|
||||||
|
numpy
|
||||||
|
colorama
|
@ -0,0 +1,23 @@
|
|||||||
|
from io import open
|
||||||
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
# The requirements file is passed on to setuptools as one newline-separated string.
with open('requirements.txt') as fp:
    install_requires = fp.read()

# Read the README inside a with block so that the file handle is closed again,
# instead of leaving an open file object behind.
with open('README.md', 'r', encoding='utf-8') as fp:
    long_description = fp.read()

setup(
    name='dinglehopper',
    author_email='qurator@sbb.spk-berlin.de',
    description='The OCR evaluation tool',
    long_description=long_description,
    long_description_content_type='text/markdown',
    keywords='qurator ocr',
    license='Apache',
    namespace_packages=['qurator'],
    # Test packages are not installed.
    packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']),
    install_requires=install_requires,
    entry_points={
        'console_scripts': [
            'dinglehopper=qurator.dinglehopper.cli:main',
        ]
    }
)
|
Loading…
Reference in New Issue