Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340.
parent 1303a7d92f
commit 48a31ce672
@ -1,2 +0,0 @@
__pycache__
*.egg-info
Binary file not shown. (new image, 144 KiB)
@ -0,0 +1,14 @@
dist: xenial # required for Python >= 3.7
language: python
python:
  - "3.5"
  - "3.6"
  - "3.7"
  - "3.8"

install:
  - pip install -r requirements.txt

script:
  - pytest
@ -1,9 +0,0 @@
FROM python:3

ADD requirements.txt /
RUN pip install --proxy=http-proxy.sbb.spk-berlin.de:3128 -r requirements.txt

COPY . /usr/src/sbb_textline_detector
RUN pip install /usr/src/sbb_textline_detector

ENTRYPOINT ["sbb_textline_detector"]
@ -1,30 +1,49 @@
# Textline Detection
dinglehopper
============

## Introduction
This tool performs textline detection from document image data and returns the results as PAGE-XML.
dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.

## Installation
[![Build Status](https://travis-ci.org/qurator-spk/dinglehopper.svg?branch=master)](https://travis-ci.org/qurator-spk/dinglehopper)

`pip install .`
Goals
-----
* Useful
  * As a UI tool
  * For an automated evaluation
  * As a library
* Unicode support

## Models
In order to run this tool you also need trained models. You can download our pre-trained models from here:
https://file.spk-berlin.de:8443/textline_detection/

## Usage
Installation
------------
It's best to use pip, e.g.:
~~~
sudo pip install .
~~~

`sbb_textline_detector -i <image file name> -o <directory to write output xml> -m <directory of models>`
Usage
-----
~~~
dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
~~~
This generates `report.html` and `report.json`.

## Usage with OCR-D

As an OCR-D processor:
~~~
ocrd-example-binarize -I OCR-D-IMG -O OCR-D-IMG-BIN
ocrd-sbb-textline-detector -I OCR-D-IMG-BIN -O OCR-D-SEG-LINE-SBB \
    -p '{ "model": "/path/to/the/models/textline_detection" }'
ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
~~~
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.

Segmentation works on raw RGB images, but respects and retains
`AlternativeImage`s from binarization steps, so it's a good idea to do
binarization first, then perform the textline detection. The binarization
processor used must produce an `AlternativeImage` for the binarized image, not
replace the original raw RGB image.

![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)

Testing
-------
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
~~~
virtualenv -p /usr/bin/python3 venv
. venv/bin/activate
pip install -r requirements.txt
pip install pytest
pytest
~~~
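The Goals list above mentions use as a library. A minimal sketch of what that could look like, assuming the package has been installed with `pip install .`; it only uses names re-exported by `qurator/dinglehopper/__init__.py` later in this diff, and the file names are placeholders:

~~~python
# Library-usage sketch. `text`, `character_error_rate` and `word_error_rate`
# are re-exported by qurator.dinglehopper; the file names are hypothetical.
from qurator.dinglehopper import text, character_error_rate, word_error_rate

gt_text = text('some-document.gt.page.xml')
ocr_text = text('some-document.ocr.alto.xml')

print('CER:', character_error_rate(gt_text, ocr_text))
print('WER:', word_error_rate(gt_text, ocr_text))
~~~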
@ -1 +1 @@
qurator/sbb_textline_detector/ocrd-tool.json
qurator/dinglehopper/ocrd-tool.json
@ -0,0 +1,4 @@
[pytest]
markers =
    integration: integration tests
    serial
@ -1 +1,2 @@
__import__('pkg_resources').declare_namespace(__name__)

@ -0,0 +1,6 @@
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.7 (dinglehopper)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="projectConfiguration" value="pytest" />
    <option name="PROJECT_TEST_RUNNER" value="pytest" />
  </component>
</module>
@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dinglehopper)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
</project>
@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/dinglehopper.iml" filepath="$PROJECT_DIR$/.idea/dinglehopper.iml" />
    </modules>
  </component>
</project>
@ -0,0 +1,5 @@
from .ocr_files import *
from .substitute_equivalences import *
from .character_error_rate import *
from .word_error_rate import *
from .align import *
@ -0,0 +1,43 @@
from .edit_distance import *


def align(t1, t2):
    """Align text."""
    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
    return seq_align(s1, s2)


def seq_align(s1, s2):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
    ops = seq_editops(s1, s2)
    i = 0
    j = 0

    while i < len(s1) or j < len(s2):
        o = None
        try:
            ot = ops[0]
            if ot[1] == i and ot[2] == j:
                ops = ops[1:]
                o = ot
        except IndexError:
            pass

        if o:
            if o[0] == 'insert':
                yield (None, s2[j])
                j += 1
            elif o[0] == 'delete':
                yield (s1[i], None)
                i += 1
            elif o[0] == 'replace':
                yield (s1[i], s2[j])
                i += 1
                j += 1
        else:
            yield (s1[i], s2[j])
            i += 1
            j += 1
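For orientation, a small usage sketch of the generator above (illustrative only, assuming the package is importable); the exact pairing can depend on which minimum-cost edit script the backtrace picks:

~~~python
# Sketch: aligning two sequences pairs equal elements and yields None on the
# side of an insertion or deletion.
from qurator.dinglehopper.align import seq_align

print(list(seq_align('kitten', 'sitting')))
# One minimum-cost alignment, e.g.:
# [('k', 's'), ('i', 'i'), ('t', 't'), ('t', 't'), ('e', 'i'), ('n', 'n'), (None, 'g')]
~~~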
@ -0,0 +1,21 @@
from __future__ import division

import unicodedata

from uniseg.graphemecluster import grapheme_clusters

from qurator.dinglehopper.edit_distance import distance


def character_error_rate(reference, compared):
    d = distance(reference, compared)
    if d == 0:
        return 0

    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
    if n == 0:
        return float('inf')

    return d/n

    # XXX Should we really count newlines here?
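A worked example of the definition above (illustrative sketch, not part of the commit): the distance is computed over grapheme clusters and divided by the length of the reference.

~~~python
# Sketch: one substituted character out of six reference characters -> CER = 1/6.
from qurator.dinglehopper.character_error_rate import character_error_rate

print(character_error_rate('Führer', 'Fuhrer'))  # 0.1666...
print(character_error_rate('Führer', 'Führer'))  # 0
~~~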
@ -0,0 +1,106 @@
import os

import click
from jinja2 import Environment, FileSystemLoader
from markupsafe import escape


from qurator.dinglehopper import *


def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
    gtx = ''
    ocrx = ''

    def format_thing(t, css_classes=None):
        if t is None:
            html_t = none
            css_classes += ' ellipsis'
        elif t == '\n':
            html_t = '<br>'
        else:
            html_t = escape(t)

        if css_classes:
            return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t)
        else:
            return '{html_t}'.format(html_t=html_t)

    for k, (g, o) in enumerate(align(gt_things, ocr_things)):
        if g == o:
            css_classes = None
        else:
            css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k)

        gtx += joiner + format_thing(g, css_classes)
        ocrx += joiner + format_thing(o, css_classes)

    return \
        '''
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        '''.format(gtx, ocrx)


def process(gt, ocr, report_prefix):
    """Check OCR result against GT.

    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
    Click on a wrapper.
    """

    gt_text = text(gt)
    ocr_text = text(ocr)

    gt_text = substitute_equivalences(gt_text)
    ocr_text = substitute_equivalences(ocr_text)

    cer = character_error_rate(gt_text, ocr_text)
    wer = word_error_rate(gt_text, ocr_text)

    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)

    gt_words = words_normalized(gt_text)
    ocr_words = words_normalized(ocr_text)
    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)

    def json_float(value):
        """Convert a float value to a JSON float.

        This is here so that float('inf') yields "Infinity", not "inf".
        """
        if value == float('inf'):
            return 'Infinity'
        elif value == float('-inf'):
            return '-Infinity'
        else:
            return str(value)

    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
    env.filters['json_float'] = json_float

    for report_suffix in ('.html', '.json'):
        template_fn = 'report' + report_suffix + '.j2'
        out_fn = report_prefix + report_suffix

        template = env.get_template(template_fn)
        template.stream(
            gt=gt, ocr=ocr,
            cer=cer, wer=wer,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report
        ).dump(out_fn)


@click.command()
@click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report')
def main(gt, ocr, report_prefix):
    process(gt, ocr, report_prefix)


if __name__ == '__main__':
    main()
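Because process() stays undecorated, it can also be called directly from Python (the OCR-D wrapper later in this diff does exactly that). A minimal sketch with hypothetical file names:

~~~python
# Sketch: writes report.html and report.json next to the given prefix.
from qurator.dinglehopper.cli import process

process('some-document.gt.page.xml',   # hypothetical ground truth
        'some-document.ocr.alto.xml',  # hypothetical OCR result
        'report')                      # prefix -> report.html / report.json
~~~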
@ -0,0 +1,122 @@
from __future__ import division, print_function

import unicodedata
from functools import partial, lru_cache
from typing import Sequence, Tuple

import numpy as np
from uniseg.graphemecluster import grapheme_clusters


def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
    """Compute the matrix commonly computed to produce the Levenshtein distance.
    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
    edit distance.

    This algorithm is implemented here because we need an implementation that can work with sequences other than
    strings, e.g. lists of grapheme clusters or lists of word strings.
    """

    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
    # sequences to tuples to make them hashable.
    return _levenshtein_matrix(tuple(seq1), tuple(seq2))


@lru_cache(maxsize=10)
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
    """Compute the matrix commonly computed to produce the Levenshtein distance.

    This is an LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
    """
    m = len(seq1)
    n = len(seq2)

    def from_to(start, stop):
        return range(start, stop + 1, 1)

    D = np.zeros((m + 1, n + 1), np.int)
    D[0, 0] = 0
    for i in from_to(1, m):
        D[i, 0] = i
    for j in from_to(1, n):
        D[0, j] = j
    for i in from_to(1, m):
        for j in from_to(1, n):
            D[i, j] = min(
                D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                D[i, j - 1] + 1,  # Insertion
                D[i - 1, j] + 1   # Deletion
            )

    return D


def levenshtein(seq1, seq2):
    """Compute the Levenshtein edit distance between two sequences"""
    m = len(seq1)
    n = len(seq2)

    D = levenshtein_matrix(seq1, seq2)
    return D[m, n]


def levenshtein_matrix_cache_clear():
    """Clear internal Levenshtein matrix cache.

    You want to do this between different input file pairs to decrease memory
    usage by not caching results from prior input files.
    """
    _levenshtein_matrix.cache_clear()


def distance(s1, s2):
    """Compute the Levenshtein edit distance between two Unicode strings

    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
    clusters. This should be the correct way to compare two Unicode strings.
    """
    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
    return levenshtein(s1, s2)


def seq_editops(seq1, seq2):
    """
    Return sequence of edit operations transforming one sequence to another.

    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
    sequences.
    """
    seq1 = list(seq1)
    seq2 = list(seq2)
    m = len(seq1)
    n = len(seq2)
    D = levenshtein_matrix(seq1, seq2)

    def _tail_backtrace(i, j, accumulator):
        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
            return partial(_tail_backtrace, i - 1, j, [('delete', i-1, j)] + accumulator)
        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
            return partial(_tail_backtrace, i, j - 1, [('insert', i, j-1)] + accumulator)
        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
            return partial(_tail_backtrace, i - 1, j - 1, [('replace', i-1, j-1)] + accumulator)
        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
        return accumulator

    def backtrace(i, j):
        result = partial(_tail_backtrace, i, j, [])
        while isinstance(result, partial):
            result = result()

        return result

    b = backtrace(m, n)
    return b


def editops(word1, word2):
    # XXX Note that this returns indices to the _grapheme clusters_, not characters!
    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
    return seq_editops(word1, word2)
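To illustrate the distinction drawn in the distance() docstring (sketch, not part of the commit): after NFC normalization and grapheme clustering, a precomposed and a decomposed spelling of the same text compare as equal, whereas the plain code-point comparison does not.

~~~python
# Sketch: distance() compares NFC-normalized grapheme clusters,
# levenshtein() compares the raw code-point sequences.
from qurator.dinglehopper.edit_distance import distance, levenshtein

precomposed = 'Schlyñ'        # 'ñ' as U+00F1
decomposed = 'Schlyn\u0303'   # 'n' + combining tilde

print(distance(precomposed, decomposed))     # 0
print(levenshtein(precomposed, decomposed))  # 2 (substitution + insertion)
~~~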
File diff suppressed because it is too large
@ -0,0 +1,107 @@
from __future__ import division, print_function

from warnings import warn

from lxml import etree as ET
import sys

from lxml.etree import XMLSyntaxError


def alto_namespace(tree):
    """Return the ALTO namespace used in the given ElementTree.

    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
    check if the file uses any valid ALTO namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == 'alto':
        return root_name.namespace
    else:
        raise ValueError('Not an ALTO tree')


def alto_text(tree):
    """Extract text from the given ALTO ElementTree."""

    nsmap = {'alto': alto_namespace(tree)}

    lines = (
        ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
        for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
    text_ = '\n'.join(lines)

    return text_


def page_namespace(tree):
    """Return the PAGE content namespace used in the given ElementTree.

    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
    do not check if the file uses any valid PAGE namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == 'PcGts':
        return root_name.namespace
    else:
        raise ValueError('Not a PAGE tree')


def page_text(tree):
    """Extract text from the given PAGE content ElementTree."""

    nsmap = {'page': page_namespace(tree)}

    def region_text(region):
        try:
            return region.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
        except AttributeError:
            return None

    region_texts = []
    reading_order = tree.find('.//page:ReadingOrder', namespaces=nsmap)
    if reading_order is not None:
        for group in reading_order.iterfind('./*', namespaces=nsmap):
            if ET.QName(group.tag).localname == 'OrderedGroup':
                region_ref_indexeds = group.findall('./page:RegionRefIndexed', namespaces=nsmap)
                for region_ref_indexed in sorted(region_ref_indexeds, key=lambda r: int(r.attrib['index'])):
                    region_id = region_ref_indexed.attrib['regionRef']
                    region = tree.find('.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap)
                    if region is not None:
                        region_texts.append(region_text(region))
                    else:
                        warn('Not a TextRegion: "%s"' % region_id)
            else:
                raise NotImplementedError
    else:
        for region in tree.iterfind('.//page:TextRegion', namespaces=nsmap):
            region_texts.append(region_text(region))

    # XXX Does a file have to have regions etc.? region vs lines etc.
    # Filter empty region texts
    region_texts = (t for t in region_texts if t)

    text_ = '\n'.join(region_texts)

    return text_


def text(filename):
    """Read the text from the given file.

    Supports PAGE, ALTO and falls back to plain text.
    """

    try:
        tree = ET.parse(filename)
    except XMLSyntaxError:
        with open(filename, 'r') as f:
            return f.read()
    try:
        return page_text(tree)
    except ValueError:
        return alto_text(tree)


if __name__ == '__main__':
    print(text(sys.argv[1]))
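text() dispatches on the XML root element, so the same call covers all three supported inputs (sketch with hypothetical paths):

~~~python
# Sketch: PAGE is tried first, then ALTO; non-XML input is read as plain text.
from qurator.dinglehopper.ocr_files import text

print(text('some-document.gt.page.xml'))   # PAGE: region texts joined by '\n'
print(text('some-document.ocr.alto.xml'))  # ALTO: line texts joined by '\n'
print(text('some-document.txt'))           # plain text fallback
~~~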
@ -0,0 +1,22 @@
{
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
      "executable": "ocrd-dinglehopper",
      "description": "Evaluate OCR text against ground truth with dinglehopper",
      "input_file_grp": [
        "OCR-D-GT-PAGE",
        "OCR-D-OCR"
      ],
      "output_file_grp": [
        "OCR-D-OCR-EVAL"
      ],
      "categories": [
        "Quality assurance"
      ],
      "steps": [
        "recognition/text-recognition"
      ]
    }
  }
}
@ -0,0 +1,71 @@
import json
import os

import click
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import concat_padded, getLogger
from pkg_resources import resource_string

from qurator.dinglehopper.cli import process as cli_process
from qurator.dinglehopper.edit_distance import levenshtein_matrix_cache_clear

log = getLogger('processor.OcrdDinglehopperEvaluate')

OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))


@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)


class OcrdDinglehopperEvaluate(Processor):

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

    def _make_file_id(self, input_file, input_file_grp, n):
        file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        return file_id

    def process(self):
        gt_grp, ocr_grp = self.input_file_grp.split(',')
        for n, page_id in enumerate(self.workspace.mets.physical_pages):
            gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0]
            ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0]
            log.info("INPUT FILES %i / %s ↔ %s", n, gt_file, ocr_file)

            file_id = self._make_file_id(ocr_file, ocr_grp, n)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass
            cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix)

            # Add reports to the workspace
            for report_suffix, mimetype in \
                    [
                        ['.html', 'text/html'],
                        ['.json', 'application/json']
                    ]:
                self.workspace.add_file(
                    ID=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix)

            # Clear cache between files
            levenshtein_matrix_cache_clear()


if __name__ == '__main__':
    ocrd_dinglehopper()
@ -0,0 +1,60 @@
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style type="text/css">
        .gt .diff {
            color: green;
        }
        .ocr .diff {
            color: red;
        }
        .ellipsis {
            opacity: 0.5;
            font-style: italic;
        }
        .diff-highlight {
            border: 2px solid;
            border-radius: 5px;
        }
    </style>
</head>
<body>


<div class="container">

    {{ gt }}<br>
    {{ ocr }}

    <h2>Metrics</h2>
    <p>CER: {{ cer|round(4) }}</p>
    <p>WER: {{ wer|round(4) }}</p>

    <h2>Character differences</h2>
    {{ char_diff_report }}

    <h2>Word differences</h2>
    {{ word_diff_report }}

</div>


<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>

<script>
    {% include 'report.html.js' %}
</script>


</body>
</html>
@ -0,0 +1,14 @@
function find_diff_class(classes) {
    return classes.split(/\s+/).find(x => x.match(/.diff\d.*/));
}

$(document).ready(function() {
    $('.diff').mouseover(function() {
        let c = find_diff_class($(this).attr('class'))
        $('.' + c).addClass('diff-highlight')
    });
    $('.diff').mouseout(function() {
        let c = find_diff_class($(this).attr('class'))
        $('.' + c).removeClass('diff-highlight')
    });
});
@ -0,0 +1,6 @@
{
    "gt": "{{ gt }}",
    "ocr": "{{ ocr }}",
    "cer": {{ cer|json_float }},
    "wer": {{ wer|json_float }}
}
Binary file not shown.
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,287 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
|
||||
<mets:metsHdr CREATEDATE="2017-08-22T14:23:38">
|
||||
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
|
||||
<mets:name>Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015</mets:name>
|
||||
<mets:note>Goobi</mets:note>
|
||||
</mets:agent>
|
||||
</mets:metsHdr>
|
||||
<mets:dmdSec ID="DMDLOG_0000">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:location>
|
||||
<mods:physicalLocation authority="marcorg" displayLabel="Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Berlin, Germany">DE-1</mods:physicalLocation>
|
||||
<mods:shelfLocator>4" Fy 11178</mods:shelfLocator>
|
||||
</mods:location>
|
||||
<mods:originInfo eventType="publication">
|
||||
<mods:place>
|
||||
<mods:placeTerm type="text">Hanau</mods:placeTerm>
|
||||
</mods:place>
|
||||
<mods:dateIssued encoding="iso8601" keyDate="yes">1749</mods:dateIssued>
|
||||
</mods:originInfo>
|
||||
<mods:originInfo eventType="digitization">
|
||||
<mods:place>
|
||||
<mods:placeTerm type="text">Berlin</mods:placeTerm>
|
||||
</mods:place>
|
||||
<mods:dateCaptured encoding="iso8601">2012</mods:dateCaptured>
|
||||
<mods:publisher>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany</mods:publisher>
|
||||
<mods:edition>[Electronic ed.]</mods:edition>
|
||||
</mods:originInfo>
|
||||
<mods:classification authority="ZVDD">Historische Drucke</mods:classification>
|
||||
<mods:classification authority="ZVDD">Rechtswissenschaft</mods:classification>
|
||||
<mods:classification authority="ZVDD">VD18 digital</mods:classification>
|
||||
<mods:recordInfo>
|
||||
<mods:recordIdentifier source="gbv-ppn">PPN718448162</mods:recordIdentifier>
|
||||
</mods:recordInfo>
|
||||
<mods:identifier type="purl">http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000</mods:identifier>
|
||||
<mods:identifier type="vd18">11750219</mods:identifier>
|
||||
<mods:identifier type="PPNanalog">PPN370506340</mods:identifier>
|
||||
<mods:titleInfo>
|
||||
<mods:title>Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti</mods:title>
|
||||
<mods:subTitle>Worinnen allen unpartheyischen Iustitiariis diese unverantwortliche Procedur und dabey gespielte listige Touren klärlich vor Augen gestellet werden</mods:subTitle>
|
||||
</mods:titleInfo>
|
||||
<mods:note type="source characteristics">P_Drucke_VD18</mods:note>
|
||||
<mods:note type="bibliography">VD18 11750219</mods:note>
|
||||
<mods:language>
|
||||
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
|
||||
</mods:language>
|
||||
<mods:relatedItem type="series">
|
||||
<mods:titleInfo>
|
||||
<mods:title>VD18 digital</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:relatedItem>
|
||||
<mods:name type="personal">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">asn</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart type="family">Senckenberg</mods:namePart>
|
||||
<mods:namePart type="given">Eraßmus</mods:namePart>
|
||||
<mods:displayForm>Senckenberg, Eraßmus</mods:displayForm>
|
||||
</mods:name>
|
||||
<mods:name type="personal">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">asn</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart type="family">Agricola</mods:namePart>
|
||||
<mods:namePart type="given">Catharina</mods:namePart>
|
||||
<mods:displayForm>Agricola, Catharina</mods:displayForm>
|
||||
</mods:name>
|
||||
<mods:name type="corporate">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">fnd</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart>Deutsche Forschungsgemeinschaft</mods:namePart>
|
||||
</mods:name>
|
||||
<mods:physicalDescription>
|
||||
<mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
|
||||
<mods:extent>44 S.</mods:extent>
|
||||
<mods:extent>2°</mods:extent>
|
||||
</mods:physicalDescription>
|
||||
<mods:extension>
|
||||
<zvdd:zvddWrap xmlns:zvdd="http://zvdd.gdz-cms.de/">
|
||||
<zvdd:titleWord>Aktenmäßiger Verlauf famosen Prozesses Hofrat Erasmus Rats Frankfurt Justitiariis</zvdd:titleWord>
|
||||
</zvdd:zvddWrap>
|
||||
</mods:extension>
|
||||
<mods:accessCondition type="use and reproduction">CC BY-NC-SA 4.0 International</mods:accessCondition>
|
||||
<mods:typeOfResource>text</mods:typeOfResource>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:dmdSec ID="DMDLOG_0001">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:titleInfo>
|
||||
<mods:title>Ursachen so diesen Druck veranlasset</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:dmdSec ID="DMDLOG_0002">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:titleInfo>
|
||||
<mods:title>Endlich Abgetrungene Rechtliche Interims-Defensions-Schrifft ...</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:amdSec ID="AMD">
|
||||
<mets:rightsMD ID="RIGHTS">
|
||||
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVRIGHTS">
|
||||
<mets:xmlData>
|
||||
<dv:rights xmlns:dv="http://dfg-viewer.de/">
|
||||
<dv:owner>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz</dv:owner>
|
||||
<dv:ownerLogo>http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000</dv:ownerLogo>
|
||||
<dv:ownerSiteURL>http://www.staatsbibliothek-berlin.de</dv:ownerSiteURL>
|
||||
<dv:ownerContact>mailto:info@sbb.spk-berlin.de</dv:ownerContact>
|
||||
</dv:rights>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:rightsMD>
|
||||
<mets:digiprovMD ID="DIGIPROV">
|
||||
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVLINKS">
|
||||
<mets:xmlData>
|
||||
<dv:links xmlns:dv="http://dfg-viewer.de/">
|
||||
<dv:reference>http://www.stabikat.de/DB=1/PPN?PPN=718448162 </dv:reference>
|
||||
<dv:presentation>http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN718448162</dv:presentation>
|
||||
</dv:links>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:digiprovMD>
|
||||
</mets:amdSec>
|
||||
<mets:fileSec>
|
||||
<mets:fileGrp USE="OCR-D-GT-PAGE">
|
||||
<mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
<mets:fileGrp USE="OCR-D-OCR-CALAMARI">
|
||||
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
<mets:fileGrp USE="OCR-D-OCR-TESS">
|
||||
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
</mets:fileSec>
|
||||
<mets:structMap TYPE="LOGICAL">
|
||||
<mets:div ADMID="AMD" CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000" DMDID="DMDLOG_0000" ID="LOG_0000" LABEL="Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti" ORDERLABEL="Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti" TYPE="monograph">
|
||||
<mets:div ID="LOG_0001" TYPE="binding">
|
||||
<mets:div ID="LOG_0002" TYPE="cover_front"/>
|
||||
</mets:div>
|
||||
<mets:div ID="LOG_0003" TYPE="title_page"/>
|
||||
<mets:div DMDID="DMDLOG_0001" ID="LOG_0004" LABEL="Ursachen so diesen Druck veranlasset" TYPE="section"/>
|
||||
<mets:div DMDID="DMDLOG_0002" ID="LOG_0005" LABEL="Endlich Abgetrungene Rechtliche Interims-Defensions-Schrifft ..." TYPE="section"/>
|
||||
<mets:div ID="LOG_0006" TYPE="binding">
|
||||
<mets:div ID="LOG_0007" TYPE="cover_back"/>
|
||||
</mets:div>
|
||||
</mets:div>
|
||||
</mets:structMap>
|
||||
<mets:structMap TYPE="PHYSICAL">
|
||||
<mets:div CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000" DMDID="DMDPHYS_0000" ID="PHYS_0000" TYPE="physSequence">
|
||||
<mets:div TYPE="page" ID="00000024">
|
||||
<mets:fptr FILEID="OCR-D-GT-PAGE_00000024"/>
|
||||
<mets:fptr FILEID="OCR-D-OCR-CALAMARI_0001"/>
|
||||
<mets:fptr FILEID="OCR-D-OCR-TESS_0001"/>
|
||||
</mets:div>
|
||||
</mets:div>
|
||||
</mets:structMap>
|
||||
<mets:structLink>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0002" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0003" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0004" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0005" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0006" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0007" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0009" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0010" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0011" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0012" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0013" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0014" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0015" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0016" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0017" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0018" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0019" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0020" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0021" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0022" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0023" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0024" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0025" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0026" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0027" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0028" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0029" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0030" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0031" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0032" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0033" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0034" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0035" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0036" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0037" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0038" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0039" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0040" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0041" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0042" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0043" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0044" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0045" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0046" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0047" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0048" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0049" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0050" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0051" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0053" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0002" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0003" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0004" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0002"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0005" xlink:from="LOG_0003"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0006" xlink:from="LOG_0003"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0007" xlink:from="LOG_0004"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0004"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0009" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0010" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0011" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0012" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0013" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0014" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0015" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0016" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0017" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0018" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0019" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0020" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0021" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0022" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0023" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0024" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0025" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0026" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0027" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0028" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0029" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0030" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0031" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0032" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0033" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0034" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0035" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0036" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0037" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0038" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0039" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0040" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0041" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0042" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0043" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0044" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0045" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0046" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0047" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0048" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0049" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0050" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0051" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0007"/>
|
||||
</mets:structLink>
|
||||
</mets:mets>
|
File diff suppressed because it is too large
@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
  <Metadata>
    <Creator></Creator>
    <Created>2019-07-26T13:59:00</Created>
    <LastChange>2019-07-26T14:00:29</LastChange></Metadata>
  <Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
    <TextRegion id="tempReg357564684568544579089">
      <Coords points="0,0 1,0 1,1 0,1"/>
      <TextLine id="l0">
        <Coords points="228,237 228,295 2216,295 2216,237"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l1">
        <Coords points="228,298 228,348 2160,348 2160,298"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l2">
        <Coords points="225,348 225,410 2178,410 2178,348"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l3">
        <Coords points="218,413 218,463 2153,463 2153,413"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l4">
        <Coords points="225,466 225,522 2153,522 2153,466"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l5">
        <Coords points="216,524 216,581 2187,581 2187,524"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine>
      <TextLine id="l6">
        <Coords points="219,584 219,640 542,640 542,584"/>
        <TextEquiv>
          <Unicode></Unicode></TextEquiv></TextLine></TextRegion>
    <TextRegion id="r7" type="paragraph">
      <Coords points="204,212 204,651 2227,651 2227,212"/>
      <TextEquiv>
        <Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
@ -0,0 +1,139 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="437">
|
||||
<TextLine ID="line_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="103">
|
||||
<String ID="string_0" HPOS="209" VPOS="319" WIDTH="134" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="319" HPOS="343"/>
|
||||
<String ID="string_1" HPOS="356" VPOS="316" WIDTH="121" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="316" HPOS="477"/>
|
||||
<String ID="string_2" HPOS="491" VPOS="312" WIDTH="102" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="312" HPOS="593"/>
|
||||
<String ID="string_3" HPOS="608" VPOS="309" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="309" HPOS="654"/>
|
||||
<String ID="string_4" HPOS="668" VPOS="311" WIDTH="106" HEIGHT="37" WC="0.96" CONTENT="amet,"/><SP WIDTH="16" VPOS="311" HPOS="774"/>
|
||||
<String ID="string_5" HPOS="790" VPOS="307" WIDTH="201" HEIGHT="32" WC="0.88" CONTENT="consetetur"/><SP WIDTH="14" VPOS="307" HPOS="991"/>
|
||||
<String ID="string_6" HPOS="1005" VPOS="297" WIDTH="205" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="297" HPOS="1210"/>
|
||||
<String ID="string_7" HPOS="1225" VPOS="293" WIDTH="84" HEIGHT="42" WC="0.91" CONTENT="elitr,"/><SP WIDTH="16" VPOS="293" HPOS="1309"/>
|
||||
<String ID="string_8" HPOS="1325" VPOS="289" WIDTH="65" HEIGHT="38" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="289" HPOS="1390"/>
|
||||
<String ID="string_9" HPOS="1404" VPOS="286" WIDTH="97" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="286" HPOS="1501"/>
|
||||
<String ID="string_10" HPOS="1515" VPOS="291" WIDTH="100" HEIGHT="24" WC="0.69" CONTENT="nonu"/><SP WIDTH="32" VPOS="291" HPOS="1615"/>
|
||||
<String ID="string_11" HPOS="1647" VPOS="285" WIDTH="30" HEIGHT="36" WC="0.37" CONTENT="yy"/><SP WIDTH="17" VPOS="285" HPOS="1677"/>
|
||||
<String ID="string_12" HPOS="1694" VPOS="268" WIDTH="140" HEIGHT="42" WC="0.93" CONTENT="eirmod"/><SP WIDTH="11" VPOS="268" HPOS="1834"/>
|
||||
<String ID="string_13" HPOS="1845" VPOS="273" WIDTH="139" HEIGHT="37" WC="0.96" CONTENT="tempor"/><SP WIDTH="15" VPOS="273" HPOS="1984"/>
|
||||
<String ID="string_14" HPOS="1999" VPOS="258" WIDTH="164" HEIGHT="38" WC="0.95" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="211" VPOS="315" WIDTH="1904" HEIGHT="102">
|
||||
<String ID="string_15" HPOS="211" VPOS="380" WIDTH="39" HEIGHT="31" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="380" HPOS="250"/>
|
||||
<String ID="string_16" HPOS="263" VPOS="373" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="373" HPOS="386"/>
|
||||
<String ID="string_17" HPOS="402" VPOS="379" WIDTH="33" HEIGHT="27" WC="0.95" CONTENT="et"/><SP WIDTH="14" VPOS="379" HPOS="435"/>
|
||||
<String ID="string_18" HPOS="449" VPOS="370" WIDTH="123" HEIGHT="36" WC="0.95" CONTENT="dolore"/><SP WIDTH="15" VPOS="370" HPOS="572"/>
|
||||
<String ID="string_19" HPOS="587" VPOS="374" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="374" HPOS="720"/>
|
||||
<String ID="string_20" HPOS="734" VPOS="363" WIDTH="183" HEIGHT="43" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="363" HPOS="917"/>
|
||||
<String ID="string_21" HPOS="931" VPOS="360" WIDTH="82" HEIGHT="36" WC="0.95" CONTENT="erat,"/><SP WIDTH="17" VPOS="360" HPOS="1013"/>
|
||||
<String ID="string_22" HPOS="1030" VPOS="354" WIDTH="65" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="13" VPOS="354" HPOS="1095"/>
|
||||
<String ID="string_23" HPOS="1108" VPOS="352" WIDTH="96" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="352" HPOS="1204"/>
|
||||
<String ID="string_24" HPOS="1217" VPOS="350" WIDTH="181" HEIGHT="44" WC="0.95" CONTENT="voluptua."/><SP WIDTH="13" VPOS="350" HPOS="1398"/>
|
||||
<String ID="string_25" HPOS="1411" VPOS="345" WIDTH="49" HEIGHT="34" WC="0.95" CONTENT="At"/><SP WIDTH="11" VPOS="345" HPOS="1460"/>
|
||||
<String ID="string_26" HPOS="1471" VPOS="348" WIDTH="88" HEIGHT="26" WC="0.93" CONTENT="Vero"/><SP WIDTH="16" VPOS="348" HPOS="1559"/>
|
||||
<String ID="string_27" HPOS="1575" VPOS="345" WIDTH="65" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="345" HPOS="1640"/>
|
||||
<String ID="string_28" HPOS="1655" VPOS="339" WIDTH="36" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="339" HPOS="1691"/>
|
||||
<String ID="string_29" HPOS="1705" VPOS="336" WIDTH="168" HEIGHT="31" WC="0.87" CONTENT="accusam"/><SP WIDTH="15" VPOS="336" HPOS="1873"/>
|
||||
<String ID="string_30" HPOS="1888" VPOS="329" WIDTH="34" HEIGHT="28" WC="0.96" CONTENT="et"/><SP WIDTH="11" VPOS="329" HPOS="1922"/>
|
||||
<String ID="string_31" HPOS="1933" VPOS="322" WIDTH="96" HEIGHT="44" WC="0.96" CONTENT="justo"/><SP WIDTH="15" VPOS="322" HPOS="2029"/>
|
||||
<String ID="string_32" HPOS="2044" VPOS="315" WIDTH="71" HEIGHT="63" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="214" VPOS="375" WIDTH="1919" HEIGHT="93">
|
||||
<String ID="string_33" HPOS="214" VPOS="431" WIDTH="144" HEIGHT="37" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="431" HPOS="358"/>
|
||||
<String ID="string_34" HPOS="374" VPOS="433" WIDTH="34" HEIGHT="31" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="433" HPOS="408"/>
|
||||
<String ID="string_35" HPOS="422" VPOS="437" WIDTH="42" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="437" HPOS="464"/>
|
||||
<String ID="string_36" HPOS="477" VPOS="426" WIDTH="136" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="426" HPOS="613"/>
|
||||
<String ID="string_37" HPOS="631" VPOS="424" WIDTH="75" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="424" HPOS="706"/>
|
||||
<String ID="string_38" HPOS="720" VPOS="419" WIDTH="85" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="419" HPOS="805"/>
|
||||
<String ID="string_39" HPOS="818" VPOS="415" WIDTH="90" HEIGHT="35" WC="0.97" CONTENT="kasd"/><SP WIDTH="14" VPOS="415" HPOS="908"/>
|
||||
<String ID="string_40" HPOS="922" VPOS="412" WIDTH="206" HEIGHT="48" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="412" HPOS="1128"/>
|
||||
<String ID="string_41" HPOS="1144" VPOS="417" WIDTH="47" HEIGHT="26" WC="0.97" CONTENT="no"/><SP WIDTH="16" VPOS="417" HPOS="1191"/>
|
||||
<String ID="string_42" HPOS="1207" VPOS="415" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="415" HPOS="1268"/>
|
||||
<String ID="string_43" HPOS="1281" VPOS="405" WIDTH="169" HEIGHT="36" WC="0.91" CONTENT="iakimata"/><SP WIDTH="14" VPOS="405" HPOS="1450"/>
|
||||
<String ID="string_44" HPOS="1464" VPOS="400" WIDTH="144" HEIGHT="33" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="400" HPOS="1608"/>
|
||||
<String ID="string_45" HPOS="1624" VPOS="397" WIDTH="54" HEIGHT="29" WC="0.97" CONTENT="est"/><SP WIDTH="13" VPOS="397" HPOS="1678"/>
|
||||
<String ID="string_46" HPOS="1691" VPOS="390" WIDTH="132" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="390" HPOS="1823"/>
|
||||
<String ID="string_47" HPOS="1837" VPOS="383" WIDTH="120" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="383" HPOS="1957"/>
|
||||
<String ID="string_48" HPOS="1971" VPOS="375" WIDTH="102" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="375" HPOS="2073"/>
|
||||
<String ID="string_49" HPOS="2088" VPOS="377" WIDTH="45" HEIGHT="31" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="215" VPOS="435" WIDTH="1896" HEIGHT="93">
|
||||
<String ID="string_50" HPOS="215" VPOS="494" WIDTH="106" HEIGHT="32" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="494" HPOS="321"/>
|
||||
<String ID="string_51" HPOS="337" VPOS="488" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="488" HPOS="467"/>
|
||||
<String ID="string_52" HPOS="481" VPOS="484" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="484" HPOS="602"/>
|
||||
<String ID="string_53" HPOS="616" VPOS="479" WIDTH="104" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="479" HPOS="720"/>
|
||||
<String ID="string_54" HPOS="734" VPOS="476" WIDTH="46" HEIGHT="36" WC="0.93" CONTENT="sit"/><SP WIDTH="14" VPOS="476" HPOS="780"/>
|
||||
<String ID="string_55" HPOS="794" VPOS="477" WIDTH="104" HEIGHT="36" WC="0.75" CONTENT="armet,"/><SP WIDTH="17" VPOS="477" HPOS="898"/>
|
||||
<String ID="string_56" HPOS="915" VPOS="474" WIDTH="200" HEIGHT="30" WC="0.97" CONTENT="consetetur"/><SP WIDTH="14" VPOS="474" HPOS="1115"/>
|
||||
<String ID="string_57" HPOS="1129" VPOS="463" WIDTH="205" HEIGHT="45" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="463" HPOS="1334"/>
|
||||
<String ID="string_58" HPOS="1349" VPOS="457" WIDTH="86" HEIGHT="41" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="457" HPOS="1435"/>
|
||||
<String ID="string_59" HPOS="1451" VPOS="452" WIDTH="65" HEIGHT="39" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="452" HPOS="1516"/>
|
||||
<String ID="string_60" HPOS="1530" VPOS="449" WIDTH="99" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="449" HPOS="1629"/>
|
||||
<String ID="string_61" HPOS="1643" VPOS="451" WIDTH="162" HEIGHT="36" WC="0.59" CONTENT="nonurny"/><SP WIDTH="16" VPOS="451" HPOS="1805"/>
|
||||
<String ID="string_62" HPOS="1821" VPOS="435" WIDTH="138" HEIGHT="39" WC="0.96" CONTENT="eirmod"/><SP WIDTH="12" VPOS="435" HPOS="1959"/>
|
||||
<String ID="string_63" HPOS="1971" VPOS="440" WIDTH="140" HEIGHT="37" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="216" VPOS="483" WIDTH="1888" HEIGHT="97">
|
||||
<String ID="string_64" HPOS="216" VPOS="543" WIDTH="165" HEIGHT="37" WC="0.97" CONTENT="invidunt"/><SP WIDTH="13" VPOS="543" HPOS="381"/>
|
||||
<String ID="string_65" HPOS="394" VPOS="546" WIDTH="39" HEIGHT="30" WC="0.97" CONTENT="ut"/><SP WIDTH="12" VPOS="546" HPOS="433"/>
|
||||
<String ID="string_66" HPOS="445" VPOS="539" WIDTH="122" HEIGHT="36" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="539" HPOS="567"/>
|
||||
<String ID="string_67" HPOS="583" VPOS="543" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="543" HPOS="618"/>
|
||||
<String ID="string_68" HPOS="632" VPOS="536" WIDTH="125" HEIGHT="34" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="536" HPOS="757"/>
|
||||
<String ID="string_69" HPOS="771" VPOS="539" WIDTH="131" HEIGHT="37" WC="0.46" CONTENT="magna"/><SP WIDTH="14" VPOS="539" HPOS="902"/>
|
||||
<String ID="string_70" HPOS="916" VPOS="526" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="526" HPOS="1098"/>
|
||||
<String ID="string_71" HPOS="1112" VPOS="527" WIDTH="82" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="527" HPOS="1194"/>
|
||||
<String ID="string_72" HPOS="1211" VPOS="519" WIDTH="63" HEIGHT="36" WC="0.97" CONTENT="sed"/><SP WIDTH="14" VPOS="519" HPOS="1274"/>
|
||||
<String ID="string_73" HPOS="1288" VPOS="517" WIDTH="97" HEIGHT="37" WC="0.96" CONTENT="diam"/><SP WIDTH="11" VPOS="517" HPOS="1385"/>
|
||||
<String ID="string_74" HPOS="1396" VPOS="513" WIDTH="185" HEIGHT="44" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="513" HPOS="1581"/>
|
||||
<String ID="string_75" HPOS="1595" VPOS="505" WIDTH="50" HEIGHT="35" WC="0.96" CONTENT="At"/><SP WIDTH="11" VPOS="505" HPOS="1645"/>
|
||||
<String ID="string_76" HPOS="1656" VPOS="511" WIDTH="89" HEIGHT="27" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="511" HPOS="1745"/>
|
||||
<String ID="string_77" HPOS="1761" VPOS="508" WIDTH="63" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="508" HPOS="1824"/>
|
||||
<String ID="string_78" HPOS="1839" VPOS="501" WIDTH="35" HEIGHT="30" WC="0.97" CONTENT="et"/><SP WIDTH="13" VPOS="501" HPOS="1874"/>
|
||||
<String ID="string_79" HPOS="1887" VPOS="499" WIDTH="168" HEIGHT="53" WC="0.80" CONTENT="accusam"/><SP WIDTH="-3" VPOS="499" HPOS="2055"/>
|
||||
<String ID="string_80" HPOS="2052" VPOS="483" WIDTH="52" HEIGHT="55" WC="0.97" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="215" VPOS="552" WIDTH="1941" HEIGHT="97">
|
||||
<String ID="string_81" HPOS="215" VPOS="604" WIDTH="97" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="604" HPOS="312"/>
|
||||
<String ID="string_82" HPOS="328" VPOS="600" WIDTH="71" HEIGHT="35" WC="0.97" CONTENT="duo"/><SP WIDTH="16" VPOS="600" HPOS="399"/>
|
||||
<String ID="string_83" HPOS="415" VPOS="597" WIDTH="143" HEIGHT="36" WC="0.93" CONTENT="dolores"/><SP WIDTH="16" VPOS="597" HPOS="558"/>
|
||||
<String ID="string_84" HPOS="574" VPOS="600" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="600" HPOS="608"/>
|
||||
<String ID="string_85" HPOS="622" VPOS="602" WIDTH="43" HEIGHT="26" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="602" HPOS="665"/>
|
||||
<String ID="string_86" HPOS="678" VPOS="590" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="590" HPOS="814"/>
|
||||
<String ID="string_87" HPOS="833" VPOS="588" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="588" HPOS="907"/>
|
||||
<String ID="string_88" HPOS="921" VPOS="584" WIDTH="83" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="12" VPOS="584" HPOS="1004"/>
|
||||
<String ID="string_89" HPOS="1016" VPOS="580" WIDTH="90" HEIGHT="36" WC="0.97" CONTENT="kasd"/><SP WIDTH="15" VPOS="580" HPOS="1106"/>
|
||||
<String ID="string_90" HPOS="1121" VPOS="578" WIDTH="205" HEIGHT="47" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="578" HPOS="1326"/>
|
||||
<String ID="string_91" HPOS="1342" VPOS="582" WIDTH="47" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="582" HPOS="1389"/>
|
||||
<String ID="string_92" HPOS="1405" VPOS="581" WIDTH="62" HEIGHT="26" WC="0.97" CONTENT="sea"/><SP WIDTH="13" VPOS="581" HPOS="1467"/>
|
||||
<String ID="string_93" HPOS="1480" VPOS="566" WIDTH="172" HEIGHT="38" WC="0.96" CONTENT="takimata"/><SP WIDTH="14" VPOS="566" HPOS="1652"/>
|
||||
<String ID="string_94" HPOS="1666" VPOS="563" WIDTH="145" HEIGHT="33" WC="0.97" CONTENT="sanctus"/><SP WIDTH="15" VPOS="563" HPOS="1811"/>
|
||||
<String ID="string_95" HPOS="1826" VPOS="558" WIDTH="54" HEIGHT="30" WC="0.97" CONTENT="est"/><SP WIDTH="12" VPOS="558" HPOS="1880"/>
|
||||
<String ID="string_96" HPOS="1892" VPOS="552" WIDTH="130" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="552" HPOS="2022"/>
|
||||
<String ID="string_97" HPOS="2037" VPOS="553" WIDTH="119" HEIGHT="37" WC="0.51" CONTENT="Ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="219" VPOS="657" WIDTH="282" HEIGHT="38">
|
||||
<String ID="string_98" HPOS="219" VPOS="658" WIDTH="104" HEIGHT="37" WC="0.97" CONTENT="dolor"/><SP WIDTH="15" VPOS="658" HPOS="323"/>
|
||||
<String ID="string_99" HPOS="338" VPOS="657" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="14" VPOS="657" HPOS="383"/>
|
||||
<String ID="string_100" HPOS="397" VPOS="660" WIDTH="104" HEIGHT="35" WC="0.94" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,47 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator></Creator>
|
||||
<Created>2019-07-26T13:59:00</Created>
|
||||
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||
<TextRegion id="tempReg357564684568544579089">
|
||||
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||
<TextLine id="l0">
|
||||
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l1">
|
||||
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l2">
|
||||
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||
<TextRegion id="r7" type="paragraph">
|
||||
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||
<TextEquiv>
|
||||
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
@ -0,0 +1,138 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="234" VPOS="244" WIDTH="1966" HEIGHT="387">
|
||||
<TextLine ID="line_0" HPOS="237" VPOS="244" WIDTH="1963" HEIGHT="48">
|
||||
<String ID="string_0" HPOS="237" VPOS="248" WIDTH="133" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="248" HPOS="370"/>
|
||||
<String ID="string_1" HPOS="384" VPOS="247" WIDTH="120" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="247" HPOS="504"/>
|
||||
<String ID="string_2" HPOS="519" VPOS="246" WIDTH="103" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="246" HPOS="622"/>
|
||||
<String ID="string_3" HPOS="636" VPOS="247" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="247" HPOS="682"/>
|
||||
<String ID="string_4" HPOS="696" VPOS="252" WIDTH="105" HEIGHT="36" WC="0.97" CONTENT="amet,"/><SP WIDTH="17" VPOS="252" HPOS="801"/>
|
||||
<String ID="string_5" HPOS="818" VPOS="251" WIDTH="202" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="14" VPOS="251" HPOS="1020"/>
|
||||
<String ID="string_6" HPOS="1034" VPOS="244" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="244" HPOS="1241"/>
|
||||
<String ID="string_7" HPOS="1256" VPOS="244" WIDTH="86" HEIGHT="43" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="244" HPOS="1342"/>
|
||||
<String ID="string_8" HPOS="1358" VPOS="244" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="244" HPOS="1423"/>
|
||||
<String ID="string_9" HPOS="1438" VPOS="244" WIDTH="99" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="244" HPOS="1537"/>
|
||||
<String ID="string_10" HPOS="1551" VPOS="255" WIDTH="164" HEIGHT="35" WC="0.97" CONTENT="nonumy"/><SP WIDTH="15" VPOS="255" HPOS="1715"/>
|
||||
<String ID="string_11" HPOS="1730" VPOS="244" WIDTH="139" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="244" HPOS="1869"/>
|
||||
<String ID="string_12" HPOS="1882" VPOS="250" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/><SP WIDTH="13" VPOS="250" HPOS="2022"/>
|
||||
<String ID="string_13" HPOS="2035" VPOS="244" WIDTH="165" HEIGHT="35" WC="0.96" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="237" VPOS="301" WIDTH="1913" HEIGHT="49">
|
||||
<String ID="string_14" HPOS="237" VPOS="310" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="310" HPOS="276"/>
|
||||
<String ID="string_15" HPOS="289" VPOS="304" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="304" HPOS="412"/>
|
||||
<String ID="string_16" HPOS="428" VPOS="310" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="14" VPOS="310" HPOS="462"/>
|
||||
<String ID="string_17" HPOS="476" VPOS="304" WIDTH="123" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="15" VPOS="304" HPOS="599"/>
|
||||
<String ID="string_18" HPOS="614" VPOS="313" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="313" HPOS="747"/>
|
||||
<String ID="string_19" HPOS="761" VPOS="302" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="302" HPOS="944"/>
|
||||
<String ID="string_20" HPOS="959" VPOS="308" WIDTH="81" HEIGHT="36" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="308" HPOS="1040"/>
|
||||
<String ID="string_21" HPOS="1057" VPOS="301" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="301" HPOS="1122"/>
|
||||
<String ID="string_22" HPOS="1136" VPOS="301" WIDTH="97" HEIGHT="36" WC="0.95" CONTENT="diam"/><SP WIDTH="13" VPOS="301" HPOS="1233"/>
|
||||
<String ID="string_23" HPOS="1246" VPOS="301" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="13" VPOS="301" HPOS="1429"/>
|
||||
<String ID="string_24" HPOS="1442" VPOS="303" WIDTH="51" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="303" HPOS="1493"/>
|
||||
<String ID="string_25" HPOS="1505" VPOS="312" WIDTH="88" HEIGHT="25" WC="0.96" CONTENT="vero"/><SP WIDTH="17" VPOS="312" HPOS="1593"/>
|
||||
<String ID="string_26" HPOS="1610" VPOS="312" WIDTH="64" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="16" VPOS="312" HPOS="1674"/>
|
||||
<String ID="string_27" HPOS="1690" VPOS="308" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="308" HPOS="1725"/>
|
||||
<String ID="string_28" HPOS="1739" VPOS="312" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="15" VPOS="312" HPOS="1907"/>
|
||||
<String ID="string_29" HPOS="1922" VPOS="308" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="11" VPOS="308" HPOS="1956"/>
|
||||
<String ID="string_30" HPOS="1967" VPOS="302" WIDTH="96" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="302" HPOS="2063"/>
|
||||
<String ID="string_31" HPOS="2079" VPOS="301" WIDTH="71" HEIGHT="36" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="238" VPOS="359" WIDTH="1928" HEIGHT="46">
|
||||
<String ID="string_32" HPOS="238" VPOS="361" WIDTH="144" HEIGHT="36" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="361" HPOS="382"/>
|
||||
<String ID="string_33" HPOS="398" VPOS="368" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="368" HPOS="432"/>
|
||||
<String ID="string_34" HPOS="447" VPOS="372" WIDTH="41" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="14" VPOS="372" HPOS="488"/>
|
||||
<String ID="string_35" HPOS="502" VPOS="361" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="361" HPOS="638"/>
|
||||
<String ID="string_36" HPOS="657" VPOS="363" WIDTH="75" HEIGHT="33" WC="0.97" CONTENT="Stet"/><SP WIDTH="14" VPOS="363" HPOS="732"/>
|
||||
<String ID="string_37" HPOS="746" VPOS="360" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="360" HPOS="830"/>
|
||||
<String ID="string_38" HPOS="843" VPOS="359" WIDTH="91" HEIGHT="36" WC="0.96" CONTENT="kasd"/><SP WIDTH="13" VPOS="359" HPOS="934"/>
|
||||
<String ID="string_39" HPOS="947" VPOS="359" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="359" HPOS="1155"/>
|
||||
<String ID="string_40" HPOS="1171" VPOS="370" WIDTH="47" HEIGHT="24" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="370" HPOS="1218"/>
|
||||
<String ID="string_41" HPOS="1234" VPOS="370" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="370" HPOS="1295"/>
|
||||
<String ID="string_42" HPOS="1308" VPOS="359" WIDTH="172" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="359" HPOS="1480"/>
|
||||
<String ID="string_43" HPOS="1495" VPOS="365" WIDTH="145" HEIGHT="30" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="365" HPOS="1640"/>
|
||||
<String ID="string_44" HPOS="1656" VPOS="365" WIDTH="55" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="13" VPOS="365" HPOS="1711"/>
|
||||
<String ID="string_45" HPOS="1724" VPOS="361" WIDTH="131" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="361" HPOS="1855"/>
|
||||
<String ID="string_46" HPOS="1870" VPOS="360" WIDTH="119" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="360" HPOS="1989"/>
|
||||
<String ID="string_47" HPOS="2004" VPOS="359" WIDTH="103" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="359" HPOS="2107"/>
|
||||
<String ID="string_48" HPOS="2121" VPOS="360" WIDTH="45" HEIGHT="34" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="238" VPOS="416" WIDTH="1905" HEIGHT="48">
|
||||
<String ID="string_49" HPOS="238" VPOS="425" WIDTH="105" HEIGHT="29" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="425" HPOS="343"/>
|
||||
<String ID="string_50" HPOS="359" VPOS="421" WIDTH="132" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="421" HPOS="491"/>
|
||||
<String ID="string_51" HPOS="504" VPOS="420" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="420" HPOS="625"/>
|
||||
<String ID="string_52" HPOS="640" VPOS="418" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="418" HPOS="744"/>
|
||||
<String ID="string_53" HPOS="758" VPOS="419" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="15" VPOS="419" HPOS="803"/>
|
||||
<String ID="string_54" HPOS="818" VPOS="424" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="amet,"/><SP WIDTH="17" VPOS="424" HPOS="922"/>
|
||||
<String ID="string_55" HPOS="939" VPOS="422" WIDTH="201" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="15" VPOS="422" HPOS="1140"/>
|
||||
<String ID="string_56" HPOS="1155" VPOS="416" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="416" HPOS="1362"/>
|
||||
<String ID="string_57" HPOS="1377" VPOS="417" WIDTH="86" HEIGHT="42" WC="0.96" CONTENT="elitr,"/><SP WIDTH="17" VPOS="417" HPOS="1463"/>
|
||||
<String ID="string_58" HPOS="1480" VPOS="416" WIDTH="66" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="416" HPOS="1546"/>
|
||||
<String ID="string_59" HPOS="1561" VPOS="416" WIDTH="98" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="416" HPOS="1659"/>
|
||||
<String ID="string_60" HPOS="1673" VPOS="427" WIDTH="163" HEIGHT="35" WC="0.96" CONTENT="nonumy"/><SP WIDTH="16" VPOS="427" HPOS="1836"/>
|
||||
<String ID="string_61" HPOS="1852" VPOS="416" WIDTH="138" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="416" HPOS="1990"/>
|
||||
<String ID="string_62" HPOS="2003" VPOS="422" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="236" VPOS="474" WIDTH="1897" HEIGHT="47">
|
||||
<String ID="string_63" HPOS="236" VPOS="476" WIDTH="166" HEIGHT="35" WC="0.96" CONTENT="invidunt"/><SP WIDTH="14" VPOS="476" HPOS="402"/>
|
||||
<String ID="string_64" HPOS="416" VPOS="482" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="12" VPOS="482" HPOS="455"/>
|
||||
<String ID="string_65" HPOS="467" VPOS="476" WIDTH="122" HEIGHT="35" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="476" HPOS="589"/>
|
||||
<String ID="string_66" HPOS="605" VPOS="482" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="482" HPOS="639"/>
|
||||
<String ID="string_67" HPOS="654" VPOS="475" WIDTH="125" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="475" HPOS="779"/>
|
||||
<String ID="string_68" HPOS="793" VPOS="484" WIDTH="131" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="15" VPOS="484" HPOS="924"/>
|
||||
<String ID="string_69" HPOS="939" VPOS="474" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="474" HPOS="1121"/>
|
||||
<String ID="string_70" HPOS="1136" VPOS="480" WIDTH="81" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="18" VPOS="480" HPOS="1217"/>
|
||||
<String ID="string_71" HPOS="1235" VPOS="474" WIDTH="63" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="474" HPOS="1298"/>
|
||||
<String ID="string_72" HPOS="1313" VPOS="474" WIDTH="97" HEIGHT="35" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="474" HPOS="1410"/>
|
||||
<String ID="string_73" HPOS="1423" VPOS="474" WIDTH="186" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="474" HPOS="1609"/>
|
||||
<String ID="string_74" HPOS="1623" VPOS="475" WIDTH="50" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="475" HPOS="1673"/>
|
||||
<String ID="string_75" HPOS="1685" VPOS="485" WIDTH="89" HEIGHT="24" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="485" HPOS="1774"/>
|
||||
<String ID="string_76" HPOS="1790" VPOS="484" WIDTH="63" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="484" HPOS="1853"/>
|
||||
<String ID="string_77" HPOS="1868" VPOS="480" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="480" HPOS="1902"/>
|
||||
<String ID="string_78" HPOS="1916" VPOS="484" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="16" VPOS="484" HPOS="2084"/>
|
||||
<String ID="string_79" HPOS="2100" VPOS="480" WIDTH="33" HEIGHT="29" WC="0.96" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="234" VPOS="531" WIDTH="1950" HEIGHT="47">
|
||||
<String ID="string_80" HPOS="234" VPOS="534" WIDTH="98" HEIGHT="44" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="534" HPOS="332"/>
|
||||
<String ID="string_81" HPOS="348" VPOS="533" WIDTH="71" HEIGHT="35" WC="0.96" CONTENT="duo"/><SP WIDTH="16" VPOS="533" HPOS="419"/>
|
||||
<String ID="string_82" HPOS="435" VPOS="533" WIDTH="143" HEIGHT="35" WC="0.96" CONTENT="dolores"/><SP WIDTH="15" VPOS="533" HPOS="578"/>
|
||||
<String ID="string_83" HPOS="593" VPOS="539" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="539" HPOS="628"/>
|
||||
<String ID="string_84" HPOS="642" VPOS="543" WIDTH="42" HEIGHT="25" WC="0.97" CONTENT="ea"/><SP WIDTH="14" VPOS="543" HPOS="684"/>
|
||||
<String ID="string_85" HPOS="698" VPOS="533" WIDTH="137" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="533" HPOS="835"/>
|
||||
<String ID="string_86" HPOS="853" VPOS="534" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="534" HPOS="927"/>
|
||||
<String ID="string_87" HPOS="941" VPOS="531" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="531" HPOS="1025"/>
|
||||
<String ID="string_88" HPOS="1038" VPOS="531" WIDTH="89" HEIGHT="35" WC="0.96" CONTENT="kasd"/><SP WIDTH="15" VPOS="531" HPOS="1127"/>
|
||||
<String ID="string_89" HPOS="1142" VPOS="531" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="531" HPOS="1350"/>
|
||||
<String ID="string_90" HPOS="1366" VPOS="542" WIDTH="48" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="542" HPOS="1414"/>
|
||||
<String ID="string_91" HPOS="1430" VPOS="542" WIDTH="62" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="542" HPOS="1492"/>
|
||||
<String ID="string_92" HPOS="1505" VPOS="531" WIDTH="173" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="531" HPOS="1678"/>
|
||||
<String ID="string_93" HPOS="1693" VPOS="538" WIDTH="144" HEIGHT="29" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="538" HPOS="1837"/>
|
||||
<String ID="string_94" HPOS="1853" VPOS="537" WIDTH="53" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="14" VPOS="537" HPOS="1906"/>
|
||||
<String ID="string_95" HPOS="1920" VPOS="533" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="533" HPOS="2050"/>
|
||||
<String ID="string_96" HPOS="2064" VPOS="532" WIDTH="120" HEIGHT="44" WC="0.95" CONTENT="ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="237" VPOS="590" WIDTH="282" HEIGHT="41">
|
||||
<String ID="string_97" HPOS="237" VPOS="590" WIDTH="104" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="590" HPOS="341"/>
|
||||
<String ID="string_98" HPOS="356" VPOS="591" WIDTH="45" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="591" HPOS="401"/>
|
||||
<String ID="string_99" HPOS="415" VPOS="597" WIDTH="104" HEIGHT="34" WC="0.96" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1 @@
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
@ -0,0 +1,108 @@
|
||||
from .util import unzip
|
||||
from .. import align, seq_align, distance
|
||||
|
||||
|
||||
def test_left_empty():
|
||||
result = list(align('', 'foo'))
|
||||
expected = [(None, 'f'), (None, 'o'), (None, 'o')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_right_empty():
|
||||
result = list(align('foo', ''))
|
||||
expected = [('f', None), ('o', None), ('o', None)]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_left_longer():
|
||||
result = list(align('food', 'foo'))
|
||||
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), ('d', None)]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_right_longer():
|
||||
result = list(align('foo', 'food'))
|
||||
expected = [('f', 'f'), ('o', 'o'), ('o', 'o'), (None, 'd')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_some_diff():
|
||||
result = list(align('abcde', 'aaadef'))
|
||||
left, right = unzip(result)
|
||||
assert list(left) == ['a', 'b', 'c', 'd', 'e', None]
|
||||
assert list(right) == ['a', 'a', 'a', 'd', 'e', 'f']
|
||||
|
||||
|
||||
def test_longer():
|
||||
s1 = 'Dies ist eine Tst!'
|
||||
s2 = 'Dies ist ein Test.'
|
||||
|
||||
result = list(align(s1, s2)) # ; diffprint(*unzip(result))
|
||||
expected = [('D', 'D'), ('i', 'i'), ('e', 'e'), ('s', 's'), (' ', ' '),
|
||||
('i', 'i'), ('s', 's'), ('t', 't'), (' ', ' '),
|
||||
('e', 'e'), ('i', 'i'), ('n', 'n'), ('e', None), (' ', ' '),
|
||||
('T', 'T'), (None, 'e'), ('s', 's'), ('t', 't'), ('!', '.')]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_completely_different():
|
||||
assert len(list(align('abcde', 'fghij'))) == 5
|
||||
|
||||
|
||||
def test_with_some_fake_ocr_errors():
|
||||
result = list(align('Über die vielen Sorgen wegen desselben vergaß',
|
||||
'SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab'))
|
||||
left, right = unzip(result)
|
||||
|
||||
# Beginning
|
||||
assert list(left[:18]) == [None]*18
|
||||
assert list(right[:18]) == list('SomeJunk MoreJunk ')
|
||||
|
||||
# End
|
||||
assert list(left[-1:]) == ['ß']
|
||||
assert list(right[-1:]) == ['b']
|
||||
|
||||
|
||||
def test_lines():
|
||||
"""Test comparing list of lines.
|
||||
|
||||
This mainly serves as documentation for comparing lists of lines.
|
||||
"""
|
||||
result = list(seq_align(
|
||||
['This is a line.', 'This is another', 'And the last line'],
|
||||
['This is a line.', 'This is another', 'J u n k', 'And the last line']
|
||||
))
|
||||
left, right = unzip(result)
|
||||
assert list(left) == ['This is a line.', 'This is another', None, 'And the last line']
|
||||
assert list(right) == ['This is a line.', 'This is another', 'J u n k', 'And the last line']
|
||||
|
||||
|
||||
def test_lines_similar():
|
||||
"""Test comparing list of lines while using a "weaker equivalence".
|
||||
|
||||
This mainly serves as documentation.
|
||||
"""
|
||||
|
||||
class SimilarString:
|
||||
def __init__(self, string):
|
||||
self._string = string
|
||||
|
||||
def __eq__(self, other):
|
||||
return distance(self._string, other._string) < 2 # XXX NOT the final version
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self.__eq__(other)
|
||||
|
||||
def __repr__(self):
|
||||
return 'SimilarString(\'%s\')' % self._string
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._string)
|
||||
|
||||
result = list(seq_align(
|
||||
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
|
||||
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
|
||||
))
|
||||
left, right = unzip(result)
|
||||
assert list(left) == [SimilarString('This is a line.'), SimilarString('This is another'), None, SimilarString('And the last line')]
|
||||
assert list(right) == [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
|
@ -0,0 +1,37 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import math
|
||||
import unicodedata
|
||||
|
||||
from .. import character_error_rate
|
||||
|
||||
|
||||
def test_character_error_rate():
|
||||
assert character_error_rate('a', 'a') == 0
|
||||
assert character_error_rate('a', 'b') == 1/1
|
||||
assert character_error_rate('Foo', 'Bar') == 3/3
|
||||
|
||||
assert character_error_rate('Foo', '') == 3/3
|
||||
|
||||
assert character_error_rate('', '') == 0
|
||||
assert math.isinf(character_error_rate('', 'Foo'))
|
||||
|
||||
assert character_error_rate('Foo', 'Food') == 1/3
|
||||
assert character_error_rate('Fnord', 'Food') == 2/5
|
||||
assert character_error_rate('Müll', 'Mull') == 1/4
|
||||
assert character_error_rate('Abstand', 'Sand') == 4/7
|
||||
|
||||
|
||||
def test_character_error_rate_hard():
|
||||
s1 = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
|
||||
s2 = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!') # Different, decomposed!
|
||||
assert character_error_rate(s1, s2) == 1/19
|
||||
|
||||
s1 = 'Schlyñ'
|
||||
assert len(s1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
|
||||
s2 = 'Schlym̃'
|
||||
assert len(s2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
|
||||
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
|
||||
assert character_error_rate(s2, s1) == 1/6
|
||||
assert character_error_rate(s1, s2) == 1/6
|
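Taken together, the assertions above pin down how the character error rate is normalized. As a summary inferred from these tests (not quoted from the implementation), with d the edit distance between ground truth gt and OCR text, and |gt| the ground-truth length counted in grapheme clusters:
~~~
\mathrm{CER}(\mathit{gt}, \mathit{ocr}) = \frac{d(\mathit{gt}, \mathit{ocr})}{|\mathit{gt}|},
\qquad
\mathrm{CER} = \infty \ \text{if}\ |\mathit{gt}| = 0 \ \text{and}\ d > 0 .
~~~
This matches, for example, character_error_rate('Abstand', 'Sand') == 4/7 above and the infinite rate for an empty ground truth.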
@ -0,0 +1,40 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
|
||||
from .. import levenshtein, distance
|
||||
|
||||
|
||||
def test_levenshtein():
|
||||
assert levenshtein('a', 'a') == 0
|
||||
assert levenshtein('a', 'b') == 1
|
||||
assert levenshtein('Foo', 'Bar') == 3
|
||||
|
||||
assert levenshtein('', '') == 0
|
||||
assert levenshtein('Foo', '') == 3
|
||||
assert levenshtein('', 'Foo') == 3
|
||||
|
||||
assert levenshtein('Foo', 'Food') == 1
|
||||
assert levenshtein('Fnord', 'Food') == 2
|
||||
assert levenshtein('Müll', 'Mull') == 1
|
||||
assert levenshtein('Abstand', 'Sand') == 4
|
||||
|
||||
|
||||
def test_levenshtein_other_sequences():
|
||||
assert levenshtein(['a', 'ab'], ['a', 'ab', 'c']) == 1
|
||||
assert levenshtein(['a', 'ab'], ['a', 'c']) == 1
|
||||
|
||||
|
||||
def test_distance():
|
||||
assert distance('Fnord', 'Food') == 2
|
||||
assert distance('Müll', 'Mull') == 1
|
||||
|
||||
word1 = unicodedata.normalize('NFC', 'Schlyñ')
|
||||
word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!
|
||||
assert distance(word1, word2) == 0
|
||||
|
||||
word1 = 'Schlyñ'
|
||||
assert len(word1) == 6 # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
|
||||
word2 = 'Schlym̃'
|
||||
assert len(word2) == 7 # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
assert distance(word1, word2) == 1
|
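The last two checks only pass if the distance is taken over NFC-normalized grapheme clusters rather than raw code points. Below is a minimal sketch of one way to obtain that behavior, using only unicodedata and the uniseg package already listed in requirements.txt; it illustrates the idea and is not necessarily how distance() is implemented in this module:
~~~
import unicodedata

from uniseg.graphemecluster import grapheme_clusters


def grapheme_cluster_distance(s1, s2, seq_distance):
    """Edit distance over NFC-normalized grapheme clusters.

    seq_distance is any sequence-level Levenshtein function, e.g. the
    levenshtein() imported in the tests above.
    """
    gc1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
    gc2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
    return seq_distance(gc1, gc2)
~~~
With this reading, 'Schlyñ' and 'Schlym̃' differ in exactly one cluster, which is why the test expects a distance of 1 even though the strings have 6 and 7 code points.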
@ -0,0 +1,48 @@
|
||||
import unicodedata
|
||||
|
||||
from .. import seq_editops, editops
|
||||
|
||||
|
||||
def test_trivial():
|
||||
assert seq_editops('abc', 'abc') == []
|
||||
assert seq_editops('', '') == []
|
||||
|
||||
|
||||
def test_insert():
|
||||
assert seq_editops('bc', 'abc') == [('insert', 0, 0)]
|
||||
assert seq_editops('ac', 'abc') == [('insert', 1, 1)]
|
||||
assert seq_editops('ab', 'abc') == [('insert', 2, 2)]
|
||||
assert seq_editops('', 'a') == [('insert', 0, 0)]
|
||||
|
||||
|
||||
def test_multiple():
|
||||
assert seq_editops('bcd', 'abce') == [('insert', 0, 0), ('replace', 2, 3)]
|
||||
|
||||
|
||||
def test_delete():
|
||||
assert seq_editops('abcdef', 'cdef') == [('delete', 0, 0), ('delete', 1, 0)]
|
||||
assert seq_editops('Xabcdef', 'Xcdef') == [('delete', 1, 1), ('delete', 2, 1)]
|
||||
assert seq_editops('abcdefg', 'acdefX') == [('delete', 1, 1), ('replace', 6, 5)]
|
||||
assert seq_editops('abcde', 'aabcd') == [('insert', 1, 1), ('delete', 4, 5)]
|
||||
assert seq_editops('Foo', '') == [('delete', 0, 0), ('delete', 1, 0), ('delete', 2, 0)]
|
||||
assert seq_editops('Foolish', 'Foo') == [('delete', 3, 3), ('delete', 4, 3), ('delete', 5, 3), ('delete', 6, 3)]
|
||||
|
||||
|
||||
def test_ambiguous():
|
||||
assert seq_editops('bcd', 'abcef') == [('insert', 0, 0), ('replace', 2, 3), ('insert', 3, 4)]
|
||||
|
||||
|
||||
def test_editops():
|
||||
"""Test editops() in cases where dealing with grapheme clusters matters"""
|
||||
|
||||
# In these cases, one of the words has a composed form, the other one does not.
|
||||
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
|
||||
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
|
||||
|
||||
|
||||
def test_editops_canonically_equivalent():
|
||||
left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
|
||||
right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
|
||||
assert left != right
|
||||
assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
|
||||
assert editops(left, right) == []
|
@ -0,0 +1,23 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import align, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_align_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
|
||||
# → 4 elements in the alignment should be different.
|
||||
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
|
||||
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
|
||||
result = list(align(gt, ocr))
|
||||
assert sum(left != right for left, right in result) == 4
|
@ -0,0 +1,35 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert character_error_rate(gt, ocr) == 4/(470 + 1 + 311) # 2 TextRegions, 1 \n
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert character_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_character_error_rate_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert character_error_rate(gt, ocr) == 8/591 # Manually verified
|
@ -0,0 +1,39 @@
|
||||
import os
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
|
||||
from ..cli import process
|
||||
|
||||
|
||||
def test_cli_json(tmp_path):
|
||||
"""Test that the cli/process() yields a loadable JSON report"""
|
||||
|
||||
# XXX Path.__str__() is necessary for Python 3.5
|
||||
with working_directory(str(tmp_path)):
|
||||
with open('gt.txt', 'w') as gtf:
|
||||
gtf.write('AAAAA')
|
||||
with open('ocr.txt', 'w') as ocrf:
|
||||
ocrf.write('AAAAB')
|
||||
|
||||
process('gt.txt', 'ocr.txt', 'report')
|
||||
with open('report.json', 'r') as jsonf:
|
||||
j = json.load(jsonf)
|
||||
assert j['cer'] == pytest.approx(0.2)
|
||||
|
||||
|
||||
def test_cli_json_cer_is_infinity(tmp_path):
|
||||
"""Test that the cli/process() yields a loadable JSON report when CER == inf"""
|
||||
|
||||
# XXX Path.__str__() is necessary for Python 3.5
|
||||
with working_directory(str(tmp_path)):
|
||||
with open('gt.txt', 'w') as gtf:
|
||||
gtf.write('') # Empty to yield CER == inf
|
||||
with open('ocr.txt', 'w') as ocrf:
|
||||
ocrf.write('Not important')
|
||||
|
||||
process('gt.txt', 'ocr.txt', 'report')
|
||||
with open('report.json', 'r') as jsonf:
|
||||
j = json.load(jsonf)
|
||||
assert j['cer'] == pytest.approx(float('inf'))
|
@ -0,0 +1,35 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import distance, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert distance(gt, ocr) == 4
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert distance(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_distance_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert distance(gt, ocr) == 8 # Manually verified
|
@ -0,0 +1,37 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from click.testing import CliRunner
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
|
||||
|
||||
from ..ocrd_cli import ocrd_dinglehopper
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
def test_ocrd_cli(tmp_path):
|
||||
"""Test OCR-D interface"""
|
||||
|
||||
# XXX Path.__str__() is necessary for Python 3.5
|
||||
|
||||
# Copy test workspace
|
||||
test_workspace_dir_source = Path(data_dir) / 'actevedef_718448162'
|
||||
test_workspace_dir = tmp_path / 'test_ocrd_cli'
|
||||
shutil.copytree(str(test_workspace_dir_source), str(test_workspace_dir))
|
||||
|
||||
# Run through the OCR-D interface
|
||||
with working_directory(str(test_workspace_dir)):
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(ocrd_dinglehopper, [
|
||||
'-m', 'mets.xml',
|
||||
'-I', 'OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI',
|
||||
'-O', 'OCR-D-OCR-CALAMARI-EVAL'
|
||||
])
|
||||
assert result.exit_code == 0
|
||||
result_json = list((test_workspace_dir / 'OCR-D-OCR-CALAMARI-EVAL').glob('*.json'))
|
||||
assert json.load(open(str(result_json[0])))['cer'] < 0.03
|
@ -0,0 +1,43 @@
|
||||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import word_error_rate, words, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. → 3 changed words
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
|
||||
|
||||
gt_word_count = 7+6+5+8+7+6+7+8+6+7+7+5+6+8+8+7+7+6+5+4 # Manually verified word count per line
|
||||
assert len(list(words(gt))) == gt_word_count
|
||||
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))
|
||||
assert word_error_rate(gt, ocr) == 3/gt_word_count
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_alto():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.gt.page.xml')))
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert gt == ocr
|
||||
assert word_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_alto_2():
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.gt.page.xml')))
|
||||
|
||||
gt_word_count = 14+18+17+14+17+17+3 # Manually verified word count per line
|
||||
assert len(list(words(gt))) == gt_word_count
|
||||
|
||||
ocr = alto_text(ET.parse(os.path.join(data_dir, 'lorem-ipsum', 'lorem-ipsum-scan-bad.ocr.tesseract.alto.xml')))
|
||||
|
||||
assert word_error_rate(gt, ocr) == 7/gt_word_count # Manually verified, 6 words are wrong, 1 got split (=2 errors)
|
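For orientation: the manually verified per-line word counts above sum to 14 + 18 + 17 + 14 + 17 + 17 + 3 = 100, so the asserted word error rate works out to 7/100 = 0.07.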
@ -0,0 +1,38 @@
from itertools import zip_longest
from typing import Iterable

import colorama
import os


def diffprint(x, y):
    """Print elements or lists x and y, with differences in red"""

    def _diffprint(x, y):
        if x != y:
            print(colorama.Fore.RED, x, y, colorama.Fore.RESET)
        else:
            print(x, y)

    if isinstance(x, Iterable):
        for xe, ye in zip_longest(x, y):
            _diffprint(xe, ye)
    else:
        _diffprint(x, y)


def unzip(l):
    return zip(*l)


class working_directory:
    """Context manager to temporarily change the working directory"""
    def __init__(self, wd):
        self.wd = wd

    def __enter__(self):
        self.old_wd = os.getcwd()
        os.chdir(self.wd)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.old_wd)
@ -0,0 +1,63 @@
from __future__ import division

import unicodedata

import uniseg.wordbreak

from .edit_distance import levenshtein


def words(s):
    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    old_word_break = uniseg.wordbreak.word_break

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
            return 'ALetter'
        else:
            return old_word_break(c, index)
    uniseg.wordbreak.word_break = new_word_break

    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
    def unwanted(c):

        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        unwanted_categories = 'O', 'M', 'P', 'Z', 'S'
        unwanted_subcategories = 'Cc', 'Cf'

        subcat = unicodedata.category(c)
        cat = subcat[0]
        return cat in unwanted_categories or subcat in unwanted_subcategories

    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation or similar characters.
    for word in uniseg.wordbreak.words(s):
        if all(unwanted(c) for c in word):
            pass
        else:
            yield word


def words_normalized(s):
    return words(unicodedata.normalize('NFC', s))


def word_error_rate(reference, compared):
    if isinstance(reference, str):
        reference_seq = list(words_normalized(reference))
        compared_seq = list(words_normalized(compared))
    else:
        reference_seq = list(reference)
        compared_seq = list(compared)

    d = levenshtein(reference_seq, compared_seq)
    if d == 0:
        return 0

    n = len(reference_seq)
    if n == 0:
        return float('inf')

    return d / n
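A short usage sketch for the module above. The import path mirrors the "from .. import word_error_rate, words" used in the tests, and the expected values in the comments are worked out from the word-breaking rules described above, so treat them as illustrative rather than authoritative:
~~~
from qurator.dinglehopper import word_error_rate, words

# Whitespace- and punctuation-only "words" are dropped by words():
print(list(words('Stet clita kasd, gubergren.')))
# ['Stet', 'clita', 'kasd', 'gubergren']

# One substituted word out of three reference words:
print(word_error_rate('Stet clita kasd', 'Stet clita kase'))
# 0.3333333333333333
~~~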
@ -1,2 +0,0 @@
from .main import *
from .ocrd_cli import *
File diff suppressed because it is too large
@ -1,19 +0,0 @@
{
    "version": "0.0.1",
    "tools": {
        "ocrd-sbb-textline-detector": {
            "executable": "ocrd-sbb-textline-detector",
            "description": "Detect lines",
            "steps": ["layout/segmentation/line"],
            "input_file_grp": [
                "OCR-D-IMG"
            ],
            "output_file_grp": [
                "OCR-D-SBB-SEG-LINE"
            ],
            "parameters": {
                "model": {"type": "string", "format": "file", "cacheable": true}
            }
        }
    }
}
@ -1,110 +0,0 @@
import json
import os
import tempfile

import click
import ocrd_models.ocrd_page
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdFile
from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType
from ocrd_utils import concat_padded, getLogger, MIMETYPE_PAGE
from pkg_resources import resource_string

from qurator.sbb_textline_detector import textline_detector

log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')

OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))


@click.command()
@ocrd_cli_options
def ocrd_sbb_textline_detector(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdSbbTextlineDetectorRecognize, *args, **kwargs)


TOOL = 'ocrd_sbb_textline_detector'


class OcrdSbbTextlineDetectorRecognize(Processor):

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        kwargs['version'] = OCRD_TOOL['version']
        super(OcrdSbbTextlineDetectorRecognize, self).__init__(*args, **kwargs)

    def _make_file_id(self, input_file, input_file_grp, n):
        file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        return file_id

    def _resolve_image_file(self, input_file: OcrdFile) -> str:
        if input_file.mimetype == MIMETYPE_PAGE:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_file = page.imageFilename
        else:
            image_file = input_file.local_filename
        return image_file

    def process(self):
        for n, page_id in enumerate(self.workspace.mets.physical_pages):
            input_file = self.workspace.mets.find_files(fileGrp=self.input_file_grp, pageId=page_id)[0]
            log.info("INPUT FILE %i / %s", n, input_file)

            file_id = self._make_file_id(input_file, self.input_file_grp, n)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass

            with tempfile.TemporaryDirectory() as tmp_dirname:
                # Segment the image
                image_file = self._resolve_image_file(input_file)
                model = self.parameter['model']
                x = textline_detector(image_file, tmp_dirname, file_id, model)
                x.run()

                # Read segmentation results
                tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
                tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename)
                tmp_page = tmp_pcgts.get_Page()

                # Create a new PAGE file from the input file
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # Merge results → PAGE file
                page.set_PrintSpace(tmp_page.get_PrintSpace())
                page.set_ReadingOrder(tmp_page.get_ReadingOrder())
                page.set_TextRegion(tmp_page.get_TextRegion())

                # Save metadata about this operation
                metadata = pcgts.get_Metadata()
                metadata.add_MetadataItem(
                    MetadataItemType(type_="processingStep",
                                     name=self.ocrd_tool['steps'][0],
                                     value=TOOL,
                                     Labels=[LabelsType(
                                         externalModel="ocrd-tool",
                                         externalId="parameters",
                                         Label=[LabelType(type_=name, value=self.parameter[name])
                                                for name in self.parameter.keys()])]))

                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype='application/vnd.prima.page+xml',
                    local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
                    content=ocrd_models.ocrd_page.to_xml(pcgts)
                )


if __name__ == '__main__':
    ocrd_sbb_textline_detector()
@ -1,10 +1,7 @@
opencv-python-headless
matplotlib
seaborn
tqdm
keras
shapely
scikit-learn
tensorflow-gpu < 2.0
scipy
ocrd >= 2.0.0
click
jinja2
lxml
uniseg
numpy
colorama
ocrd >= 1.0.0b15