mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-05 08:30:01 +02:00
🐛 Move source into src/ to fix install
Installing was broken since moving to pyproject.toml, which we didn't notice because of
leftover files in build/. Fix this by using the convention of having the source files
in src/ and adjusting pyproject.toml accordingly.
Fixes gh-86. 🤞
This commit is contained in:
parent
db7c051b22
commit
325e5af5f5
84 changed files with 2 additions and 3 deletions
5
src/dinglehopper/__init__.py
Normal file
5
src/dinglehopper/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
from .ocr_files import *
|
||||
from .extracted_text import *
|
||||
from .character_error_rate import *
|
||||
from .word_error_rate import *
|
||||
from .align import *
|
44
src/dinglehopper/align.py
Normal file
44
src/dinglehopper/align.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
from .edit_distance import *
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
|
||||
|
||||
def align(t1, t2):
    """Align two texts grapheme cluster by grapheme cluster.

    Both texts are NFC-normalized and split into grapheme clusters; the two
    resulting lists are passed to seq_align() and its result is returned.
    """
    left, right = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (t1, t2)
    )
    return seq_align(left, right)
|
||||
|
||||
|
||||
def seq_align(s1, s2):
    """Align general sequences.

    Generator yielding (left, right) pairs that walk both sequences in order:

    * (s1[i], s2[j]) when no edit operation applies at the current positions,
      or for a "replace" operation,
    * (s1[i], None) for a "delete" operation,
    * (None, s2[j]) for an "insert" operation.
    """
    s1 = list(s1)
    s2 = list(s2)

    # Walk the edit operations with an iterator instead of deleting the head of
    # the Editops object on every step (which shifts all remaining operations).
    # Each operation is a (tag, position in s1, position in s2) triple.
    pending = iter(Levenshtein.editops(s1, s2).as_list())
    op = next(pending, None)

    i = 0
    j = 0
    while i < len(s1) or j < len(s2):
        tag = None
        if op is not None and op[1] == i and op[2] == j:
            # The next operation applies exactly here: consume it.
            tag = op[0]
            op = next(pending, None)

        if tag is None:
            # No operation at this position, emit the pair as is.
            yield s1[i], s2[j]
            i += 1
            j += 1
        elif tag == "insert":
            yield None, s2[j]
            j += 1
        elif tag == "delete":
            yield s1[i], None
            i += 1
        elif tag == "replace":
            yield s1[i], s2[j]
            i += 1
            j += 1
|
47
src/dinglehopper/character_error_rate.py
Normal file
47
src/dinglehopper/character_error_rate.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
from multimethod import multimethod
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .edit_distance import distance
|
||||
from .extracted_text import ExtractedText
|
||||
|
||||
|
||||
@multimethod
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
    """
    Compute character error rate.

    The rate is the edit distance divided by the number of grapheme clusters
    of the NFC-normalized reference. Identical texts give 0; an empty
    reference with a non-zero distance gives infinity.

    :return: character error rate and length of the reference
    """
    n_errors = distance(reference, compared)
    reference_clusters = grapheme_clusters(unicodedata.normalize("NFC", reference))
    n_reference = len(list(reference_clusters))

    if n_errors == 0:
        cer = 0
    elif n_reference == 0:
        cer = float("inf")
    else:
        cer = n_errors / n_reference
    return cer, n_reference
|
||||
|
||||
# XXX Should we really count newlines here?
|
||||
|
||||
|
||||
@multimethod
def character_error_rate_n(
    reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
    """
    Compute character error rate of two ExtractedText objects.

    Dispatches to the overload for strings using the .text of both arguments.

    :return: character error rate and length of the reference
    """
    reference_text = reference.text
    compared_text = compared.text
    return character_error_rate_n(reference_text, compared_text)
|
||||
|
||||
|
||||
def character_error_rate(reference, compared) -> float:
    """
    Compute character error rate.

    Convenience wrapper around character_error_rate_n() that drops the length
    of the reference.

    :return: character error rate
    """
    return character_error_rate_n(reference, compared)[0]
|
239
src/dinglehopper/cli.py
Normal file
239
src/dinglehopper/cli.py
Normal file
|
@ -0,0 +1,239 @@
|
|||
import os
|
||||
from collections import Counter
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from markupsafe import escape
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from dinglehopper.character_error_rate import character_error_rate_n
|
||||
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
|
||||
from dinglehopper.align import seq_align
|
||||
from dinglehopper.extracted_text import ExtractedText
|
||||
from dinglehopper.ocr_files import extract
|
||||
from dinglehopper.config import Config
|
||||
|
||||
|
||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
    """Render an HTML side-by-side diff of ground truth and OCR.

    :param gt_in: ground truth, either an ExtractedText or a sequence of items
        (e.g. words)
    :param ocr_in: OCR result, of the same kind as gt_in
    :param css_prefix: prefix for the CSS class given to each differing pair
    :param joiner: string put in front of every rendered item
    :param none: placeholder shown where one side has no counterpart
    :param differences: if true, count the differing pairs
    :return: tuple of the HTML fragment and a dict mapping "gt :: ocr" to the
        number of times that difference occurred (empty unless differences
        is true)
    :raises TypeError: if gt_in is an ExtractedText but ocr_in is not
    """

    def format_thing(t, css_classes=None, id_=None):
        """Render one aligned item, wrapped in a <span> if it has CSS classes."""
        if t is None:
            html_t = none
            # Tolerate the default css_classes=None instead of failing on "+=".
            css_classes = (css_classes or "") + " ellipsis"
        elif t == "\n":
            html_t = "<br>"
        else:
            html_t = escape(t)

        html_custom_attrs = ""

        # Set Bootstrap tooltip to the segment id
        if id_:
            # The id ends up inside an attribute value, so it must be escaped.
            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(
                escape(id_)
            )

        if css_classes:
            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
                css_classes=css_classes,
                html_t=html_t,
                html_custom_attrs=html_custom_attrs,
            )
        else:
            return "{html_t}".format(html_t=html_t)

    if isinstance(gt_in, ExtractedText):
        if not isinstance(ocr_in, ExtractedText):
            raise TypeError()
        # XXX splitting should be done in ExtractedText
        gt_things = list(grapheme_clusters(gt_in.text))
        ocr_things = list(grapheme_clusters(ocr_in.text))
    else:
        gt_things = gt_in
        ocr_things = ocr_in

    # Current character offsets into gt_in.text / ocr_in.text, used to look up
    # the segment id of an item.
    g_pos = 0
    o_pos = 0
    found_differences = []
    # Collect the pieces and join once instead of growing strings with "+=".
    gt_parts = []
    ocr_parts = []

    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
        css_classes = None
        gt_id = None
        ocr_id = None
        if g != o:
            css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
            if isinstance(gt_in, ExtractedText):
                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced

            if differences:
                found_differences.append(f'{g} :: {o}')

        gt_parts.append(joiner + format_thing(g, css_classes, gt_id))
        ocr_parts.append(joiner + format_thing(o, css_classes, ocr_id))

        if g is not None:
            g_pos += len(g)
        if o is not None:
            o_pos += len(o)

    found_differences = dict(Counter(found_differences))

    return (
        """
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        """.format(
            "".join(gt_parts), "".join(ocr_parts)
        ),
        found_differences,
    )
|
||||
|
||||
|
||||
def json_float(value):
    """Convert a float value to a JSON float.

    This is here so that float('inf') yields "Infinity", not "inf". For the
    same reason NaN yields "NaN", not "nan". Every other value is returned as
    str(value).
    """
    # NaN is the only value that does not compare equal to itself. Unlike
    # math.isnan() this comparison also accepts non-numeric input, which then
    # falls through to str() as before.
    if value != value:
        return "NaN"
    if value == float("inf"):
        return "Infinity"
    if value == float("-inf"):
        return "-Infinity"
    return str(value)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
            differences=False, textequiv_level="region"):
    """Check OCR result against GT.

    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
    Click on a wrapper.

    Extracts the text of both files, computes CER and WER, renders the
    character and word level diffs and writes
    <reports_folder>/<report_prefix>.html and .json from the "report" templates
    in the "templates" directory next to this module.

    :param gt: path of the ground truth file
    :param ocr: path of the OCR file
    :param report_prefix: file name of the reports, without suffix
    :param reports_folder: directory the reports are written to; created
        (including parents) if it does not exist
    :param metrics: passed on to the templates
    :param differences: collect the differing character/word pairs and pass
        them on to the templates
    :param textequiv_level: passed on to extract()
    """

    gt_text = extract(gt, textequiv_level=textequiv_level)
    ocr_text = extract(ocr, textequiv_level=textequiv_level)

    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
    wer, n_words = word_error_rate_n(gt_text, ocr_text)

    char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c",
                                               joiner="",
                                               none="·", differences=differences)

    gt_words = words_normalized(gt_text)
    ocr_words = words_normalized(ocr_text)
    word_diff_report, diff_w = gen_diff_report(
        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
        differences=differences
    )

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    # Create the output folder once. makedirs also handles nested paths and a
    # folder that appears between check and creation.
    os.makedirs(reports_folder, exist_ok=True)

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
            gt=gt,
            ocr=ocr,
            cer=cer,
            n_characters=n_characters,
            wer=wer,
            n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
            differences=differences,
            diff_c=diff_c,
            diff_w=diff_w,
        ).dump(out_fn)
|
||||
|
||||
|
||||
def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
                textequiv_level):
    """Compare every file in the GT directory with its namesake in the OCR directory.

    Each entry of gt that is a regular file in both directories is handed to
    process(), with "<file name>-<report_prefix>" as the report prefix. For any
    other entry a "Skipping" line is printed.
    """
    for name in os.listdir(gt):
        gt_path = os.path.join(gt, name)
        ocr_path = os.path.join(ocr, name)

        if not (os.path.isfile(gt_path) and os.path.isfile(ocr_path)):
            print("Skipping {0} and {1}".format(gt_path, ocr_path))
            continue

        process(
            gt_path,
            ocr_path,
            f"{name}-{report_prefix}",
            reports_folder=reports_folder,
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
        )
|
||||
|
||||
|
||||
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.argument("reports_folder", type=click.Path(), default=".")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
@click.option(
    "--differences",
    default=False,
    help="Enable reporting character and word level differences"
)
@click.option(
    "--textequiv-level",
    default="region",
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level,
         progress):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

    dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract
    their text and falls back to plain text if no ALTO or PAGE is detected.

    The files GT and OCR are usually a ground truth document and the result of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
    where $REPORTS_FOLDER defaults to the current working directory and
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # The docstring above is the --help text of the command.
    initLogging()
    Config.progress = progress

    # Single file mode
    if not os.path.isdir(gt):
        process(gt, ocr, report_prefix, reports_folder, metrics=metrics,
                differences=differences, textequiv_level=textequiv_level)
        return

    # Directory mode: both arguments have to be directories
    if not os.path.isdir(ocr):
        raise click.BadParameter(
            "OCR must be a directory if GT is a directory", param_hint="ocr"
        )
    process_dir(gt, ocr, report_prefix, reports_folder, metrics,
                differences, textequiv_level)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
34
src/dinglehopper/cli_extract.py
Normal file
34
src/dinglehopper/cli_extract.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
import os
|
||||
|
||||
import click
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import extract
|
||||
|
||||
|
||||
@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "--textequiv-level",
    default="region",
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
)
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    # The docstring above is the --help text of the command.
    initLogging()
    extracted = extract(input_file, textequiv_level=textequiv_level)
    print(extracted.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
146
src/dinglehopper/cli_line_dirs.py
Normal file
146
src/dinglehopper/cli_line_dirs.py
Normal file
|
@ -0,0 +1,146 @@
|
|||
import os
|
||||
import sys
|
||||
import itertools
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from markupsafe import escape
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from .character_error_rate import character_error_rate_n
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
from .align import seq_align
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import plain_extract
|
||||
from .config import Config
|
||||
from .cli import gen_diff_report, json_float
|
||||
|
||||
|
||||
def all_equal(iterable):
    """Return True if all items of the iterable are equal (or there are none)."""
    runs = itertools.groupby(iterable)
    # Skip the first run of equal items, if any ...
    next(runs, None)
    # ... a second run only exists if some item differs.
    return next(runs, None) is None
|
||||
|
||||
|
||||
def common_prefix(its):
    """Return the list of leading items shared by all given iterables."""
    prefix = []
    for column in zip(*its):
        # Stop at the first position where the iterables disagree.
        if not all_equal(column):
            break
        prefix.append(column[0])
    return prefix
|
||||
|
||||
|
||||
def common_suffix(its):
    """Return a reverse iterator over the trailing items shared by all given sequences."""
    # The common suffix is the common prefix of the reversed sequences, reversed.
    flipped = (reversed(it) for it in its)
    return reversed(common_prefix(flipped))
|
||||
|
||||
|
||||
def removesuffix(text, suffix):
    """Return text without the trailing suffix; unchanged if it does not end with it."""
    if not suffix or not text.endswith(suffix):
        return text
    return text[: len(text) - len(suffix)]
|
||||
|
||||
|
||||
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    """Compare the line text files in gt_dir with their counterparts in ocr_dir.

    The suffix shared by all file names of a directory is determined for both
    directories; the OCR file for a GT file is found by swapping these
    suffixes. CER and WER are aggregated over all lines, the diff reports of
    all lines are concatenated, and <report_prefix>.html and .json are written
    from the "report" templates in the "templates" directory next to this
    module.

    :param gt_dir: directory with the ground truth line texts
    :param ocr_dir: directory with the OCR line texts
    :param report_prefix: path of the reports, without suffix
    :param metrics: passed on to the templates
    """
    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))

    cer = None
    n_characters = None
    char_diff_report = ""
    wer = None
    n_words = None
    word_diff_report = ""

    for k, gt in enumerate(os.listdir(gt_dir)):
        # Find a match by replacing the suffix
        ocr = removesuffix(gt, gt_suffix) + ocr_suffix

        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
        ocr_text = plain_extract(
            os.path.join(ocr_dir, ocr), include_filename_in_id=True
        )

        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
        if cer is None:
            cer, n_characters = l_cer, l_n_characters
        else:
            # Rolling update
            # NOTE(review): divides by zero while all lines so far have an
            # empty reference - confirm how such input should be handled.
            cer = (cer * n_characters + l_cer * l_n_characters) / (
                n_characters + l_n_characters
            )
            n_characters = n_characters + l_n_characters

        # Compute WER
        l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
        if wer is None:
            wer, n_words = l_wer, l_n_words
        else:
            # Rolling update
            wer = (wer * n_words + l_wer * l_n_words) / (n_words + l_n_words)
            n_words = n_words + l_n_words

        # Generate diff reports
        # gen_diff_report() returns (html, differences); only the HTML is used
        # here. Adding the tuple itself to the string would raise a TypeError.
        l_char_diff_report, _ = gen_diff_report(
            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
        )
        char_diff_report += l_char_diff_report
        gt_words = words_normalized(gt_text)
        ocr_words = words_normalized(ocr_text)
        l_word_diff_report, _ = gen_diff_report(
            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
        )
        word_diff_report += l_word_diff_report

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
        out_fn = report_prefix + report_suffix

        template = env.get_template(template_fn)
        template.stream(
            gt=gt_dir,  # Note: directory
            ocr=ocr_dir,  # Note: directory
            cer=cer,
            n_characters=n_characters,
            wer=wer,
            n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
        ).dump(out_fn)
|
||||
|
||||
|
||||
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains textfiles with a common
    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
    in the OCR lines directory.

    The GT and OCR directories are usually ground truth line texts and the results of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

    """
    # The docstring above is the --help text of the command.
    initLogging()
    process(gt, ocr, report_prefix, metrics=metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
101
src/dinglehopper/cli_summarize.py
Normal file
101
src/dinglehopper/cli_summarize.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import click
|
||||
from ocrd_utils import initLogging
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
from dinglehopper.cli import json_float
|
||||
|
||||
|
||||
def process(reports_folder, occurrences_threshold=1):
    """Summarize all JSON reports found in reports_folder.

    Every "*.json" file that contains both "cer" and "wer" is taken into
    account; other JSON files are skipped with a message. The average CER and
    WER are printed, the per-report difference counts are added up, and
    summary.html / summary.json are written into reports_folder from the
    "summary" templates in the "templates" directory next to this module.

    :param reports_folder: directory containing the JSON reports
    :param occurrences_threshold: passed on to the templates
    """
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    for report in os.listdir(reports_folder):
        if report.endswith(".json"):
            # Read as UTF-8 rather than the locale's encoding: the difference
            # keys contain arbitrary non-ASCII characters.
            # NOTE(review): assumes the reports were written as UTF-8 (the
            # default of Jinja2's TemplateStream.dump) - confirm.
            with open(
                os.path.join(reports_folder, report), "r", encoding="utf-8"
            ) as f:
                report_data = json.load(f)

            if "cer" not in report_data or "wer" not in report_data:
                click.echo(
                    f"Skipping {report} because it does not contain CER and WER")
                continue

            cer = report_data["cer"]
            wer = report_data["wer"]
            cer_list.append(cer)
            wer_list.append(wer)
            cer_sum += cer
            wer_sum += wer

            # A report without a "differences" section contributes nothing
            # here instead of aborting the whole summary with a KeyError.
            differences = report_data.get("differences", {})
            for key, value in differences.get("character_level", {}).items():
                diff_c[key] = diff_c.get(key, 0) + value
            for key, value in differences.get("word_level", {}).items():
                diff_w[key] = diff_w.get(key, 0) + value

    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)

    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    # NOTE(review): the next two lines print the sums of the CER and WER
    # values under the label "common mistakes" - confirm the intended output.
    print(f"Sum of common mistakes: {cer_sum}")
    print(f"Sum of common mistakes: {wer_sum}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"

        out_fn = os.path.join(reports_folder, 'summary' + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
|
||||
|
||||
|
||||
@click.command()
@click.argument("reports_folder",
                type=click.Path(exists=True),
                default="./reports"
                )
@click.option("--occurrences-threshold",
              type=int,
              default=1,
              help="Only show differences that occur at least this many times.")
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes whose number of occurrences is above the threshold. The JSON report will
    always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    # The docstring above is the --help text of the command.
    initLogging()
    process(reports_folder, occurrences_threshold=occurrences_threshold)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
2
src/dinglehopper/config.py
Normal file
2
src/dinglehopper/config.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
class Config:
    """Process-wide settings, kept as class attributes."""

    # Set from the --progress command line flag of the dinglehopper command.
    progress = False
|
43
src/dinglehopper/edit_distance.py
Normal file
43
src/dinglehopper/edit_distance.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
from functools import partial, lru_cache
|
||||
from typing import Sequence, Tuple
|
||||
|
||||
import numpy as np
|
||||
from multimethod import multimethod
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from tqdm import tqdm
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
|
||||
from .extracted_text import ExtractedText
|
||||
from .config import Config
|
||||
|
||||
|
||||
@multimethod
def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

    Both strings are NFC-normalized and split into grapheme clusters first, so
    the distance counts edits of grapheme clusters rather than of code points.
    """
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", s))) for s in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)
|
||||
|
||||
|
||||
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Compute the edit distance between the texts of two ExtractedText objects."""
    text1 = s1.text
    text2 = s2.text
    return distance(text1, text2)
|
||||
|
||||
|
||||
def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.

    Both strings are NFC-normalized and split into grapheme clusters before
    the operations are computed.

    Note that this returns indices to the _grapheme clusters_, not characters!
    """

    def clusters(word):
        return list(grapheme_clusters(unicodedata.normalize("NFC", word)))

    ops = Levenshtein.editops(clusters(word1), clusters(word2))
    return ops.as_list()
|
279
src/dinglehopper/extracted_text.py
Normal file
279
src/dinglehopper/extracted_text.py
Normal file
|
@ -0,0 +1,279 @@
|
|||
import enum
|
||||
import re
|
||||
import unicodedata
|
||||
from contextlib import suppress
|
||||
from itertools import repeat
|
||||
from typing import Optional
|
||||
|
||||
import attr
|
||||
import numpy as np
|
||||
from lxml import etree as ET
|
||||
from ocrd_utils import getLogger
|
||||
|
||||
|
||||
class Normalization(enum.Enum):
    """Text normalizations that normalize() can be asked for."""

    NFC = 1
    NFC_MUFI = 2  # TODO
    NFC_SBB = 3
|
||||
|
||||
|
||||
def normalize(text, normalization):
    """Normalize text according to the given Normalization member.

    :raises NotImplementedError: for Normalization.NFC_MUFI
    :raises ValueError: for anything that is not a known normalization
    """
    if normalization == Normalization.NFC_SBB:
        normalized = substitute_equivalences(text)
    elif normalization == Normalization.NFC:
        normalized = unicodedata.normalize("NFC", text)
    elif normalization == Normalization.NFC_MUFI:
        raise NotImplementedError()
    else:
        raise ValueError()
    return normalized
|
||||
|
||||
|
||||
# XXX hack
|
||||
def normalize_sbb(t):
    """Normalize t using Normalization.NFC_SBB."""
    sbb = Normalization.NFC_SBB
    return normalize(t, sbb)
|
||||
|
||||
|
||||
def unjoin_ligatures(s):
|
||||
"""Unjoin ligatures, i.e. ff becomes ff."""
|
||||
|
||||
equivalences = {
|
||||
"": "ſſ",
|
||||
"\ueba7": "ſſi", # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
|
||||
"": "ch",
|
||||
"": "ck",
|
||||
"": "ll",
|
||||
"": "ſi",
|
||||
"": "ſt",
|
||||
"fi": "fi",
|
||||
"ff": "ff",
|
||||
"fl": "fl",
|
||||
"ffi": "ffi",
|
||||
"": "ct",
|
||||
"": "tz", # MUFI: LATIN SMALL LIGATURE TZ
|
||||
"\uf532": "as", # eMOP: Latin small ligature as
|
||||
"\uf533": "is", # eMOP: Latin small ligature is
|
||||
"\uf534": "us", # eMOP: Latin small ligature us
|
||||
"\uf535": "Qu", # eMOP: Latin ligature capital Q small u
|
||||
"ij": "ij", # U+0133 LATIN SMALL LIGATURE IJ
|
||||
"\uE8BF": "q&",
|
||||
# MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
|
||||
# XXX How to replace this correctly?
|
||||
"\uEBA5": "ſp", # MUFI: LATIN SMALL LIGATURE LONG S P
|
||||
"st": "st", # U+FB06 LATIN SMALL LIGATURE ST
|
||||
}
|
||||
s = unicodedata.normalize("NFC", s)
|
||||
for fr, to in equivalences.items():
|
||||
s = s.replace(fr, to)
|
||||
return s
|
||||
|
||||
|
||||
def substitute_equivalences(s):
|
||||
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
|
||||
# It might make sense to use different rules for GT and for the different OCR
|
||||
equivalences = {
|
||||
"": "ü",
|
||||
"": "ä",
|
||||
"==": "–", # → en-dash
|
||||
"—": "–", # em-dash → en-dash
|
||||
"": "ö",
|
||||
"’": "'",
|
||||
"⸗": "-",
|
||||
"aͤ": "ä", # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
|
||||
"oͤ": "ö", # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
|
||||
"uͤ": "ü", # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
|
||||
"\uF50E": "q́", # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
|
||||
}
|
||||
|
||||
s = unicodedata.normalize("NFC", s)
|
||||
s = unjoin_ligatures(s)
|
||||
for fr, to in equivalences.items():
|
||||
s = s.replace(fr, to)
|
||||
return s
|
||||
|
||||
|
||||
@attr.s(frozen=True)
class ExtractedText:
    r"""
    Extracted text.

    We need a segment id for each extracted text segment. As this should support
    extracting from the word (or even glyph) level, we need to have a
    hierarchical representation of the
    text due to the different "joiners" needed on each level.

    For example, here is pseudo code to get the text of a page:

    * from region texts:
        `'\n'.join(region_texts)`
    * from line texts:
        `'\n'.join('\n'.join(line_texts) for every region`)
    * from word texts:
        `'\n'.join(('\n'.join(' '.join(word_texts) for every line) for every region))`

    An ExtractedText object either contains a text itself or has child segments
    (and a joiner), not both.

    Objects of this class are guaranteed to be a. always in their normalization
    and b. in NFC.
    """

    segment_id = attr.ib(type=Optional[str])

    @segment_id.validator
    def check(self, _, value):
        """Reject segment ids that do not start with a word character, "_" or "-"."""
        if value is None:
            return
        # Note: re.match only anchors at the start, so only the beginning of
        # the id is checked.
        if not re.match(r"[\w\d_-]+", value):
            raise ValueError('Malformed segment id "{}"'.format(value))

    # An object contains either
    # a. _text itself
    # b. or segments (ExtractedText) and a joiner

    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
    joiner = attr.ib(type=Optional[str])
    _text = attr.ib(type=Optional[str])

    @segments.validator
    def check(self, _, value):
        """Reject objects that have both segments and a text."""
        if value is not None and self._text is not None:
            raise ValueError("Can't have both segments and text")

    @_text.validator
    def check(self, _, value):
        """Reject a text given alongside segments, or not in NFC / not normalized."""
        if value is not None and self.segments is not None:
            raise ValueError("Can't have both segments and text")
        if value is not None and unicodedata.normalize("NFC", value) != value:
            raise ValueError('String "{}" is not in NFC.'.format(value))
        if value is not None and normalize(value, self.normalization) != value:
            raise ValueError('String "{}" is not normalized.'.format(value))

    normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)

    @property
    def text(self):
        """The text itself, or the texts of all segments joined with the joiner."""
        if self._text is not None:
            return self._text
        else:
            return self.joiner.join(s.text for s in self.segments)

    # Cache for segment_id_for_pos(), filled on its first call.
    _segment_id_for_pos = None

    def segment_id_for_pos(self, pos):
        """Return the id of the segment the character at index pos of .text belongs to.

        Positions that fall on a joiner between two segments give None.
        """
        # Calculate segment ids once, on the first call
        if self._segment_id_for_pos is None:
            if self._text is not None:
                segment_id_for_pos = list(repeat(self.segment_id, len(self._text)))
            else:
                # Recurse
                segment_id_for_pos = []
                joiner_len = len(self.joiner)
                for s in self.segments:
                    seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
                    segment_id_for_pos.extend(seg_ids)
                    segment_id_for_pos.extend(repeat(None, joiner_len))
                # Drop the joiner after the last segment. Slicing with
                # [:-joiner_len] would empty the whole list for an empty
                # joiner, as [:-0] is [:0].
                if joiner_len:
                    del segment_id_for_pos[-joiner_len:]

            # This is frozen, so we have to jump through the hoop:
            object.__setattr__(self, "_segment_id_for_pos", segment_id_for_pos)

        return self._segment_id_for_pos[pos]

    @classmethod
    def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
        """Build an ExtractedText from a PAGE content text element"""

        localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
        textequiv_level_for_localname = invert_dict(localname_for_textequiv_level)
        children_for_localname = {"TextRegion": "TextLine"}
        joiner_for_textequiv_level = {"line": "\n"}

        segment_id = text_segment.attrib["id"]
        localname = ET.QName(text_segment).localname
        if localname == localname_for_textequiv_level[textequiv_level]:
            # The element is on the requested level: take its own text.
            segment_text = None
            with suppress(AttributeError):
                segment_text = get_textequiv_unicode(text_segment, nsmap)
                # FIXME hardcoded SBB normalization
                segment_text = normalize_sbb(segment_text)
            segment_text = segment_text or ""
            return cls(segment_id, None, None, segment_text)
        else:
            # Recurse
            sub_localname = children_for_localname[localname]
            sub_textequiv_level = textequiv_level_for_localname[sub_localname]
            segments = []
            for sub_segment in text_segment.iterfind(
                "./page:%s" % sub_localname, namespaces=nsmap
            ):
                segments.append(
                    ExtractedText.from_text_segment(
                        sub_segment, nsmap, textequiv_level=sub_textequiv_level
                    )
                )
            joiner = joiner_for_textequiv_level[sub_textequiv_level]
            return cls(segment_id, segments, joiner, None)

    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """Build an ExtractedText without segment id from a string, normalizing it first."""
        normalized_text = normalize(text, normalization)
        return cls(None, None, None, normalized_text, normalization=normalization)
|
||||
|
||||
|
||||
def invert_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    If several keys share a value, the key that comes last in ``d`` wins.
    """
    return dict(zip(d.values(), d.keys()))
|
||||
|
||||
|
||||
def get_textequiv_unicode(text_segment, nsmap) -> str:
    """Return the TextEquiv/Unicode text of a PAGE text element.

    An element without any TextEquiv child, or whose Unicode element has no
    text, yields the empty string.
    """
    segment_id = text_segment.attrib["id"]
    textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)

    if textequivs:
        chosen = get_first_textequiv(textequivs, segment_id)
        unicode_element = chosen.find("./page:Unicode", namespaces=nsmap)
        return unicode_element.text or ""

    return ""
|
||||
|
||||
|
||||
def get_first_textequiv(textequivs, segment_id):
    """Pick the preferred TextEquiv of a segment.

    A single TextEquiv is returned as is. Otherwise the one with the lowest
    numeric ``index`` is chosen; if no ``index`` is usable, the one with the
    highest numeric ``conf``; if neither is usable, the first entry.
    ``segment_id`` is only used in log messages.
    """
    log = getLogger("processor.OcrdDinglehopperEvaluate")
    if len(textequivs) == 1:
        return textequivs[0]

    # First choice: order by the index attribute.
    indices = np.array([get_attr(te, "index") for te in textequivs], dtype=float)
    index_missing = np.isnan(indices)
    if not np.all(index_missing):
        if np.any(index_missing):
            log.warning("TextEquiv without index in %s.", segment_id)
        return textequivs[np.nanargmin(indices)]

    # Second choice: order by the conf attribute.
    confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
    if not np.all(np.isnan(confidences)):
        log.info(
            "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
            segment_id,
        )
        return textequivs[np.nanargmax(confidences)]

    # Neither index nor conf is usable: fall back to the first entry.
    log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
    return textequivs[0]
|
||||
|
||||
|
||||
def get_attr(te, attr_name) -> float:
    """Extract the attribute for the given name as a float.

    Note: currently only handles numeric values!
    Missing or non-numeric values are encoded as np.nan.

    :param te: element whose ``attrib`` mapping is read
    :param attr_name: name of the attribute to look up
    :return: the attribute converted to float, or np.nan
    """
    attr_value = te.attrib.get(attr_name)
    try:
        return float(attr_value)
    except (TypeError, ValueError):
        # TypeError: the attribute is absent (value is None).
        # ValueError: the attribute is text that is not a number.
        return np.nan
|
946
src/dinglehopper/notebooks/Levenshtein.ipynb
Normal file
946
src/dinglehopper/notebooks/Levenshtein.ipynb
Normal file
|
@ -0,0 +1,946 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import unicodedata\n",
|
||||
"import inspect"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Levenshtein edit distance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rapidfuzz.distance.Levenshtein import distance as levenshtein"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"assert levenshtein('a', 'a') == 0\n",
|
||||
"assert levenshtein('a', 'b') == 1\n",
|
||||
"assert levenshtein('Foo', 'Bar') == 3\n",
|
||||
"assert levenshtein('', '') == 0\n",
|
||||
"assert levenshtein('Foo', '') == 3\n",
|
||||
"assert levenshtein('', 'Foo') == 3\n",
|
||||
"assert levenshtein('Fnord', 'Food') == 2\n",
|
||||
"assert levenshtein('Müll', 'Mull') == 1\n",
|
||||
"assert levenshtein('Abstand', 'Sand') == 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This fails for different representations of the \"same\" canonically equivalent string:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word1 = unicodedata.normalize('NFC', 'Schlyñ')\n",
|
||||
"word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n",
|
||||
"levenshtein(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Same, but for grapheme clusters\n",
|
||||
"from uniseg.graphemecluster import grapheme_clusters\n",
|
||||
"\n",
|
||||
"word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schlyñ')))\n",
|
||||
"word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schlyñ')))\n",
|
||||
"levenshtein(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Better."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's define an edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"@multimethod\n",
|
||||
"def distance(s1: str, s2: str):\n",
|
||||
" \"\"\"Compute the Levenshtein edit distance between two Unicode strings\n",
|
||||
"\n",
|
||||
" Note that this is different from levenshtein() as this function knows about Unicode\n",
|
||||
" normalization and grapheme clusters. This should be the correct way to compare two\n",
|
||||
" Unicode strings.\n",
|
||||
" \"\"\"\n",
|
||||
" seq1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s1)))\n",
|
||||
" seq2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", s2)))\n",
|
||||
" return levenshtein(seq1, seq2)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qurator.dinglehopper.edit_distance import distance\n",
|
||||
"print(inspect.getsource(distance))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word1 = unicodedata.normalize('NFC', 'Schlyñ')\n",
|
||||
"word2 = unicodedata.normalize('NFD', 'Schlyñ') # Different, decomposed!\n",
|
||||
"\n",
|
||||
"distance(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word1 = 'Schlyñ'\n",
|
||||
"word2 = 'Schlym̃'\n",
|
||||
"#print('Lengths, as far as Python is concerned:', len(word1), len(word2)) # → gives 6 and 7!\n",
|
||||
"distance(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Edit operations\n",
|
||||
"\n",
|
||||
"python-Levenshtein and RapidFuzz support backtracing, i.e. giving a sequence of edit operations that transforms one word into another:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('replace', 2, 2)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from rapidfuzz.distance.Levenshtein import editops\n",
|
||||
"\n",
|
||||
"editops('Foo', 'Fon')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('insert', 4, 4)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(editops('Käptn', 'Käpt\\'n'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('delete', 6, 6)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(editops('Delete something', 'Deletesomething'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(editops('A more difficult example', 'Amore difficült exampl'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's try it with a difficult example that needs grapheme cluster handling:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('insert', 5, 5), ('replace', 5, 6)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
|
||||
"\n",
|
||||
"editops(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"That doesn't look right, let's redefine it with grapheme cluster support:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"def editops(word1, word2):\n",
|
||||
" \"\"\"\n",
|
||||
" Return sequence of edit operations transforming one string to another.\n",
|
||||
"\n",
|
||||
" Note that this returns indices to the _grapheme clusters_, not characters!\n",
|
||||
" \"\"\"\n",
|
||||
" word1 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word1)))\n",
|
||||
" word2 = list(grapheme_clusters(unicodedata.normalize(\"NFC\", word2)))\n",
|
||||
" return levenshtein_editops(word1, word2)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qurator.dinglehopper.edit_distance import editops\n",
|
||||
"print(inspect.getsource(editops))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('replace', 5, 5)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word1 = 'Schlyñ' # with LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"word2 = 'Schlym̃' # with LATIN SMALL LETTER M + COMBINING TILDE\n",
|
||||
"\n",
|
||||
"editops(word1, word2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"🎉\n",
|
||||
"\n",
|
||||
"Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Character error rate\n",
|
||||
"\n",
|
||||
"[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:\n",
|
||||
"\n",
|
||||
"$$\n",
|
||||
"\\text{CER} = \\frac{i + s + d}{n}\n",
|
||||
"$$\n",
|
||||
"\n",
|
||||
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Because our edit distance is equal to $i + s + d$, we can thus define:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"def character_error_rate(reference, compared) -> float:\n",
|
||||
" \"\"\"\n",
|
||||
" Compute character error rate.\n",
|
||||
"\n",
|
||||
" :return: character error rate\n",
|
||||
" \"\"\"\n",
|
||||
" cer, _ = character_error_rate_n(reference, compared)\n",
|
||||
" return cer\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qurator.dinglehopper.character_error_rate import character_error_rate\n",
|
||||
"print(inspect.getsource(character_error_rate))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"assert character_error_rate('Foo', 'Bär') == 3/3\n",
|
||||
"assert character_error_rate('Fnord', 'Food') == 2/5\n",
|
||||
"assert character_error_rate('Food', 'Fnord') == 2/4\n",
|
||||
"assert character_error_rate('Schlyñ', 'Schlym̃') == 1/6"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.\n",
|
||||
"gt = \"\"\"115 über die vielen Sorgen wegen deſſelben vergaß Hartkopf, der Frau Amtmännin das ver⸗ ſprochene zu überliefern. — Ein Erpreſſer wurde an ihn abgeſchickt, um ihn ums Him⸗ melswillen zu ſagen, daß er das Verſprochene gleich den Augenblick überbringen möchte, die Frau Amtmännin hätte ſich auf ihn verlaſſen, und nun wüßte ſie nicht, was ſie anfangen ſollte. Den Augenblick ſollte er kommen, ſonſt vergieng ſie in ihrer Angſt. — Die Gäſte wären ſchon angekommen, und es fehlte ihr doch noch an allem. — Hartkopf mußte ſich erſt beſinnen, und endlich nach langem Nachdenken fiel es ihm erſt wieder ein. — Er langte den Zettel aus dem Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das, was da wäre, herbeyſchaffen möchte. Jndeß mangelten doch einige Generalia, die alſo wegfielen. — Hartkopf gieng ſelbſt mit und überbrachte es. — „Herr Jemine! er böſer Mann!“ — ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der voll gedrückt, gerüttelt und überflüſſig in ihren Schoos gegeben werden ſollte, mit Augen voller Freu⸗ H 2\"\"\"\n",
|
||||
"tess = \"\"\"emm unmit; Lis Übey die vielen Sorgen wegen\" deſſelben vergaß Hartkopf, der Frau! Amimännin das- ver ſprochene zu überliefeen. ==\" Ein Epypreſſer- wurde an ihn abgeſchieet', um' ihn ums Hime melswillen zu ſagen, \"daß er das Verſyrochene leich den Augenblick \"überbringen möchte, die Frau Amtmännin hätte ſich auf ihn veriaſſen, und nun wüßte ſie- nicht, was ſie anfangen ſollte, =! 'Den Augenblick ſollte \"er kommen, ſonſt vergieng ſie in ihrer Angſt. == Die Säuaſie- wären. ſchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mußte ſich erſt TIM und endlich mach langem Rachdenken fiel es ihm erſt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ſagte ſeiner Frau, daß ſie das , was da wäre, herbeyſchaffen mschte. ZIudeß „mangelten doch einige Generalia, die alſo wegfielen. == ' Havrkopf gieng ſelbſt mit und überbrachte es == | „Herr Jemine! er böſer Mann 1-2 ſchrie ihm die Frau Amtmännin entgegen, und ſchlug ihn auf die Schulter und blickte den Korb, der - voll gedrückt, gerüttelt und überfirfſig in ihren Ss HEILE werden ſolite, mit Augen voller EE) Fron?\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.1190\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('{:.4f}'.format(character_error_rate(gt, tess)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.1190253045923149"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"levenshtein(gt, tess)/len(gt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for w in gt, tess:\n",
|
||||
" for g in grapheme_clusters(w):\n",
|
||||
" assert len(g) == 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Maybe ocrevalUAtion doesn't count whitespace?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'115überdievielenSorgenwegendeſſelbenvergaßHartkopf,derFrauAmtmännindasver⸗ſprochenezuüberliefern.—EinErpreſſerwurdeanihnabgeſchickt,umihnumsHim⸗melswillenzuſagen,daßerdasVerſprochenegleichdenAugenblicküberbringenmöchte,dieFrauAmtmänninhätteſichaufihnverlaſſen,undnunwüßteſienicht,wasſieanfangenſollte.DenAugenblickſollteerkommen,ſonſtvergiengſieinihrerAngſt.—DieGäſtewärenſchonangekommen,undesfehlteihrdochnochanallem.—Hartkopfmußteſicherſtbeſinnen,undendlichnachlangemNachdenkenfielesihmerſtwiederein.—ErlangtedenZettelausdemAccisbucheheraus,undſagteſeinerFrau,daßſiedas,wasdawäre,herbeyſchaffenmöchte.JndeßmangeltendocheinigeGeneralia,diealſowegfielen.—Hartkopfgiengſelbſtmitundüberbrachtees.—„HerrJemine!erböſerMann!“—ſchrieihmdieFrauAmtmänninentgegen,undſchlugihnaufdieSchulterundblicktedenKorb,dervollgedrückt,gerütteltundüberflüſſiginihrenSchoosgegebenwerdenſollte,mitAugenvollerFreu⸗H2'"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def remove_whitespace(s):\n",
|
||||
" return s.replace(' ', '')\n",
|
||||
"remove_whitespace(gt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.1324\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('{:.4f}'.format(character_error_rate(remove_whitespace(gt), remove_whitespace(tess))))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now it's larger than ocrevalUAtion 🤷♂️"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Word error rate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Word segmentation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Naively split on spaces.\n",
|
||||
"\n",
|
||||
"(Note: ocrevalUAtion does confusing things here, like the Token splitting in a hash function, with an empty pattern?!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def naive_word_split(s):\n",
|
||||
" return s.split(' ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"example_text = \"The quick (“brown”) fox can't jump 32.3 feet, right?\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['The',\n",
|
||||
" 'quick',\n",
|
||||
" '(“brown”)',\n",
|
||||
" 'fox',\n",
|
||||
" \"can't\",\n",
|
||||
" 'jump',\n",
|
||||
" '32.3',\n",
|
||||
" 'feet,',\n",
|
||||
" 'right?']"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"naive_word_split(example_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's do it the Unicode way (Unicode Standard Annex #29, UAX #29, on Unicode Text Segmentation): Split on word boundaries using the uniseg library and ignore words that contain only whitespace, punctuation \"and similar characters\":"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"@multimethod\n",
|
||||
"def words(s: str):\n",
|
||||
" \"\"\"Extract words from a string\"\"\"\n",
|
||||
"\n",
|
||||
" # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also\n",
|
||||
" # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt\n",
|
||||
" old_word_break = uniseg.wordbreak.word_break\n",
|
||||
"\n",
|
||||
" def new_word_break(c, index=0):\n",
|
||||
" if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area\n",
|
||||
" return \"ALetter\"\n",
|
||||
" else:\n",
|
||||
" return old_word_break(c, index)\n",
|
||||
"\n",
|
||||
" uniseg.wordbreak.word_break = new_word_break\n",
|
||||
"\n",
|
||||
" # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar\n",
|
||||
" def unwanted(c):\n",
|
||||
"\n",
|
||||
" # See https://www.fileformat.info/info/unicode/category/index.htm\n",
|
||||
" # and https://unicodebook.readthedocs.io/unicode.html#categories\n",
|
||||
" unwanted_categories = \"O\", \"M\", \"P\", \"Z\", \"S\"\n",
|
||||
" unwanted_subcategories = \"Cc\", \"Cf\"\n",
|
||||
"\n",
|
||||
" subcat = unicodedata.category(c)\n",
|
||||
" cat = subcat[0]\n",
|
||||
" return cat in unwanted_categories or subcat in unwanted_subcategories\n",
|
||||
"\n",
|
||||
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
|
||||
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
|
||||
" for word in uniseg.wordbreak.words(s):\n",
|
||||
" if all(unwanted(c) for c in word):\n",
|
||||
" pass\n",
|
||||
" else:\n",
|
||||
" yield word\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['The', 'quick', 'brown', 'fox', \"can't\", 'jump', '32.3', 'feet', 'right']"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qurator.dinglehopper.word_error_rate import words\n",
|
||||
"print(inspect.getsource(words))\n",
|
||||
"\n",
|
||||
"list(words(example_text))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Der',\n",
|
||||
" 'schnelle',\n",
|
||||
" 'braune',\n",
|
||||
" 'Fuchs',\n",
|
||||
" 'kann',\n",
|
||||
" 'keine',\n",
|
||||
" '3,14',\n",
|
||||
" 'Meter',\n",
|
||||
" 'springen',\n",
|
||||
" 'oder']"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(words('Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(words('Dies ist ein Beispielsatz. Oh, ja.'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['我', '很', '高', '興', '跟', '你', '見', '面']"
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(words('我很高興跟你見面')) # \"Pleased to meet you\" in Mandarin, Traditional writing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['医', '者', 'を', '呼', 'ん', 'で', 'く', 'だ', 'さ', 'い']"
|
||||
]
|
||||
},
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(words('医者を呼んでください。'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Word error rate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For the word error rate, normalize again and compare sequences of words."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"def word_error_rate(reference, compared) -> float:\n",
|
||||
" wer, _ = word_error_rate_n(reference, compared)\n",
|
||||
" return wer\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qurator.dinglehopper.word_error_rate import word_error_rate\n",
|
||||
"print(inspect.getsource(word_error_rate))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.25"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.75"
|
||||
]
|
||||
},
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word_error_rate('Fnord ist verdampfter Kräutertee!', 'Fnòrd ist verdmpfter Krautertee.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.18823529411764706"
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"word_error_rate(gt, tess)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a little larger than the ocrevalUAtion result!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"hide_input": false,
|
||||
"kernelspec": {
|
||||
"display_name": "dinglehopper-github",
|
||||
"language": "python",
|
||||
"name": "dinglehopper-github"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.12"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,558 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import unicodedata"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def list_characters(s):\n",
|
||||
" \"\"\"List characters of string s, as seen by Python\"\"\"\n",
|
||||
" for c in s:\n",
|
||||
" print(c, end=' ')\n",
|
||||
" if unicodedata.combining(c):\n",
|
||||
" print(end=' ')\n",
|
||||
" print(unicodedata.name(c))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Comparing two Unicode strings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"̃ COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"words = [unicodedata.normalize('NFC', 'Schlyñ'), unicodedata.normalize('NFD', 'Schlyñ')]\n",
|
||||
"\n",
|
||||
"for s in words:\n",
|
||||
" list_characters(s)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"These two strings are different:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"words[0] == words[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And yet they are canonically equivalent:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unicodedata.normalize('NFC', words[0]) == unicodedata.normalize('NFC', words[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"→ Normalize to NFC (Normalization Form Composed) to compare. NFC is also composed, which is what we want. But it doesn't matter because we're not interested in the characters as Python sees them, but in grapheme clusters (see below.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Grapheme clusters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For evaluation we're interested in what is perceived as \"characters\". But is \"ñ\" 1 character (LATIN SMALL LETTER N WITH TILDE) or 2 (LATIN SMALL LETTER N + COMBINING TILDE)?\n",
|
||||
"\n",
|
||||
"What we probably want are [grapheme clusters](https://uniseg-python.readthedocs.io/en/latest/graphemecluster.html):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['S', 'c', 'h', 'l', 'y', 'ñ']\n",
|
||||
"['S', 'c', 'h', 'l', 'y', 'ñ']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from uniseg.graphemecluster import grapheme_clusters\n",
|
||||
"\n",
|
||||
"for w in words:\n",
|
||||
" print(list(grapheme_clusters(w)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Just looking at the interesting character – the last one – from both words:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"̃ COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for w in words:\n",
|
||||
" list_characters(list(grapheme_clusters(w))[-1])\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"→ Work with grapheme clusters, not \"characters as Python sees them\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def unicode_name(c):\n",
|
||||
" if 0xE000 <= ord(c) <= 0xF8FF:\n",
|
||||
" return 'private use character 0x{:04X}'.format(ord(c))\n",
|
||||
" else:\n",
|
||||
" return unicodedata.name(c)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def list_grapheme_clusters(s):\n",
|
||||
" \"\"\"List grapheme clusters of string s\"\"\"\n",
|
||||
" for g in grapheme_clusters(s):\n",
|
||||
" print(g, end=' ')\n",
|
||||
" if len(g) > 1:\n",
|
||||
" print('(multiple)', end=' ')\n",
|
||||
" try:\n",
|
||||
" print(', '.join(unicode_name(c) for c in g))\n",
|
||||
" except ValueError:\n",
|
||||
" print('ValueError')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ LATIN SMALL LETTER N WITH TILDE\n",
|
||||
"\n",
|
||||
"S LATIN CAPITAL LETTER S\n",
|
||||
"c LATIN SMALL LETTER C\n",
|
||||
"h LATIN SMALL LETTER H\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"y LATIN SMALL LETTER Y\n",
|
||||
"ñ (multiple) LATIN SMALL LETTER N, COMBINING TILDE\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for w in words:\n",
|
||||
" list_grapheme_clusters(w)\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"私 CJK UNIFIED IDEOGRAPH-79C1\n",
|
||||
"は HIRAGANA LETTER HA\n",
|
||||
"彼 CJK UNIFIED IDEOGRAPH-5F7C\n",
|
||||
"女 CJK UNIFIED IDEOGRAPH-5973\n",
|
||||
"が HIRAGANA LETTER GA\n",
|
||||
"お HIRAGANA LETTER O\n",
|
||||
"茶 CJK UNIFIED IDEOGRAPH-8336\n",
|
||||
"を HIRAGANA LETTER WO\n",
|
||||
"好 CJK UNIFIED IDEOGRAPH-597D\n",
|
||||
"き HIRAGANA LETTER KI\n",
|
||||
"な HIRAGANA LETTER NA\n",
|
||||
"事 CJK UNIFIED IDEOGRAPH-4E8B\n",
|
||||
"が HIRAGANA LETTER GA\n",
|
||||
"分 CJK UNIFIED IDEOGRAPH-5206\n",
|
||||
"か HIRAGANA LETTER KA\n",
|
||||
"っ HIRAGANA LETTER SMALL TU\n",
|
||||
"た HIRAGANA LETTER TA\n",
|
||||
"。 IDEOGRAPHIC FULL STOP\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('私は彼女がお茶を好きな事が分かった。')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
". FULL STOP\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
" SPACE\n",
|
||||
"چ ARABIC LETTER TCHEH\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
" SPACE\n",
|
||||
"ح ARABIC LETTER HAH\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"و ARABIC LETTER WAW\n",
|
||||
" SPACE\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
" SPACE\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ک ARABIC LETTER KEHEH\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
" SPACE\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
"و ARABIC LETTER WAW\n",
|
||||
" SPACE\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
" SPACE\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"ی ARABIC LETTER FARSI YEH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ت ARABIC LETTER TEH\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('. اما چند تا حرف تو فارسی هست که تو عربی نیست')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
". FULL STOP\n",
|
||||
" SPACE\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
" SPACE\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
" SPACE\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
"د ARABIC LETTER DAL\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ك ARABIC LETTER KAF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"م ARABIC LETTER MEEM\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ف ARABIC LETTER FEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"س ARABIC LETTER SEEN\n",
|
||||
"ي ARABIC LETTER YEH\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
" SPACE\n",
|
||||
"ه ARABIC LETTER HEH\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
" SPACE\n",
|
||||
"أ ARABIC LETTER ALEF WITH HAMZA ABOVE\n",
|
||||
"ن ARABIC LETTER NOON\n",
|
||||
"ت ARABIC LETTER TEH\n",
|
||||
" SPACE\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"غ ARABIC LETTER GHAIN\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
" SPACE\n",
|
||||
"ا ARABIC LETTER ALEF\n",
|
||||
"ل ARABIC LETTER LAM\n",
|
||||
"ع ARABIC LETTER AIN\n",
|
||||
"ر ARABIC LETTER REH\n",
|
||||
"ب ARABIC LETTER BEH\n",
|
||||
"ي ARABIC LETTER YEH\n",
|
||||
"ة ARABIC LETTER TEH MARBUTA\n",
|
||||
"؟ ARABIC QUESTION MARK\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('. لكن كم عدد الكلمات بالفارسية هل أنت باللغة العربية؟')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"H LATIN CAPITAL LETTER H\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"😀 GRINNING FACE\n",
|
||||
" SPACE\n",
|
||||
"W LATIN CAPITAL LETTER W\n",
|
||||
"😀 GRINNING FACE\n",
|
||||
"r LATIN SMALL LETTER R\n",
|
||||
"l LATIN SMALL LETTER L\n",
|
||||
"d LATIN SMALL LETTER D\n",
|
||||
"! EXCLAMATION MARK\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Hell😀 W😀rld!')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅ (multiple) LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING DOUBLE ACUTE ACCENT, COMBINING TILDE, COMBINING GRAVE ACCENT, COMBINING LEFT ANGLE ABOVE, COMBINING MACRON, COMBINING COMMA ABOVE, COMBINING DOUBLE OVERLINE, COMBINING NOT TILDE ABOVE, COMBINING DOUBLE MACRON BELOW, COMBINING GRAVE TONE MARK, COMBINING DOUBLE BREVE BELOW, COMBINING LONG STROKE OVERLAY, COMBINING DOUBLE MACRON BELOW, COMBINING LEFT HALF RING BELOW, COMBINING X BELOW, COMBINING CARON BELOW, COMBINING DOWN TACK BELOW, COMBINING DOUBLE RING BELOW, COMBINING ASTERISK BELOW, COMBINING BRIDGE BELOW, COMBINING TILDE BELOW, COMBINING X BELOW, COMBINING INVERTED BREVE BELOW, COMBINING LOW LINE, COMBINING UP TACK BELOW, COMBINING CARON BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE LOW LINE, COMBINING SEAGULL BELOW, COMBINING EQUALS SIGN BELOW, COMBINING GREEK YPOGEGRAMMENI\n",
|
||||
"ņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚ (multiple) LATIN SMALL LETTER N, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING OVERLINE, COMBINING GREEK DIALYTIKA TONOS, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER V, COMBINING LATIN SMALL LETTER A, COMBINING REVERSED COMMA ABOVE, COMBINING GRAVE ACCENT, COMBINING CARON, COMBINING GREEK PERISPOMENI, COMBINING MACRON, COMBINING BRIDGE ABOVE, COMBINING LEFT HALF RING ABOVE, COMBINING SHORT SOLIDUS OVERLAY, COMBINING CEDILLA, COMBINING LEFT ARROWHEAD BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING ACUTE ACCENT BELOW, COMBINING LEFT TACK BELOW, COMBINING MINUS SIGN BELOW, COMBINING COMMA BELOW, COMBINING COMMA BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING PLUS SIGN BELOW, COMBINING LEFT ANGLE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD BELOW, COMBINING CARON BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING RIGHT TACK BELOW, COMBINING LOW LINE, COMBINING LOW LINE\n",
|
||||
"i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟ (multiple) LATIN SMALL LETTER I, COMBINING VERTICAL LINE ABOVE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER U, COMBINING GRAVE ACCENT, COMBINING LATIN SMALL LETTER I, COMBINING LATIN SMALL LETTER T, COMBINING BREVE, COMBINING LATIN SMALL LETTER A, COMBINING HOOK ABOVE, COMBINING RIGHT ARROWHEAD ABOVE, COMBINING BRIDGE ABOVE, COMBINING RING ABOVE, COMBINING HOMOTHETIC ABOVE, COMBINING ZIGZAG ABOVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING FERMATA, COMBINING TILDE OVERLAY, COMBINING RETROFLEX HOOK BELOW, COMBINING DOUBLE MACRON BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING DOUBLE LOW LINE, COMBINING DOT BELOW, COMBINING RIGHT TACK BELOW, COMBINING RIGHT ARROWHEAD BELOW\n",
|
||||
"c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘ (multiple) LATIN SMALL LETTER C, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER V, COMBINING VERTICAL LINE ABOVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING LATIN SMALL LETTER H, COMBINING COMMA ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING LEFT ANGLE ABOVE, COMBINING CANDRABINDU, COMBINING GREEK DIALYTIKA TONOS, COMBINING OVERLINE, COMBINING LATIN SMALL LETTER X, COMBINING LATIN SMALL LETTER E, COMBINING DOT ABOVE RIGHT, COMBINING TILDE BELOW, COMBINING PLUS SIGN BELOW, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LOW LINE, COMBINING EQUALS SIGN BELOW, COMBINING INVERTED BRIDGE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING SEAGULL BELOW, COMBINING COMMA BELOW\n",
|
||||
"o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ (multiple) LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER A, COMBINING INVERTED BREVE, COMBINING CANDRABINDU, COMBINING LATIN SMALL LETTER M, COMBINING DIAERESIS, COMBINING MACRON, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER I, COMBINING GREEK KORONIS, COMBINING DOUBLE MACRON BELOW, COMBINING TILDE OVERLAY, COMBINING GRAPHEME JOINER, COMBINING DOT ABOVE RIGHT, COMBINING INVERTED DOUBLE ARCH BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING MINUS SIGN BELOW, COMBINING DIAERESIS BELOW, COMBINING LEFT HALF RING BELOW, COMBINING DIAERESIS BELOW, COMBINING RING BELOW\n",
|
||||
"ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕ (multiple) LATIN SMALL LETTER D, COMBINING LATIN SMALL LETTER U, COMBINING DOUBLE OVERLINE, COMBINING LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING DIAERESIS, COMBINING LEFT HALF RING ABOVE, COMBINING DOT ABOVE RIGHT, COMBINING COMMA ABOVE RIGHT, COMBINING HORN, COMBINING DOT BELOW, COMBINING RING BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING X BELOW, COMBINING BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING RIGHT HALF RING BELOW, COMBINING INVERTED BREVE BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW, COMBINING MACRON BELOW, COMBINING LEFT TACK BELOW, COMBINING ASTERISK BELOW, COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW\n",
|
||||
"e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞ (multiple) LATIN SMALL LETTER E, COMBINING HOMOTHETIC ABOVE, COMBINING TURNED COMMA ABOVE, COMBINING BREVE, COMBINING GREEK DIALYTIKA TONOS, COMBINING DOUBLE GRAVE ACCENT, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER R, COMBINING DOUBLE VERTICAL LINE ABOVE, COMBINING INVERTED BREVE, COMBINING DOT ABOVE, COMBINING VERTICAL TILDE, COMBINING BREVE, COMBINING GREEK KORONIS, COMBINING LATIN SMALL LETTER R, COMBINING REVERSED COMMA ABOVE, COMBINING CANDRABINDU, COMBINING VERTICAL TILDE, COMBINING LATIN SMALL LETTER T, COMBINING ACUTE TONE MARK, COMBINING HORN, COMBINING DOUBLE MACRON, COMBINING INVERTED BRIDGE BELOW, COMBINING DOUBLE VERTICAL LINE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING TILDE BELOW, COMBINING LEFT HALF RING BELOW, COMBINING GRAVE ACCENT BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE RING BELOW, COMBINING DOUBLE VERTICAL LINE BELOW\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Z LATIN CAPITAL LETTER Z\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"u LATIN SMALL LETTER U\n",
|
||||
"g LATIN SMALL LETTER G\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
"uͤ (multiple) LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E\n",
|
||||
"ß LATIN SMALL LETTER SHARP S\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Zeugnuͤß')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Z LATIN CAPITAL LETTER Z\n",
|
||||
"e LATIN SMALL LETTER E\n",
|
||||
"u LATIN SMALL LETTER U\n",
|
||||
"g LATIN SMALL LETTER G\n",
|
||||
"n LATIN SMALL LETTER N\n",
|
||||
" private use character 0xE72B\n",
|
||||
"ß LATIN SMALL LETTER SHARP S\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list_grapheme_clusters('Zeugnß')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"hide_input": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": true
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
182
src/dinglehopper/ocr_files.py
Normal file
182
src/dinglehopper/ocr_files.py
Normal file
|
@ -0,0 +1,182 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
from typing import Iterator
|
||||
from warnings import warn
|
||||
|
||||
from lxml import etree as ET
|
||||
from lxml.etree import XMLSyntaxError
|
||||
|
||||
from .extracted_text import ExtractedText, normalize_sbb
|
||||
|
||||
|
||||
def alto_namespace(tree: ET.ElementTree) -> str:
    """Return the namespace of the ALTO document held in the given ElementTree.

    The only check performed is that the root element has the local name
    "alto". Whether the file uses a valid ALTO namespace is not verified.

    Raises:
        ValueError: If the root element's local name is not "alto".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    if qualified_root.localname != "alto":
        raise ValueError("Not an ALTO tree")
    return qualified_root.namespace
|
||||
|
||||
|
||||
def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
    """Yield one ExtractedText for every ALTO TextLine found in the tree.

    The text of a line is built from the CONTENT attributes of its direct
    String children, joined by single spaces and passed through
    normalize_sbb(). The line's ID attribute (None if absent) is passed as
    the first ExtractedText argument.
    """
    namespaces = {"alto": alto_namespace(tree)}
    for text_line in tree.iterfind(".//alto:TextLine", namespaces=namespaces):
        segment_id = text_line.attrib.get("ID")
        contents = [
            string_el.attrib.get("CONTENT")
            for string_el in text_line.iterfind("alto:String", namespaces=namespaces)
        ]
        # FIXME hardcoded SBB normalization
        normalized = normalize_sbb(" ".join(contents))
        yield ExtractedText(segment_id, None, None, normalized)
|
||||
|
||||
|
||||
def alto_extract(tree: ET.ElementTree) -> ExtractedText:
    """Extract the text of the given ALTO ElementTree.

    All lines produced by alto_extract_lines() are collected and wrapped in a
    single ExtractedText that is given a newline as its joiner argument.
    """
    line_segments = list(alto_extract_lines(tree))
    return ExtractedText(None, line_segments, "\n", None)
|
||||
|
||||
|
||||
def alto_text(tree):
    """Return the ``text`` attribute of the ExtractedText built from an ALTO tree."""
    extracted = alto_extract(tree)
    return extracted.text
|
||||
|
||||
|
||||
def page_namespace(tree):
    """Return the namespace of the PAGE content document held in the given ElementTree.

    The only check performed is that the root element has the local name
    "PcGts". Whether the file uses a valid PAGE namespace is not verified.

    Raises:
        ValueError: If the root element's local name is not "PcGts".
    """
    qualified_root = ET.QName(tree.getroot().tag)
    if qualified_root.localname != "PcGts":
        raise ValueError("Not a PAGE tree")
    return qualified_root.namespace
|
||||
|
||||
|
||||
def page_extract(tree, *, textequiv_level="region"):
    """Extract text from the given PAGE content ElementTree.

    If the document contains a ReadingOrder element, its groups are walked to
    collect the regions; otherwise every TextRegion is taken in the order
    iterfind() yields them. Reading the text of a region is left to
    ExtractedText.from_text_segment(). Regions with empty text are dropped.
    """
    nsmap = {"page": page_namespace(tree)}

    reading_order = tree.find(".//page:ReadingOrder", namespaces=nsmap)
    if reading_order is None:
        # No reading order given: fall back to all TextRegions.
        regions = [
            ExtractedText.from_text_segment(
                region, nsmap, textequiv_level=textequiv_level
            )
            for region in tree.iterfind(".//page:TextRegion", namespaces=nsmap)
        ]
    else:
        regions = []
        for group in reading_order.iterfind("./*", namespaces=nsmap):
            regions += extract_texts_from_reading_order_group(
                group, tree, nsmap, textequiv_level
            )

    # Filter empty region texts
    non_empty_regions = [region for region in regions if region.text != ""]

    return ExtractedText(None, non_empty_regions, "\n", None)
|
||||
|
||||
|
||||
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
    """Recursively collect the texts of the TextRegions referenced by a ReadingOrder group.

    For ordered groups only children carrying an "index" attribute are kept,
    sorted by that index as an integer. For unordered groups the children are
    taken in document order. Nested groups are processed recursively; any
    other child is resolved through its "regionRef" attribute and silently
    skipped when no TextRegion with that id exists.

    Raises:
        NotImplementedError: If ``group`` is not one of the four known group elements.
    """
    ordered_tags = ("OrderedGroup", "OrderedGroupIndexed")
    unordered_tags = ("UnorderedGroup", "UnorderedGroupIndexed")

    group_kind = ET.QName(group.tag).localname
    if group_kind in ordered_tags:
        indexed_children = [
            child for child in group if "index" in child.attrib.keys()
        ]
        ro_children = sorted(
            indexed_children, key=lambda child: int(child.attrib["index"])
        )
    elif group_kind in unordered_tags:
        ro_children = list(group)
    else:
        raise NotImplementedError

    regions = []
    for ro_child in ro_children:
        if ET.QName(ro_child.tag).localname in ordered_tags + unordered_tags:
            # Nested group: descend into it.
            regions.extend(
                extract_texts_from_reading_order_group(
                    ro_child, tree, nsmap, textequiv_level
                )
            )
            continue

        region_id = ro_child.attrib["regionRef"]
        region = tree.find(
            './/page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
        )
        if region is None:
            continue  # Not a TextRegion
        regions.append(
            ExtractedText.from_text_segment(
                region, nsmap, textequiv_level=textequiv_level
            )
        )
    return regions
|
||||
|
||||
|
||||
def page_text(tree, *, textequiv_level="region"):
    """Return the ``text`` attribute of the ExtractedText built from a PAGE tree."""
    extracted = page_extract(tree, textequiv_level=textequiv_level)
    return extracted.text
|
||||
|
||||
|
||||
def plain_extract(filename, include_filename_in_id=False):
    """Read a plain text file and return it as an ExtractedText of per-line segments.

    Each line of the file (as yielded by the file object, so including its
    line ending) is passed through normalize_sbb() and wrapped in its own
    ExtractedText. Segment ids are "line <no>", numbered from 0, optionally
    prefixed with the file's base name when ``include_filename_in_id`` is true.
    """
    if include_filename_in_id:
        id_template = "{filename} - line {no}"
    else:
        id_template = "line {no}"

    with open(filename, "r") as f:
        line_segments = []
        for no, line in enumerate(f):
            segment_id = id_template.format(
                filename=os.path.basename(filename), no=no
            )
            # XXX hardcoded SBB normalization
            line_segments.append(
                ExtractedText(segment_id, None, None, normalize_sbb(line))
            )
        return ExtractedText(None, line_segments, "\n", None)
|
||||
|
||||
|
||||
def plain_text(filename):
    """Return the ``text`` attribute of the ExtractedText built from a plain text file."""
    extracted = plain_extract(filename)
    return extracted.text
|
||||
|
||||
|
||||
def extract(filename, *, textequiv_level="region"):
    """Extract the text from the given file.

    Supports PAGE, ALTO and falls back to plain text: a file that lxml cannot
    parse as XML is read as plain text, and a parsed tree for which
    page_extract() raises ValueError is handed to alto_extract().
    """
    try:
        tree = ET.parse(filename)
    except XMLSyntaxError:
        # Not well-formed XML, so treat the file as plain text.
        extracted = plain_extract(filename)
    else:
        try:
            extracted = page_extract(tree, textequiv_level=textequiv_level)
        except ValueError:
            extracted = alto_extract(tree)
    return extracted
|
||||
|
||||
|
||||
def text(filename):
    """Return the extracted text of the given file as returned by extract()."""
    extracted = extract(filename)
    return extracted.text
|
||||
|
||||
|
||||
# When run as a script, print the extracted text of the file named by the first argument.
if __name__ == "__main__":
    print(extract(sys.argv[1]).text)
|
36
src/dinglehopper/ocrd-tool.json
Normal file
36
src/dinglehopper/ocrd-tool.json
Normal file
|
@ -0,0 +1,36 @@
|
|||
{
|
||||
"version": "0.9.0",
|
||||
"git_url": "https://github.com/qurator-spk/dinglehopper",
|
||||
"tools": {
|
||||
"ocrd-dinglehopper": {
|
||||
"executable": "ocrd-dinglehopper",
|
||||
"description": "Evaluate OCR text against ground truth with dinglehopper",
|
||||
"input_file_grp": [
|
||||
"OCR-D-GT-PAGE",
|
||||
"OCR-D-OCR"
|
||||
],
|
||||
"output_file_grp": [
|
||||
"OCR-D-OCR-EVAL"
|
||||
],
|
||||
"categories": [
|
||||
"Quality assurance"
|
||||
],
|
||||
"steps": [
|
||||
"recognition/text-recognition"
|
||||
],
|
||||
"parameters": {
|
||||
"metrics": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable/disable metrics and green/red"
|
||||
},
|
||||
"textequiv_level": {
|
||||
"type": "string",
|
||||
"enum": ["region", "line"],
|
||||
"default": "region",
|
||||
"description": "PAGE XML hierarchy level to extract the text from"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
78
src/dinglehopper/ocrd_cli.py
Normal file
78
src/dinglehopper/ocrd_cli.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import click
|
||||
from ocrd import Processor
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
|
||||
from pkg_resources import resource_string
|
||||
|
||||
from .cli import process as cli_process
|
||||
|
||||
# Tool description parsed once at import time from the ocrd-tool.json resource of this package.
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
|
||||
|
||||
|
||||
# Command line entry point: forwards all arguments collected by the OCR-D CLI
# options to ocrd_cli_wrap_processor() together with the processor class.
@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
    processor_class = OcrdDinglehopperEvaluate
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
|
||||
|
||||
|
||||
# Processor that runs the dinglehopper CLI process function on each pair of
# GT/OCR input files and adds the resulting HTML and JSON reports to the workspace.
class OcrdDinglehopperEvaluate(Processor):
    def __init__(self, *args, **kwargs):
        # Always use the tool description from the bundled ocrd-tool.json.
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
        super().__init__(*args, **kwargs)

    def process(self):
        # Exactly two input groups (GT and OCR) and one output group are required.
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
        assert_file_grp_cardinality(self.output_file_grp, 1)

        log = getLogger("processor.OcrdDinglehopperEvaluate")

        report_options = {
            "metrics": self.parameter["metrics"],
            "textequiv_level": self.parameter["textequiv_level"],
        }
        # The group names are not used below; the unpacking still requires
        # exactly two comma-separated groups.
        gt_grp, ocr_grp = self.input_file_grp.split(",")

        report_types = (
            (".html", "text/html"),
            (".json", "application/json"),
        )

        file_pairs = self.zip_input_files(on_error="abort")
        for n, (gt_file, ocr_file) in enumerate(file_pairs):
            if not (gt_file and ocr_file):
                # file/page was not found in this group
                continue
            gt_file = self.workspace.download_file(gt_file)
            ocr_file = self.workspace.download_file(ocr_file)
            page_id = gt_file.pageId

            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

            file_id = make_file_id(ocr_file, self.output_file_grp)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files; the output directory may already exist.
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass
            cli_process(
                gt_file.local_filename,
                ocr_file.local_filename,
                report_prefix,
                **report_options,
            )

            # Add reports to the workspace
            for report_suffix, mimetype in report_types:
                self.workspace.add_file(
                    file_id=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    page_id=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix,
                )
|
||||
|
||||
|
||||
# Script entry point: invoke the click command when this module is executed directly.
if __name__ == "__main__":
|
||||
ocrd_dinglehopper()
|
110
src/dinglehopper/templates/report.html.j2
Normal file
110
src/dinglehopper/templates/report.html.j2
Normal file
|
@ -0,0 +1,110 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
{% if metrics %}
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
{% else %}
|
||||
.gt .diff, .ocr .diff {
|
||||
color: blue;
|
||||
}
|
||||
{% endif %}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
}
|
||||
.diff-highlight {
|
||||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
<div class="container">
|
||||
|
||||
{{ gt }}<br>
|
||||
{{ ocr }}
|
||||
|
||||
|
||||
{% if metrics %}
|
||||
<h2>Metrics</h2>
|
||||
<p>CER: {{ cer|round(4) }}</p>
|
||||
<p>WER: {{ wer|round(4) }}</p>
|
||||
{% endif %}
|
||||
|
||||
<h2>Character differences</h2>
|
||||
{{ char_diff_report }}
|
||||
|
||||
<h2>Word differences</h2>
|
||||
{{ word_diff_report }}
|
||||
|
||||
{%- if differences %}
|
||||
{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||
|
||||
<div class="row">
|
||||
{% for section in sections %}
|
||||
<div class="col-md-6">
|
||||
<h2>{{ section['title'] }}</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>GT</th>
|
||||
<th>OCR</th>
|
||||
<th>Occurrences</th>
|
||||
</tr>
</thead>
<tbody>
|
||||
{% for gt_ocr, occurrences in section['data'].items() %}
|
||||
<tr>
|
||||
<td>{{ gt_ocr.split("::")[0] }}</td>
|
||||
<td>{{ gt_ocr.split("::")[1] }}</td>
|
||||
<td>{{ occurrences }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
{% include 'report.html.js' %}
|
||||
</script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
39
src/dinglehopper/templates/report.html.js
Normal file
39
src/dinglehopper/templates/report.html.js
Normal file
|
@ -0,0 +1,39 @@
|
|||
// Return the jQuery set of all elements carrying the first class token in
// `classes` (a whitespace-separated class string) that matches /.diff\d.*/,
// i.e. any single character followed by "diff" and a digit.
// If no token matches, find() yields undefined and the selector is ".undefined".
function find_diff_class(classes) {
|
||||
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
|
||||
}
|
||||
|
||||
$(document).ready(function() {
|
||||
/* Enable Bootstrap tooltips */
|
||||
$('[data-toggle="tooltip"]').tooltip();
|
||||
|
||||
$('.diff').mouseover(function() {
|
||||
find_diff_class($(this).attr('class')).addClass('diff-highlight');
|
||||
});
|
||||
$('.diff').mouseout(function() {
|
||||
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
|
||||
});
|
||||
|
||||
/* Sort this column of the table */
|
||||
$('th').click(function () {
|
||||
var table = $(this).closest('table');
|
||||
var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
|
||||
this.asc = !this.asc;
|
||||
if (!this.asc) {
|
||||
rows = rows.reverse();
|
||||
}
|
||||
for (var i = 0; i < rows.length; i++) {
|
||||
table.children('tbody').append(rows[i]);
|
||||
}
|
||||
});
|
||||
|
||||
/* Build a comparator for Array.prototype.sort that orders two table rows by
   the lowercased text of their <td> at position `index`. The comparison uses
   localeCompare with numeric collation and 'base' sensitivity. */
function compareRows(index) {
|
||||
return function (row1, row2) {
|
||||
var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
|
||||
var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
|
||||
return cell1.localeCompare(cell2, undefined, {
|
||||
numeric: true,
|
||||
sensitivity: 'base'
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
16
src/dinglehopper/templates/report.json.j2
Normal file
16
src/dinglehopper/templates/report.json.j2
Normal file
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"gt": {{ gt|tojson }},
|
||||
"ocr": {{ ocr|tojson }},
|
||||
{% if metrics %}
|
||||
"cer": {{ cer|json_float }},
|
||||
"wer": {{ wer|json_float }},
|
||||
{% endif %}
|
||||
{% if differences %}
|
||||
"differences": {
|
||||
"character_level": {{ diff_c|tojson }},
|
||||
"word_level": {{ diff_w|tojson }}
|
||||
},
|
||||
{% endif %}
|
||||
"n_characters": {{ n_characters }},
|
||||
"n_words": {{ n_words }}
|
||||
}
|
136
src/dinglehopper/templates/summary.html.j2
Normal file
136
src/dinglehopper/templates/summary.html.j2
Normal file
|
@ -0,0 +1,136 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
{% if metrics %}
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
{% else %}
|
||||
.gt .diff, .ocr .diff {
|
||||
color: blue;
|
||||
}
|
||||
{% endif %}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
}
|
||||
.diff-highlight {
|
||||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.cer {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
tr:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
|
||||
td {
|
||||
min-width: 100px;
|
||||
}
|
||||
|
||||
td:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="container">
|
||||
|
||||
<div class="row">
|
||||
<h1>Summary of all reports</h1>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<p>Number of reports: {{ num_reports }}</p>
|
||||
</div>
|
||||
|
||||
{% if cer_avg and wer_avg -%}
|
||||
<div class="row">
|
||||
<h2>Metrics</h2>
|
||||
</div>
|
||||
|
||||
<div class="row cer">
|
||||
<p>Average CER: {{ cer_avg|round(4) }}</p>
|
||||
<p>Average WER: {{ wer_avg|round(4) }}</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{%- if diff_c and diff_w %}
|
||||
{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||
|
||||
<div class="row">
|
||||
{%- for section in sections %}
|
||||
<div class="col-md-6">
|
||||
<h2>{{ section['title'] }}</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
|
||||
</thead>
|
||||
{%- set num_omitted = namespace(value=0) -%}
|
||||
{% for gt_ocr, occurrences in section['data'].items() -%}
|
||||
{% if occurrences < occurrences_threshold -%}
|
||||
{%- set num_omitted.value = num_omitted.value + 1 %}
|
||||
{%- else -%}
|
||||
{%- set gt = gt_ocr.split(" :: ")[0] %}
|
||||
{%- set ocr = gt_ocr.split(" :: ")[1] %}
|
||||
<tr>
|
||||
<td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
|
||||
<td title="{{ ocr|urlencode }}">{{ ocr }}</td>
|
||||
<td>{{ occurrences }}</td>
|
||||
</tr>
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
|
||||
<p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
|
||||
{%- set num_omitted.value = 0 %}
|
||||
{%- endif %}
|
||||
</table>
|
||||
</div>
|
||||
{%- endfor %}
|
||||
</div>
|
||||
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
{% include 'report.html.js' %}
|
||||
</script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
15
src/dinglehopper/templates/summary.json.j2
Normal file
15
src/dinglehopper/templates/summary.json.j2
Normal file
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"num_reports": {{ num_reports}}
|
||||
{%- if cer_avg and wer_avg %}
|
||||
,
|
||||
"cer_avg": {{ cer_avg|json_float }},
|
||||
"wer_avg": {{ wer_avg|json_float }}
|
||||
{%- endif %}
|
||||
{%- if diff_c and diff_w %}
|
||||
,
|
||||
"differences": {
|
||||
"character_level": {{ diff_c|tojson }},
|
||||
"word_level": {{ diff_w|tojson }}
|
||||
}
|
||||
{%- endif %}
|
||||
}
|
0
src/dinglehopper/tests/__init__.py
Normal file
0
src/dinglehopper/tests/__init__.py
Normal file
BIN
src/dinglehopper/tests/data/00000119.tif
Normal file
BIN
src/dinglehopper/tests/data/00000119.tif
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
287
src/dinglehopper/tests/data/actevedef_718448162/mets.xml
Normal file
287
src/dinglehopper/tests/data/actevedef_718448162/mets.xml
Normal file
|
@ -0,0 +1,287 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
|
||||
<mets:metsHdr CREATEDATE="2017-08-22T14:23:38">
|
||||
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
|
||||
<mets:name>Goobi - UGH-1.11.1-v1.11.0-11-gbafb11b - 16−November−2015</mets:name>
|
||||
<mets:note>Goobi</mets:note>
|
||||
</mets:agent>
|
||||
</mets:metsHdr>
|
||||
<mets:dmdSec ID="DMDLOG_0000">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:location>
|
||||
<mods:physicalLocation authority="marcorg" displayLabel="Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Berlin, Germany">DE-1</mods:physicalLocation>
|
||||
<mods:shelfLocator>4" Fy 11178</mods:shelfLocator>
|
||||
</mods:location>
|
||||
<mods:originInfo eventType="publication">
|
||||
<mods:place>
|
||||
<mods:placeTerm type="text">Hanau</mods:placeTerm>
|
||||
</mods:place>
|
||||
<mods:dateIssued encoding="iso8601" keyDate="yes">1749</mods:dateIssued>
|
||||
</mods:originInfo>
|
||||
<mods:originInfo eventType="digitization">
|
||||
<mods:place>
|
||||
<mods:placeTerm type="text">Berlin</mods:placeTerm>
|
||||
</mods:place>
|
||||
<mods:dateCaptured encoding="iso8601">2012</mods:dateCaptured>
|
||||
<mods:publisher>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, Germany</mods:publisher>
|
||||
<mods:edition>[Electronic ed.]</mods:edition>
|
||||
</mods:originInfo>
|
||||
<mods:classification authority="ZVDD">Historische Drucke</mods:classification>
|
||||
<mods:classification authority="ZVDD">Rechtswissenschaft</mods:classification>
|
||||
<mods:classification authority="ZVDD">VD18 digital</mods:classification>
|
||||
<mods:recordInfo>
|
||||
<mods:recordIdentifier source="gbv-ppn">PPN718448162</mods:recordIdentifier>
|
||||
</mods:recordInfo>
|
||||
<mods:identifier type="purl">http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000</mods:identifier>
|
||||
<mods:identifier type="vd18">11750219</mods:identifier>
|
||||
<mods:identifier type="PPNanalog">PPN370506340</mods:identifier>
|
||||
<mods:titleInfo>
|
||||
<mods:title>Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti</mods:title>
|
||||
<mods:subTitle>Worinnen allen unpartheyischen Iustitiariis diese unverantwortliche Procedur und dabey gespielte listige Touren klärlich vor Augen gestellet werden</mods:subTitle>
|
||||
</mods:titleInfo>
|
||||
<mods:note type="source characteristics">P_Drucke_VD18</mods:note>
|
||||
<mods:note type="bibliography">VD18 11750219</mods:note>
|
||||
<mods:language>
|
||||
<mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
|
||||
</mods:language>
|
||||
<mods:relatedItem type="series">
|
||||
<mods:titleInfo>
|
||||
<mods:title>VD18 digital</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:relatedItem>
|
||||
<mods:name type="personal">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">asn</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart type="family">Senckenberg</mods:namePart>
|
||||
<mods:namePart type="given">Eraßmus</mods:namePart>
|
||||
<mods:displayForm>Senckenberg, Eraßmus</mods:displayForm>
|
||||
</mods:name>
|
||||
<mods:name type="personal">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">asn</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart type="family">Agricola</mods:namePart>
|
||||
<mods:namePart type="given">Catharina</mods:namePart>
|
||||
<mods:displayForm>Agricola, Catharina</mods:displayForm>
|
||||
</mods:name>
|
||||
<mods:name type="corporate">
|
||||
<mods:role>
|
||||
<mods:roleTerm authority="marcrelator" type="code">fnd</mods:roleTerm>
|
||||
</mods:role>
|
||||
<mods:namePart>Deutsche Forschungsgemeinschaft</mods:namePart>
|
||||
</mods:name>
|
||||
<mods:physicalDescription>
|
||||
<mods:digitalOrigin>reformatted digital</mods:digitalOrigin>
|
||||
<mods:extent>44 S.</mods:extent>
|
||||
<mods:extent>2°</mods:extent>
|
||||
</mods:physicalDescription>
|
||||
<mods:extension>
|
||||
<zvdd:zvddWrap xmlns:zvdd="http://zvdd.gdz-cms.de/">
|
||||
<zvdd:titleWord>Aktenmäßiger Verlauf famosen Prozesses Hofrat Erasmus Rats Frankfurt Justitiariis</zvdd:titleWord>
|
||||
</zvdd:zvddWrap>
|
||||
</mods:extension>
|
||||
<mods:accessCondition type="use and reproduction">CC BY-NC-SA 4.0 International</mods:accessCondition>
|
||||
<mods:typeOfResource>text</mods:typeOfResource>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:dmdSec ID="DMDLOG_0001">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:titleInfo>
|
||||
<mods:title>Ursachen so diesen Druck veranlasset</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:dmdSec ID="DMDLOG_0002">
|
||||
<mets:mdWrap MDTYPE="MODS">
|
||||
<mets:xmlData>
|
||||
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
|
||||
<mods:titleInfo>
|
||||
<mods:title>Endlich Abgetrungene Rechtliche Interims-Defensions-Schrifft ...</mods:title>
|
||||
</mods:titleInfo>
|
||||
</mods:mods>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:dmdSec>
|
||||
<mets:amdSec ID="AMD">
|
||||
<mets:rightsMD ID="RIGHTS">
|
||||
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVRIGHTS">
|
||||
<mets:xmlData>
|
||||
<dv:rights xmlns:dv="http://dfg-viewer.de/">
|
||||
<dv:owner>Staatsbibliothek zu Berlin - Preußischer Kulturbesitz</dv:owner>
|
||||
<dv:ownerLogo>http://resolver.staatsbibliothek-berlin.de/SBB0000000100000000</dv:ownerLogo>
|
||||
<dv:ownerSiteURL>http://www.staatsbibliothek-berlin.de</dv:ownerSiteURL>
|
||||
<dv:ownerContact>mailto:info@sbb.spk-berlin.de</dv:ownerContact>
|
||||
</dv:rights>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:rightsMD>
|
||||
<mets:digiprovMD ID="DIGIPROV">
|
||||
<mets:mdWrap MDTYPE="OTHER" MIMETYPE="text/xml" OTHERMDTYPE="DVLINKS">
|
||||
<mets:xmlData>
|
||||
<dv:links xmlns:dv="http://dfg-viewer.de/">
|
||||
<dv:reference>http://www.stabikat.de/DB=1/PPN?PPN=718448162 </dv:reference>
|
||||
<dv:presentation>http://digital.staatsbibliothek-berlin.de/dms/werkansicht/?PPN=PPN718448162</dv:presentation>
|
||||
</dv:links>
|
||||
</mets:xmlData>
|
||||
</mets:mdWrap>
|
||||
</mets:digiprovMD>
|
||||
</mets:amdSec>
|
||||
<mets:fileSec>
|
||||
<mets:fileGrp USE="OCR-D-GT-PAGE">
|
||||
<mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
<mets:fileGrp USE="OCR-D-OCR-CALAMARI">
|
||||
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
<mets:fileGrp USE="OCR-D-OCR-TESS">
|
||||
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
|
||||
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
|
||||
</mets:file>
|
||||
</mets:fileGrp>
|
||||
</mets:fileSec>
|
||||
<mets:structMap TYPE="LOGICAL">
|
||||
<mets:div ADMID="AMD" CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000" DMDID="DMDLOG_0000" ID="LOG_0000" LABEL="Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti" ORDERLABEL="Acten-mäßiger Verlauff, Des Fameusen Processus sich verhaltende zwischen Herrn Hoff-Rath Eraßmus Senckenberg des Raths zu Franckfurt An einem und der Unschuldigen Catharina Agricola, am andern Theil puncto stupri violenti" TYPE="monograph">
|
||||
<mets:div ID="LOG_0001" TYPE="binding">
|
||||
<mets:div ID="LOG_0002" TYPE="cover_front"/>
|
||||
</mets:div>
|
||||
<mets:div ID="LOG_0003" TYPE="title_page"/>
|
||||
<mets:div DMDID="DMDLOG_0001" ID="LOG_0004" LABEL="Ursachen so diesen Druck veranlasset" TYPE="section"/>
|
||||
<mets:div DMDID="DMDLOG_0002" ID="LOG_0005" LABEL="Endlich Abgetrungene Rechtliche Interims-Defensions-Schrifft ..." TYPE="section"/>
|
||||
<mets:div ID="LOG_0006" TYPE="binding">
|
||||
<mets:div ID="LOG_0007" TYPE="cover_back"/>
|
||||
</mets:div>
|
||||
</mets:div>
|
||||
</mets:structMap>
|
||||
<mets:structMap TYPE="PHYSICAL">
|
||||
<mets:div CONTENTIDS="http://resolver.staatsbibliothek-berlin.de/SBB00008F1000000000" DMDID="DMDPHYS_0000" ID="PHYS_0000" TYPE="physSequence">
|
||||
<mets:div TYPE="page" ID="00000024">
|
||||
<mets:fptr FILEID="OCR-D-GT-PAGE_00000024"/>
|
||||
<mets:fptr FILEID="OCR-D-OCR-CALAMARI_0001"/>
|
||||
<mets:fptr FILEID="OCR-D-OCR-TESS_0001"/>
|
||||
</mets:div>
|
||||
</mets:div>
|
||||
</mets:structMap>
|
||||
<mets:structLink>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0002" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0003" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0004" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0005" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0006" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0007" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0009" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0010" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0011" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0012" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0013" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0014" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0015" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0016" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0017" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0018" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0019" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0020" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0021" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0022" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0023" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0024" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0025" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0026" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0027" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0028" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0029" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0030" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0031" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0032" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0033" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0034" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0035" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0036" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0037" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0038" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0039" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0040" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0041" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0042" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0043" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0044" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0045" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0046" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0047" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0048" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0049" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0050" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0051" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0053" xlink:from="LOG_0000"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0002" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0003" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0004" xlink:from="LOG_0001"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0001" xlink:from="LOG_0002"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0005" xlink:from="LOG_0003"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0006" xlink:from="LOG_0003"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0007" xlink:from="LOG_0004"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0004"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0008" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0009" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0010" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0011" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0012" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0013" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0014" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0015" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0016" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0017" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0018" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0019" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0020" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0021" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0022" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0023" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0024" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0025" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0026" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0027" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0028" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0029" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0030" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0031" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0032" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0033" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0034" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0035" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0036" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0037" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0038" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0039" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0040" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0041" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0042" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0043" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0044" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0045" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0046" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0047" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0048" xlink:from="LOG_0005"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0049" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0050" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0051" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0006"/>
|
||||
<mets:smLink xmlns:xlink="http://www.w3.org/1999/xlink" xlink:to="PHYS_0052" xlink:from="LOG_0007"/>
|
||||
</mets:structLink>
|
||||
</mets:mets>
|
File diff suppressed because it is too large
Load diff
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
File diff suppressed because it is too large
Load diff
5610
src/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
Normal file
5610
src/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,289 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
|
||||
<Metadata>
|
||||
<Creator>OCR-D/core 1.0.0b11</Creator>
|
||||
<Created>2019-08-01T15:03:17.741679</Created>
|
||||
<LastChange>2019-08-01T15:03:17.741679</LastChange>
|
||||
<MetadataItem type="processingStep" name="recognition/text-recognition" value="ocrd-tesserocr-recognize">
|
||||
<Labels>
|
||||
<Label value="frk" type="model"/>
|
||||
<Label value="line" type="textequiv_level"/>
|
||||
<Label value="False" type="overwrite_words"/>
|
||||
</Labels>
|
||||
</MetadataItem>
|
||||
</Metadata>
|
||||
<Page imageFilename="../OCR-D-IMG-BIN/OCR-D-IMG-BIN_0002" imageWidth="1386" imageHeight="2372">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="reading-order">
|
||||
<RegionRefIndexed index="0" regionRef="region0000"/>
|
||||
<RegionRefIndexed index="1" regionRef="region0001"/>
|
||||
<RegionRefIndexed index="2" regionRef="region0002"/>
|
||||
<RegionRefIndexed index="3" regionRef="region0003"/>
|
||||
<RegionRefIndexed index="4" regionRef="region0004"/>
|
||||
<RegionRefIndexed index="5" regionRef="region0005"/>
|
||||
<RegionRefIndexed index="6" regionRef="region0006"/>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="region0000">
|
||||
<Coords points="488,133 1197,133 1197,193 488,193"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0001">
|
||||
<Coords points="40,221 1198,221 1198,626 40,626"/>
|
||||
<TextLine id="region0001_line0000">
|
||||
<Coords points="40,221 1198,221 1198,281 40,281"/>
|
||||
<TextEquiv conf="0.86">
|
||||
<Unicode>Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0001">
|
||||
<Coords points="768,290 879,290 879,325 768,325"/>
|
||||
<TextEquiv conf="0.62">
|
||||
<Unicode>„Bellen</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0002">
|
||||
<Coords points="86,337 1174,337 1174,396 86,396"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0003">
|
||||
<Coords points="88,397 841,397 841,455 88,455"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Sieht man ein ſolch gemähtes Feld - von oben,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0004">
|
||||
<Coords points="87,455 1142,455 1142,510 87,510"/>
|
||||
<TextEquiv conf="0.92">
|
||||
<Unicode>Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0005">
|
||||
<Coords points="87,510 1153,510 1153,570 87,570"/>
|
||||
<TextEquiv conf="0.85">
|
||||
<Unicode>Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0001_line0006">
|
||||
<Coords points="88,569 1161,569 1161,626 88,626"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Die ſcheinen uns bald kleine Hügel - bald Hütten x Zelten und bald
|
||||
„Bellen
|
||||
Den Blicken , welche ſie durchlaufen , von weiten öfters vorzuſtellen,
|
||||
Sieht man ein ſolch gemähtes Feld - von oben,
|
||||
Sy gleicht es einem weiten Meer - worauf erhabne Wellen kobeny
|
||||
Jedoch mit dieſem Unterſcheid - daß, da ſich die beſtändig rühren:
|
||||
Von einiger Bewegung hier - in dieſen Wellen ; nichts zu ſpähren,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0002">
|
||||
<Coords points="517,670 745,670 745,716 517,716"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0003">
|
||||
<Coords points="243,739 1124,739 1124,1094 243,1094"/>
|
||||
<TextLine id="region0003_line0000">
|
||||
<Coords points="243,739 884,739 884,795 243,795"/>
|
||||
<TextEquiv conf="0.83">
|
||||
<Unicode>Was erhebt des Schöpfers Güte</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0001">
|
||||
<Coords points="244,792 972,792 972,859 244,859"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Mehr , als dieſes Seegens Meer?</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0002">
|
||||
<Coords points="243,855 931,855 931,913 243,913"/>
|
||||
<TextEquiv conf="0.83">
|
||||
<Unicode>Kommt dies wohl von ungefehv?</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0003">
|
||||
<Coords points="244,914 918,914 918,971 244,971"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Nein , rüſt mein erfreut Gemühte</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0004">
|
||||
<Coords points="245,972 1059,972 1059,1034 245,1034"/>
|
||||
<TextEquiv conf="0.86">
|
||||
<Unicode>Nur von GOTT komint alles hers</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0003_line0005">
|
||||
<Coords points="247,1029 1124,1029 1124,1094 247,1094"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>Ihm ſey Preiß und Dan und Ehr!</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Was erhebt des Schöpfers Güte
|
||||
Mehr , als dieſes Seegens Meer?
|
||||
Kommt dies wohl von ungefehv?
|
||||
Nein , rüſt mein erfreut Gemühte
|
||||
Nur von GOTT komint alles hers
|
||||
Ihm ſey Preiß und Dan und Ehr!</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0004">
|
||||
<Coords points="1043,1096 1204,1096 1204,1136 1043,1136"/>
|
||||
<TextLine id="region0004_line0000">
|
||||
<Coords points="1043,1096 1204,1096 1204,1136 1043,1136"/>
|
||||
<TextEquiv conf="0.8">
|
||||
<Unicode>Da Capo,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Da Capo,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0005">
|
||||
<Coords points="68,1183 1236,1183 1236,2056 68,2056"/>
|
||||
<TextLine id="region0005_line0000">
|
||||
<Coords points="91,1183 1170,1183 1170,1235 91,1235"/>
|
||||
<TextEquiv conf="0.65">
|
||||
<Unicode>Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0001">
|
||||
<Coords points="89,1236 1182,1236 1182,1289 89,1289"/>
|
||||
<TextEquiv conf="0.73">
|
||||
<Unicode>Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0002">
|
||||
<Coords points="89,1294 1208,1294 1208,1346 89,1346"/>
|
||||
<TextEquiv conf="0.85">
|
||||
<Unicode>Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0003">
|
||||
<Coords points="90,1351 519,1351 519,1399 90,1399"/>
|
||||
<TextEquiv conf="0.92">
|
||||
<Unicode>Und eine neue Farben Zier</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0004">
|
||||
<Coords points="91,1405 561,1405 561,1457 91,1457"/>
|
||||
<TextEquiv conf="0.91">
|
||||
<Unicode>Den erſt gemähten Aker ein,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0005">
|
||||
<Coords points="92,1459 1208,1459 1208,1510 92,1510"/>
|
||||
<TextEquiv conf="0.88">
|
||||
<Unicode>Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son-</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0006">
|
||||
<Coords points="782,1514 1007,1514 1007,1555 782,1555"/>
|
||||
<TextEquiv conf="0.46">
|
||||
<Unicode>nen B;Of</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0007">
|
||||
<Coords points="68,1562 1177,1562 1177,1617 68,1617"/>
|
||||
<TextEquiv conf="0.82">
|
||||
<Unicode>Un ihre runde glatte Röhren , zumahlen früh und Abends bricht;</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0008">
|
||||
<Coords points="90,1618 1236,1618 1236,1670 90,1670"/>
|
||||
<TextEquiv conf="0.79">
|
||||
<Unicode>So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0009">
|
||||
<Coords points="777,1671 1159,1671 1159,1716 777,1716"/>
|
||||
<TextEquiv conf="0.76">
|
||||
<Unicode>Gemiſche, |</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0010">
|
||||
<Coords points="92,1722 1211,1722 1211,1783 92,1783"/>
|
||||
<TextEquiv conf="0.7">
|
||||
<Unicode>Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0011">
|
||||
<Coords points="91,1779 1210,1779 1210,1837 91,1837"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0012">
|
||||
<Coords points="93,1837 1210,1837 1210,1895 93,1895"/>
|
||||
<TextEquiv conf="0.84">
|
||||
<Unicode>Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0013">
|
||||
<Coords points="800,1896 914,1896 914,1936 800,1936"/>
|
||||
<TextEquiv conf="0.52">
|
||||
<Unicode>Wieder;</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0014">
|
||||
<Coords points="92,1943 1212,1943 1212,2001 92,2001"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0005_line0015">
|
||||
<Coords points="90,1998 1125,1998 1125,2056 90,2056"/>
|
||||
<TextEquiv conf="0.76">
|
||||
<Unicode>Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>Geht man auf einen ſolhen Felde, ſo eben erſi gemäht - ſpaßtiereny
|
||||
Das man gewohnt voll Korn zu ſehn; ſo kommen wir uns gröſſer für,
|
||||
Das Feld hingegen niedriger. Auch nimmt ſodean ein neuer Scheinz
|
||||
Und eine neue Farben Zier
|
||||
Den erſt gemähten Aker ein,
|
||||
Der Grund iſt grün - die Stoppeln gelb und wenn fich unjrer Son-
|
||||
nen B;Of
|
||||
Un ihre runde glatte Röhren , zumahlen früh und Abends bricht;
|
||||
So kann ein Gold kaum ſtärcker glänßen.- Dies macht ein liebliches
|
||||
Gemiſche, |
|
||||
Zutnahl wenn , in der Nachbarſchaft - ein dumfel-grünendes Gebüſche
|
||||
Den gelben Schimmer noch erhöht. Wir ich nun jüngſt, zur Abend Zeif,
|
||||
Durch ſo viel ſhwere Scegens-Berge, mit ſanften Schritten, hin und
|
||||
Wieder;
|
||||
Gepühret durch des Feldes Schmu, gerühret durc< die Fruchtbarkeitz
|
||||
Vergmigt auf meinem Acker gieng - ertönten dieſe meine Lieder:</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="region0006">
|
||||
<Coords points="688,2060 1216,2060 1216,2120 688,2120"/>
|
||||
<TextLine id="region0006_line0000">
|
||||
<Coords points="688,2069 787,2069 787,2120 688,2120"/>
|
||||
<TextEquiv conf="0.74">
|
||||
<Unicode>5) 2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="region0006_line0001">
|
||||
<Coords points="1044,2060 1216,2060 1216,2105 1044,2105"/>
|
||||
<TextEquiv conf="0.89">
|
||||
<Unicode>ARIA.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5) 2
|
||||
ARIA.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
3394
src/dinglehopper/tests/data/directory-test/gt/1.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/gt/1.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/gt/2.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/gt/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/1.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/1.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
File diff suppressed because it is too large
Load diff
382
src/dinglehopper/tests/data/levels-are-different.page.xml
Normal file
382
src/dinglehopper/tests/data/levels-are-different.page.xml
Normal file
|
@ -0,0 +1,382 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd" pcGtsId="id__00000083">
|
||||
<Metadata>
|
||||
<Creator>doculibtopagexml</Creator>
|
||||
<Created>2018-11-20T05:00:14</Created>
|
||||
<LastChange>2019-04-17T10:47:36</LastChange></Metadata>
|
||||
<Page imageFilename="00000083.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="1275" imageHeight="2032" type="content" readingDirection="left-to-right" textLineOrder="top-to-bottom" primaryLanguage="German">
|
||||
<PrintSpace>
|
||||
<Coords points="45,93 45,1916 1035,1916 1035,93"/></PrintSpace>
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="ro357564684568544579089">
|
||||
<RegionRefIndexed regionRef="r1070" index="3"/>
|
||||
</OrderedGroup></ReadingOrder>
|
||||
<TextRegion id="r1070" readingDirection="left-to-right" textLineOrder="top-to-bottom" type="paragraph" align="justify" primaryLanguage="German">
|
||||
<Coords points="190,1215 209,1215 209,1255 376,1255 376,1252 465,1252 465,1251 484,1251 484,1255 588,1255 588,1254 636,1254 636,1249 677,1249 677,1224 690,1224 690,1250 760,1250 760,1251 921,1251 921,1264 964,1264 964,1340 966,1340 966,1384 968,1384 968,1429 969,1429 969,1562 972,1562 972,1636 973,1636 973,1698 975,1698 975,1725 944,1725 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 690,1790 690,1795 674,1795 674,1785 564,1785 564,1786 519,1786 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802 422,1790 299,1790 299,1795 280,1795 280,1804 228,1804 228,1798 112,1798 112,1659 111,1659 111,1481 109,1481 109,1359 111,1359 111,1264 112,1264 112,1263 113,1263 113,1262 114,1262 114,1260 115,1260 115,1256 116,1256 116,1254 117,1254 117,1253 118,1253 118,1252 119,1252 119,1250 120,1250 120,1249 121,1249 121,1248 122,1248 122,1247 123,1247 123,1246 124,1246 124,1245 125,1245 125,1244 126,1244 126,1243 127,1243 127,1242 128,1242 128,1241 129,1241 129,1240 134,1240 134,1239 139,1239 139,1238 190,1238"/>
|
||||
<TextLine id="l1071">
|
||||
<Coords points="112,1732 280,1732 280,1748 391,1748 391,1753 596,1753 596,1744 635,1744 635,1745 690,1745 690,1748 806,1748 806,1751 858,1751 858,1752 898,1752 898,1762 929,1762 929,1776 944,1776 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 690,1790 690,1795 674,1795 674,1785 564,1785 564,1786 519,1786 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802 422,1790 299,1790 299,1795 280,1795 280,1804 228,1804 228,1798 112,1798"/>
|
||||
<Word id="w1072">
|
||||
<Coords points="112,1732 146,1732 146,1747 206,1747 206,1773 211,1773 211,1774 213,1774 213,1775 214,1775 214,1779 213,1779 213,1781 212,1781 212,1783 211,1783 211,1785 210,1785 210,1786 209,1786 209,1787 208,1787 208,1788 206,1788 206,1789 146,1789 146,1798 112,1798"/>
|
||||
<Glyph id="c1073">
|
||||
<Coords points="112,1732 146,1732 146,1798 112,1798"/>
|
||||
<TextEquiv conf="0.91338">
|
||||
<Unicode>H</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1074">
|
||||
<Coords points="149,1748 165,1748 165,1776 149,1776"/>
|
||||
<TextEquiv conf="0.61335">
|
||||
<Unicode>a</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1075">
|
||||
<Coords points="167,1750 186,1750 186,1783 167,1783"/>
|
||||
<TextEquiv conf="0.69192">
|
||||
<Unicode>n</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1076">
|
||||
<Coords points="187,1747 206,1747 206,1781 187,1781"/>
|
||||
<TextEquiv conf="0.72500">
|
||||
<Unicode>d</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1205">
|
||||
<Coords points="211,1774 213,1774 213,1775 214,1775 214,1779 213,1779 213,1781 212,1781 212,1783 211,1783 211,1785 210,1785 210,1786 209,1786 209,1787 208,1787 208,1788 206,1788 206,1789 206,1788 205,1788 205,1782 206,1782 206,1780 207,1780 207,1778 210,1778 210,1774 211,1774 211,1773"/>
|
||||
<TextEquiv>
|
||||
<Unicode>,</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.61335">
|
||||
<Unicode>Hand,</Unicode></TextEquiv></Word>
|
||||
<Word id="w1077">
|
||||
<Coords points="228,1732 280,1732 280,1748 391,1748 391,1790 299,1790 299,1795 280,1795 280,1804 228,1804"/>
|
||||
<Glyph id="c1078">
|
||||
<Coords points="228,1732 280,1732 280,1804 228,1804"/>
|
||||
<TextEquiv conf="0.87457">
|
||||
<Unicode>M</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1079">
|
||||
<Coords points="282,1759 299,1759 299,1795 282,1795"/>
|
||||
<TextEquiv conf="0.76524">
|
||||
<Unicode>y</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1080">
|
||||
<Coords points="301,1753 311,1753 311,1788 301,1788"/>
|
||||
<TextEquiv conf="0.86902">
|
||||
<Unicode>l</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1081">
|
||||
<Coords points="313,1761 330,1761 330,1788 313,1788"/>
|
||||
<TextEquiv conf="0.85741">
|
||||
<Unicode>o</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1082">
|
||||
<Coords points="332,1762 345,1762 345,1790 332,1790"/>
|
||||
<TextEquiv conf="0.82725">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1083">
|
||||
<Coords points="347,1756 364,1756 364,1789 347,1789"/>
|
||||
<TextEquiv conf="0.84884">
|
||||
<Unicode>d</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1084">
|
||||
<Coords points="373,1748 391,1748 391,1790 373,1790"/>
|
||||
<TextEquiv conf="0.81744">
|
||||
<Unicode>?</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.76524">
|
||||
<Unicode>Mylord?</Unicode></TextEquiv></Word>
|
||||
<Word id="w1085">
|
||||
<Coords points="422,1753 438,1753 438,1757 503,1757 503,1762 519,1762 519,1788 491,1788 491,1800 438,1800 438,1802 422,1802"/>
|
||||
<Glyph id="c1086">
|
||||
<Coords points="422,1753 438,1753 438,1802 422,1802"/>
|
||||
<TextEquiv conf="0.82658">
|
||||
<Unicode>f</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1087">
|
||||
<Coords points="436,1763 450,1763 450,1790 436,1790"/>
|
||||
<TextEquiv conf="0.83664">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1088">
|
||||
<Coords points="451,1761 468,1761 468,1788 451,1788"/>
|
||||
<TextEquiv conf="0.74675">
|
||||
<Unicode>a</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1089">
|
||||
<Coords points="472,1762 491,1762 491,1800 472,1800"/>
|
||||
<TextEquiv conf="0.83707">
|
||||
<Unicode>g</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1090">
|
||||
<Coords points="492,1757 503,1757 503,1788 492,1788"/>
|
||||
<TextEquiv conf="0.79790">
|
||||
<Unicode>t</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1091">
|
||||
<Coords points="505,1762 519,1762 519,1788 505,1788"/>
|
||||
<TextEquiv conf="0.88885">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.74675">
|
||||
<Unicode>fragte</Unicode></TextEquiv></Word>
|
||||
<Word id="w1092">
|
||||
<Coords points="531,1753 549,1753 549,1757 579,1757 579,1785 564,1785 564,1786 531,1786"/>
|
||||
<Glyph id="c1093">
|
||||
<Coords points="531,1753 549,1753 549,1786 531,1786"/>
|
||||
<TextEquiv conf="0.84252">
|
||||
<Unicode>d</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1094">
|
||||
<Coords points="550,1759 564,1759 564,1786 550,1786"/>
|
||||
<TextEquiv conf="0.88588">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1095">
|
||||
<Coords points="566,1757 579,1757 579,1785 566,1785"/>
|
||||
<TextEquiv conf="0.83230">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.83230">
|
||||
<Unicode>der</Unicode></TextEquiv></Word>
|
||||
<Word id="w1096">
|
||||
<Coords points="596,1744 635,1744 635,1745 690,1745 690,1795 674,1795 674,1785 596,1785"/>
|
||||
<Glyph id="c1097">
|
||||
<Coords points="596,1744 635,1744 635,1785 596,1785"/>
|
||||
<TextEquiv conf="0.80936">
|
||||
<Unicode>G</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1098">
|
||||
<Coords points="637,1755 651,1755 651,1783 637,1783"/>
|
||||
<TextEquiv conf="0.78064">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1099">
|
||||
<Coords points="652,1754 671,1754 671,1784 652,1784"/>
|
||||
<TextEquiv conf="0.79657">
|
||||
<Unicode>a</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1100">
|
||||
<Coords points="674,1745 690,1745 690,1795 674,1795"/>
|
||||
<TextEquiv conf="0.85403">
|
||||
<Unicode>f</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.78064">
|
||||
<Unicode>Graf</Unicode></TextEquiv></Word>
|
||||
<Word id="w1101">
|
||||
<Coords points="697,1755 716,1755 716,1758 757,1758 757,1787 737,1787 737,1786 719,1786 719,1785 697,1785"/>
|
||||
<Glyph id="c1102">
|
||||
<Coords points="697,1755 716,1755 716,1785 697,1785"/>
|
||||
<TextEquiv conf="0.84576">
|
||||
<Unicode>v</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1103">
|
||||
<Coords points="719,1758 735,1758 735,1786 719,1786"/>
|
||||
<TextEquiv conf="0.89206">
|
||||
<Unicode>o</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1104">
|
||||
<Coords points="737,1758 757,1758 757,1787 737,1787"/>
|
||||
<TextEquiv conf="0.85889">
|
||||
<Unicode>n</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.84576">
|
||||
<Unicode>von</Unicode></TextEquiv></Word>
|
||||
<Word id="w1105">
|
||||
<Coords points="768,1748 806,1748 806,1751 858,1751 858,1752 898,1752 898,1762 929,1762 929,1776 944,1776 944,1788 929,1788 929,1789 898,1789 898,1799 875,1799 875,1797 826,1797 826,1790 768,1790"/>
|
||||
<Glyph id="c1106">
|
||||
<Coords points="768,1748 806,1748 806,1790 768,1790"/>
|
||||
<TextEquiv conf="0.81040">
|
||||
<Unicode>R</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1107">
|
||||
<Coords points="808,1761 825,1761 825,1789 808,1789"/>
|
||||
<TextEquiv conf="0.85909">
|
||||
<Unicode>o</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1108">
|
||||
<Coords points="826,1751 858,1751 858,1797 826,1797"/>
|
||||
<TextEquiv conf="0.83404">
|
||||
<Unicode></Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1109">
|
||||
<Coords points="860,1763 873,1763 873,1790 860,1790"/>
|
||||
<TextEquiv conf="0.85515">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1110">
|
||||
<Coords points="875,1752 898,1752 898,1799 875,1799"/>
|
||||
<TextEquiv conf="0.89503">
|
||||
<Unicode></Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1111">
|
||||
<Coords points="899,1762 913,1762 913,1789 899,1789"/>
|
||||
<TextEquiv conf="0.87816">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1112">
|
||||
<Coords points="911,1762 929,1762 929,1789 911,1789"/>
|
||||
<TextEquiv conf="0.73941">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1113">
|
||||
<Coords points="934,1776 944,1776 944,1788 934,1788"/>
|
||||
<TextEquiv conf="0.69111">
|
||||
<Unicode>.</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.69111">
|
||||
<Unicode>Roeer.</Unicode></TextEquiv></Word>
|
||||
<TextEquiv conf="0.70871">
|
||||
<Unicode>Hand, Mylord? fragte der Graf von Roeer.</Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l766">
|
||||
<Coords points="109,1359 139,1359 139,1367 168,1367 168,1379 364,1379 364,1378 418,1378 418,1377 428,1377 428,1379 558,1379 558,1374 643,1374 643,1373 661,1373 661,1374 822,1374 822,1372 864,1372 864,1374 898,1374 898,1383 955,1383 955,1384 968,1384 968,1406 955,1406 955,1410 876,1410 876,1411 864,1411 864,1413 722,1413 722,1418 661,1418 661,1421 643,1421 643,1412 373,1412 373,1413 340,1413 340,1414 310,1414 310,1413 241,1413 241,1411 203,1411 203,1410 187,1410 187,1406 149,1406 149,1404 139,1404 139,1402 109,1402"/>
|
||||
<Word id="w769">
|
||||
<Coords points="109,1359 139,1359 139,1367 168,1367 168,1406 149,1406 149,1404 139,1404 139,1402 109,1402"/>
|
||||
<Glyph id="c770">
|
||||
<Coords points="109,1359 139,1359 139,1402 109,1402"/>
|
||||
<TextEquiv conf="0.70756">
|
||||
<Unicode>A</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c771">
|
||||
<Coords points="139,1369 149,1369 149,1404 139,1404"/>
|
||||
<TextEquiv conf="0.76907">
|
||||
<Unicode>l</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c772">
|
||||
<Coords points="149,1367 168,1367 168,1406 149,1406"/>
|
||||
<TextEquiv conf="0.68295">
|
||||
<Unicode>s</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.68295">
|
||||
<Unicode>Als</Unicode></TextEquiv></Word>
|
||||
<Word id="w773">
|
||||
<Coords points="187,1384 201,1384 201,1386 218,1386 218,1411 203,1411 203,1410 187,1410"/>
|
||||
<Glyph id="c774">
|
||||
<Coords points="187,1384 201,1384 201,1410 187,1410"/>
|
||||
<TextEquiv conf="0.83952">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c775">
|
||||
<Coords points="203,1386 218,1386 218,1411 203,1411"/>
|
||||
<TextEquiv conf="0.81121">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.81121">
|
||||
<Unicode>er</Unicode></TextEquiv></Word>
|
||||
<Word id="w776">
|
||||
<Coords points="364,1378 373,1378 373,1381 393,1381 393,1412 373,1412 373,1413 340,1413 340,1414 310,1414 310,1413 241,1413 241,1388 258,1388 258,1379 364,1379"/>
|
||||
<Glyph id="c777">
|
||||
<Coords points="241,1388 255,1388 255,1413 241,1413"/>
|
||||
<TextEquiv conf="0.88983">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c778">
|
||||
<Coords points="258,1379 267,1379 267,1413 258,1413"/>
|
||||
<TextEquiv conf="0.87166">
|
||||
<Unicode>i</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c779">
|
||||
<Coords points="269,1385 288,1385 288,1413 269,1413"/>
|
||||
<TextEquiv conf="0.85669">
|
||||
<Unicode>n</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c782">
|
||||
<Coords points="310,1385 340,1385 340,1414 310,1414"/>
|
||||
<TextEquiv conf="0.90717">
|
||||
<Unicode>m</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c783">
|
||||
<Coords points="343,1386 361,1386 361,1412 343,1412"/>
|
||||
<TextEquiv conf="0.77710">
|
||||
<Unicode>a</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c784">
|
||||
<Coords points="364,1378 373,1378 373,1413 364,1413"/>
|
||||
<TextEquiv conf="0.80457">
|
||||
<Unicode>l</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c785">
|
||||
<Coords points="375,1381 393,1381 393,1412 375,1412"/>
|
||||
<TextEquiv conf="0.79192">
|
||||
<Unicode>s</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c1195">
|
||||
<Coords points="303,1385 303,1386 304,1386 304,1401 305,1401 305,1402 304,1402 304,1407 303,1407 303,1409 302,1409 302,1410 301,1410 301,1411 300,1411 300,1412 297,1412 297,1411 296,1411 296,1410 295,1410 295,1409 292,1409 292,1408 291,1408 291,1397 290,1397 291,1397 291,1392 292,1392 292,1391 293,1391 293,1389 295,1389 295,1388 296,1388 296,1387 301,1387 301,1385"/>
|
||||
<TextEquiv>
|
||||
<Unicode>s</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.58499">
|
||||
<Unicode>einsmals</Unicode></TextEquiv></Word>
|
||||
<Word id="w786">
|
||||
<Coords points="418,1377 428,1377 428,1385 450,1385 450,1412 418,1412"/>
|
||||
<Glyph id="c787">
|
||||
<Coords points="418,1377 428,1377 428,1412 418,1412"/>
|
||||
<TextEquiv conf="0.90477">
|
||||
<Unicode>i</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c788">
|
||||
<Coords points="431,1385 450,1385 450,1412 431,1412"/>
|
||||
<TextEquiv conf="0.90877">
|
||||
<Unicode>n</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.90477">
|
||||
<Unicode>in</Unicode></TextEquiv></Word>
|
||||
<Word id="w789">
|
||||
<Coords points="471,1379 489,1379 489,1385 538,1385 538,1412 471,1412"/>
|
||||
<Glyph id="c790">
|
||||
<Coords points="471,1379 489,1379 489,1412 471,1412"/>
|
||||
<TextEquiv conf="0.83564">
|
||||
<Unicode>d</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c791">
|
||||
<Coords points="491,1386 503,1386 503,1411 491,1411"/>
|
||||
<TextEquiv conf="0.83281">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c792">
|
||||
<Coords points="506,1385 538,1385 538,1412 506,1412"/>
|
||||
<TextEquiv conf="0.86322">
|
||||
<Unicode>m</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.83281">
|
||||
<Unicode>dem</Unicode></TextEquiv></Word>
|
||||
<Word id="w793">
|
||||
<Coords points="643,1373 661,1373 661,1374 722,1374 722,1385 730,1385 730,1411 722,1411 722,1418 661,1418 661,1421 643,1421 643,1412 558,1412 558,1374 643,1374"/>
|
||||
<Glyph id="c794">
|
||||
<Coords points="558,1374 590,1374 590,1412 558,1412"/>
|
||||
<TextEquiv conf="0.87259">
|
||||
<Unicode>O</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c795">
|
||||
<Coords points="593,1374 609,1374 609,1410 593,1410"/>
|
||||
<TextEquiv conf="0.84287">
|
||||
<Unicode>b</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c796">
|
||||
<Coords points="611,1384 625,1384 625,1411 611,1411"/>
|
||||
<TextEquiv conf="0.88296">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c797">
|
||||
<Coords points="627,1384 640,1384 640,1410 627,1410"/>
|
||||
<TextEquiv conf="0.83827">
|
||||
<Unicode>r</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c798">
|
||||
<Coords points="643,1373 661,1373 661,1421 643,1421"/>
|
||||
<TextEquiv conf="0.75418">
|
||||
<Unicode>h</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c799">
|
||||
<Coords points="664,1383 681,1383 681,1410 664,1410"/>
|
||||
<TextEquiv conf="0.87030">
|
||||
<Unicode>a</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c800">
|
||||
<Coords points="683,1383 704,1383 704,1411 683,1411"/>
|
||||
<TextEquiv conf="0.84676">
|
||||
<Unicode>u</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c801">
|
||||
<Coords points="705,1374 722,1374 722,1418 705,1418"/>
|
||||
<TextEquiv conf="0.79240">
|
||||
<Unicode>ſ</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c802">
|
||||
<Coords points="716,1385 730,1385 730,1411 716,1411"/>
|
||||
<TextEquiv conf="0.89839">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.75418">
|
||||
<Unicode>Oberhauſe</Unicode></TextEquiv></Word>
|
||||
<Word id="w811">
|
||||
<Coords points="911,1383 955,1383 955,1384 968,1384 968,1406 955,1406 955,1410 911,1410"/>
|
||||
<Glyph id="c812">
|
||||
<Coords points="911,1383 940,1383 940,1410 911,1410"/>
|
||||
<TextEquiv conf="0.83790">
|
||||
<Unicode>w</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c813">
|
||||
<Coords points="942,1383 955,1383 955,1410 942,1410"/>
|
||||
<TextEquiv conf="0.85182">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c814">
|
||||
<Coords points="957,1384 968,1384 968,1406 957,1406"/>
|
||||
<TextEquiv conf="0.86700">
|
||||
<Unicode>⸗</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv conf="0.83790">
|
||||
<Unicode>we⸗</Unicode></TextEquiv></Word>
|
||||
<Word id="w1208">
|
||||
<Coords points="764,1376 773,1376 773,1384 811,1384 811,1411 764,1411 764,1410 748,1410 748,1384 764,1384"/>
|
||||
<Glyph id="c804">
|
||||
<Coords points="748,1384 761,1384 761,1410 748,1410"/>
|
||||
<TextEquiv conf="0.85674">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c805">
|
||||
<Coords points="764,1376 773,1376 773,1411 764,1411"/>
|
||||
<TextEquiv conf="0.91519">
|
||||
<Unicode>i</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c806">
|
||||
<Coords points="776,1384 795,1384 795,1410 776,1410"/>
|
||||
<TextEquiv conf="0.89158">
|
||||
<Unicode>n</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c807">
|
||||
<Coords points="797,1384 811,1384 811,1411 797,1411"/>
|
||||
<TextEquiv conf="0.95123">
|
||||
<Unicode>e</Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv>
|
||||
<Unicode>eine</Unicode></TextEquiv></Word>
|
||||
<Word id="w1209">
|
||||
<Coords points="822,1372 864,1372 864,1374 898,1374 898,1410 876,1410 876,1411 864,1411 864,1413 822,1413"/>
|
||||
<Glyph id="c808">
|
||||
<Coords points="822,1372 864,1372 864,1413 822,1413"/>
|
||||
<TextEquiv conf="0.79185">
|
||||
<Unicode>B</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c809">
|
||||
<Coords points="867,1377 876,1377 876,1411 867,1411"/>
|
||||
<TextEquiv conf="0.91084">
|
||||
<Unicode>i</Unicode></TextEquiv></Glyph>
|
||||
<Glyph id="c810">
|
||||
<Coords points="878,1374 898,1374 898,1410 878,1410"/>
|
||||
<TextEquiv conf="0.83545">
|
||||
<Unicode></Unicode></TextEquiv></Glyph>
|
||||
<TextEquiv>
|
||||
<Unicode>Bi</Unicode></TextEquiv></Word>
|
||||
<TextEquiv conf="0.75683">
|
||||
<Unicode>Als er einsmals in dem Oberhauſe eine Bi we⸗</Unicode></TextEquiv></TextLine>
|
||||
<TextEquiv conf="0.70871">
|
||||
<Unicode>Inconsistent dummy region text</Unicode></TextEquiv>
|
||||
<TextStyle fontFamily="Fraktur"/></TextRegion></Page></PcGts>
|
|
@ -0,0 +1,47 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator></Creator>
|
||||
<Created>2019-07-26T13:59:00</Created>
|
||||
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||
<TextRegion id="tempReg357564684568544579089">
|
||||
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||
<TextLine id="l0">
|
||||
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l1">
|
||||
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l2">
|
||||
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||
<TextRegion id="r7" type="paragraph">
|
||||
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||
<TextEquiv>
|
||||
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
|
@ -0,0 +1,139 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="437">
|
||||
<TextLine ID="line_0" HPOS="209" VPOS="258" WIDTH="1954" HEIGHT="103">
|
||||
<String ID="string_0" HPOS="209" VPOS="319" WIDTH="134" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="319" HPOS="343"/>
|
||||
<String ID="string_1" HPOS="356" VPOS="316" WIDTH="121" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="316" HPOS="477"/>
|
||||
<String ID="string_2" HPOS="491" VPOS="312" WIDTH="102" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="312" HPOS="593"/>
|
||||
<String ID="string_3" HPOS="608" VPOS="309" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="309" HPOS="654"/>
|
||||
<String ID="string_4" HPOS="668" VPOS="311" WIDTH="106" HEIGHT="37" WC="0.96" CONTENT="amet,"/><SP WIDTH="16" VPOS="311" HPOS="774"/>
|
||||
<String ID="string_5" HPOS="790" VPOS="307" WIDTH="201" HEIGHT="32" WC="0.88" CONTENT="consetetur"/><SP WIDTH="14" VPOS="307" HPOS="991"/>
|
||||
<String ID="string_6" HPOS="1005" VPOS="297" WIDTH="205" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="297" HPOS="1210"/>
|
||||
<String ID="string_7" HPOS="1225" VPOS="293" WIDTH="84" HEIGHT="42" WC="0.91" CONTENT="elitr,"/><SP WIDTH="16" VPOS="293" HPOS="1309"/>
|
||||
<String ID="string_8" HPOS="1325" VPOS="289" WIDTH="65" HEIGHT="38" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="289" HPOS="1390"/>
|
||||
<String ID="string_9" HPOS="1404" VPOS="286" WIDTH="97" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="286" HPOS="1501"/>
|
||||
<String ID="string_10" HPOS="1515" VPOS="291" WIDTH="100" HEIGHT="24" WC="0.69" CONTENT="nonu"/><SP WIDTH="32" VPOS="291" HPOS="1615"/>
|
||||
<String ID="string_11" HPOS="1647" VPOS="285" WIDTH="30" HEIGHT="36" WC="0.37" CONTENT="yy"/><SP WIDTH="17" VPOS="285" HPOS="1677"/>
|
||||
<String ID="string_12" HPOS="1694" VPOS="268" WIDTH="140" HEIGHT="42" WC="0.93" CONTENT="eirmod"/><SP WIDTH="11" VPOS="268" HPOS="1834"/>
|
||||
<String ID="string_13" HPOS="1845" VPOS="273" WIDTH="139" HEIGHT="37" WC="0.96" CONTENT="tempor"/><SP WIDTH="15" VPOS="273" HPOS="1984"/>
|
||||
<String ID="string_14" HPOS="1999" VPOS="258" WIDTH="164" HEIGHT="38" WC="0.95" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="211" VPOS="315" WIDTH="1904" HEIGHT="102">
|
||||
<String ID="string_15" HPOS="211" VPOS="380" WIDTH="39" HEIGHT="31" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="380" HPOS="250"/>
|
||||
<String ID="string_16" HPOS="263" VPOS="373" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="373" HPOS="386"/>
|
||||
<String ID="string_17" HPOS="402" VPOS="379" WIDTH="33" HEIGHT="27" WC="0.95" CONTENT="et"/><SP WIDTH="14" VPOS="379" HPOS="435"/>
|
||||
<String ID="string_18" HPOS="449" VPOS="370" WIDTH="123" HEIGHT="36" WC="0.95" CONTENT="dolore"/><SP WIDTH="15" VPOS="370" HPOS="572"/>
|
||||
<String ID="string_19" HPOS="587" VPOS="374" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="374" HPOS="720"/>
|
||||
<String ID="string_20" HPOS="734" VPOS="363" WIDTH="183" HEIGHT="43" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="363" HPOS="917"/>
|
||||
<String ID="string_21" HPOS="931" VPOS="360" WIDTH="82" HEIGHT="36" WC="0.95" CONTENT="erat,"/><SP WIDTH="17" VPOS="360" HPOS="1013"/>
|
||||
<String ID="string_22" HPOS="1030" VPOS="354" WIDTH="65" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="13" VPOS="354" HPOS="1095"/>
|
||||
<String ID="string_23" HPOS="1108" VPOS="352" WIDTH="96" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="352" HPOS="1204"/>
|
||||
<String ID="string_24" HPOS="1217" VPOS="350" WIDTH="181" HEIGHT="44" WC="0.95" CONTENT="voluptua."/><SP WIDTH="13" VPOS="350" HPOS="1398"/>
|
||||
<String ID="string_25" HPOS="1411" VPOS="345" WIDTH="49" HEIGHT="34" WC="0.95" CONTENT="At"/><SP WIDTH="11" VPOS="345" HPOS="1460"/>
|
||||
<String ID="string_26" HPOS="1471" VPOS="348" WIDTH="88" HEIGHT="26" WC="0.93" CONTENT="Vero"/><SP WIDTH="16" VPOS="348" HPOS="1559"/>
|
||||
<String ID="string_27" HPOS="1575" VPOS="345" WIDTH="65" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="345" HPOS="1640"/>
|
||||
<String ID="string_28" HPOS="1655" VPOS="339" WIDTH="36" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="339" HPOS="1691"/>
|
||||
<String ID="string_29" HPOS="1705" VPOS="336" WIDTH="168" HEIGHT="31" WC="0.87" CONTENT="accusam"/><SP WIDTH="15" VPOS="336" HPOS="1873"/>
|
||||
<String ID="string_30" HPOS="1888" VPOS="329" WIDTH="34" HEIGHT="28" WC="0.96" CONTENT="et"/><SP WIDTH="11" VPOS="329" HPOS="1922"/>
|
||||
<String ID="string_31" HPOS="1933" VPOS="322" WIDTH="96" HEIGHT="44" WC="0.96" CONTENT="justo"/><SP WIDTH="15" VPOS="322" HPOS="2029"/>
|
||||
<String ID="string_32" HPOS="2044" VPOS="315" WIDTH="71" HEIGHT="63" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="214" VPOS="375" WIDTH="1919" HEIGHT="93">
|
||||
<String ID="string_33" HPOS="214" VPOS="431" WIDTH="144" HEIGHT="37" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="431" HPOS="358"/>
|
||||
<String ID="string_34" HPOS="374" VPOS="433" WIDTH="34" HEIGHT="31" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="433" HPOS="408"/>
|
||||
<String ID="string_35" HPOS="422" VPOS="437" WIDTH="42" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="437" HPOS="464"/>
|
||||
<String ID="string_36" HPOS="477" VPOS="426" WIDTH="136" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="426" HPOS="613"/>
|
||||
<String ID="string_37" HPOS="631" VPOS="424" WIDTH="75" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="424" HPOS="706"/>
|
||||
<String ID="string_38" HPOS="720" VPOS="419" WIDTH="85" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="419" HPOS="805"/>
|
||||
<String ID="string_39" HPOS="818" VPOS="415" WIDTH="90" HEIGHT="35" WC="0.97" CONTENT="kasd"/><SP WIDTH="14" VPOS="415" HPOS="908"/>
|
||||
<String ID="string_40" HPOS="922" VPOS="412" WIDTH="206" HEIGHT="48" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="412" HPOS="1128"/>
|
||||
<String ID="string_41" HPOS="1144" VPOS="417" WIDTH="47" HEIGHT="26" WC="0.97" CONTENT="no"/><SP WIDTH="16" VPOS="417" HPOS="1191"/>
|
||||
<String ID="string_42" HPOS="1207" VPOS="415" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="415" HPOS="1268"/>
|
||||
<String ID="string_43" HPOS="1281" VPOS="405" WIDTH="169" HEIGHT="36" WC="0.91" CONTENT="iakimata"/><SP WIDTH="14" VPOS="405" HPOS="1450"/>
|
||||
<String ID="string_44" HPOS="1464" VPOS="400" WIDTH="144" HEIGHT="33" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="400" HPOS="1608"/>
|
||||
<String ID="string_45" HPOS="1624" VPOS="397" WIDTH="54" HEIGHT="29" WC="0.97" CONTENT="est"/><SP WIDTH="13" VPOS="397" HPOS="1678"/>
|
||||
<String ID="string_46" HPOS="1691" VPOS="390" WIDTH="132" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="390" HPOS="1823"/>
|
||||
<String ID="string_47" HPOS="1837" VPOS="383" WIDTH="120" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="383" HPOS="1957"/>
|
||||
<String ID="string_48" HPOS="1971" VPOS="375" WIDTH="102" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="375" HPOS="2073"/>
|
||||
<String ID="string_49" HPOS="2088" VPOS="377" WIDTH="45" HEIGHT="31" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="215" VPOS="435" WIDTH="1896" HEIGHT="93">
|
||||
<String ID="string_50" HPOS="215" VPOS="494" WIDTH="106" HEIGHT="32" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="494" HPOS="321"/>
|
||||
<String ID="string_51" HPOS="337" VPOS="488" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="488" HPOS="467"/>
|
||||
<String ID="string_52" HPOS="481" VPOS="484" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="14" VPOS="484" HPOS="602"/>
|
||||
<String ID="string_53" HPOS="616" VPOS="479" WIDTH="104" HEIGHT="37" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="479" HPOS="720"/>
|
||||
<String ID="string_54" HPOS="734" VPOS="476" WIDTH="46" HEIGHT="36" WC="0.93" CONTENT="sit"/><SP WIDTH="14" VPOS="476" HPOS="780"/>
|
||||
<String ID="string_55" HPOS="794" VPOS="477" WIDTH="104" HEIGHT="36" WC="0.75" CONTENT="armet,"/><SP WIDTH="17" VPOS="477" HPOS="898"/>
|
||||
<String ID="string_56" HPOS="915" VPOS="474" WIDTH="200" HEIGHT="30" WC="0.97" CONTENT="consetetur"/><SP WIDTH="14" VPOS="474" HPOS="1115"/>
|
||||
<String ID="string_57" HPOS="1129" VPOS="463" WIDTH="205" HEIGHT="45" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="463" HPOS="1334"/>
|
||||
<String ID="string_58" HPOS="1349" VPOS="457" WIDTH="86" HEIGHT="41" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="457" HPOS="1435"/>
|
||||
<String ID="string_59" HPOS="1451" VPOS="452" WIDTH="65" HEIGHT="39" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="452" HPOS="1516"/>
|
||||
<String ID="string_60" HPOS="1530" VPOS="449" WIDTH="99" HEIGHT="36" WC="0.93" CONTENT="diam"/><SP WIDTH="14" VPOS="449" HPOS="1629"/>
|
||||
<String ID="string_61" HPOS="1643" VPOS="451" WIDTH="162" HEIGHT="36" WC="0.59" CONTENT="nonurny"/><SP WIDTH="16" VPOS="451" HPOS="1805"/>
|
||||
<String ID="string_62" HPOS="1821" VPOS="435" WIDTH="138" HEIGHT="39" WC="0.96" CONTENT="eirmod"/><SP WIDTH="12" VPOS="435" HPOS="1959"/>
|
||||
<String ID="string_63" HPOS="1971" VPOS="440" WIDTH="140" HEIGHT="37" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="216" VPOS="483" WIDTH="1888" HEIGHT="97">
|
||||
<String ID="string_64" HPOS="216" VPOS="543" WIDTH="165" HEIGHT="37" WC="0.97" CONTENT="invidunt"/><SP WIDTH="13" VPOS="543" HPOS="381"/>
|
||||
<String ID="string_65" HPOS="394" VPOS="546" WIDTH="39" HEIGHT="30" WC="0.97" CONTENT="ut"/><SP WIDTH="12" VPOS="546" HPOS="433"/>
|
||||
<String ID="string_66" HPOS="445" VPOS="539" WIDTH="122" HEIGHT="36" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="539" HPOS="567"/>
|
||||
<String ID="string_67" HPOS="583" VPOS="543" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="543" HPOS="618"/>
|
||||
<String ID="string_68" HPOS="632" VPOS="536" WIDTH="125" HEIGHT="34" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="536" HPOS="757"/>
|
||||
<String ID="string_69" HPOS="771" VPOS="539" WIDTH="131" HEIGHT="37" WC="0.46" CONTENT="magna"/><SP WIDTH="14" VPOS="539" HPOS="902"/>
|
||||
<String ID="string_70" HPOS="916" VPOS="526" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="14" VPOS="526" HPOS="1098"/>
|
||||
<String ID="string_71" HPOS="1112" VPOS="527" WIDTH="82" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="527" HPOS="1194"/>
|
||||
<String ID="string_72" HPOS="1211" VPOS="519" WIDTH="63" HEIGHT="36" WC="0.97" CONTENT="sed"/><SP WIDTH="14" VPOS="519" HPOS="1274"/>
|
||||
<String ID="string_73" HPOS="1288" VPOS="517" WIDTH="97" HEIGHT="37" WC="0.96" CONTENT="diam"/><SP WIDTH="11" VPOS="517" HPOS="1385"/>
|
||||
<String ID="string_74" HPOS="1396" VPOS="513" WIDTH="185" HEIGHT="44" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="513" HPOS="1581"/>
|
||||
<String ID="string_75" HPOS="1595" VPOS="505" WIDTH="50" HEIGHT="35" WC="0.96" CONTENT="At"/><SP WIDTH="11" VPOS="505" HPOS="1645"/>
|
||||
<String ID="string_76" HPOS="1656" VPOS="511" WIDTH="89" HEIGHT="27" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="511" HPOS="1745"/>
|
||||
<String ID="string_77" HPOS="1761" VPOS="508" WIDTH="63" HEIGHT="26" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="508" HPOS="1824"/>
|
||||
<String ID="string_78" HPOS="1839" VPOS="501" WIDTH="35" HEIGHT="30" WC="0.97" CONTENT="et"/><SP WIDTH="13" VPOS="501" HPOS="1874"/>
|
||||
<String ID="string_79" HPOS="1887" VPOS="499" WIDTH="168" HEIGHT="53" WC="0.80" CONTENT="accusam"/><SP WIDTH="-3" VPOS="499" HPOS="2055"/>
|
||||
<String ID="string_80" HPOS="2052" VPOS="483" WIDTH="52" HEIGHT="55" WC="0.97" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="215" VPOS="552" WIDTH="1941" HEIGHT="97">
|
||||
<String ID="string_81" HPOS="215" VPOS="604" WIDTH="97" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="604" HPOS="312"/>
|
||||
<String ID="string_82" HPOS="328" VPOS="600" WIDTH="71" HEIGHT="35" WC="0.97" CONTENT="duo"/><SP WIDTH="16" VPOS="600" HPOS="399"/>
|
||||
<String ID="string_83" HPOS="415" VPOS="597" WIDTH="143" HEIGHT="36" WC="0.93" CONTENT="dolores"/><SP WIDTH="16" VPOS="597" HPOS="558"/>
|
||||
<String ID="string_84" HPOS="574" VPOS="600" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="600" HPOS="608"/>
|
||||
<String ID="string_85" HPOS="622" VPOS="602" WIDTH="43" HEIGHT="26" WC="0.96" CONTENT="ea"/><SP WIDTH="13" VPOS="602" HPOS="665"/>
|
||||
<String ID="string_86" HPOS="678" VPOS="590" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="590" HPOS="814"/>
|
||||
<String ID="string_87" HPOS="833" VPOS="588" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="588" HPOS="907"/>
|
||||
<String ID="string_88" HPOS="921" VPOS="584" WIDTH="83" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="12" VPOS="584" HPOS="1004"/>
|
||||
<String ID="string_89" HPOS="1016" VPOS="580" WIDTH="90" HEIGHT="36" WC="0.97" CONTENT="kasd"/><SP WIDTH="15" VPOS="580" HPOS="1106"/>
|
||||
<String ID="string_90" HPOS="1121" VPOS="578" WIDTH="205" HEIGHT="47" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="578" HPOS="1326"/>
|
||||
<String ID="string_91" HPOS="1342" VPOS="582" WIDTH="47" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="582" HPOS="1389"/>
|
||||
<String ID="string_92" HPOS="1405" VPOS="581" WIDTH="62" HEIGHT="26" WC="0.97" CONTENT="sea"/><SP WIDTH="13" VPOS="581" HPOS="1467"/>
|
||||
<String ID="string_93" HPOS="1480" VPOS="566" WIDTH="172" HEIGHT="38" WC="0.96" CONTENT="takimata"/><SP WIDTH="14" VPOS="566" HPOS="1652"/>
|
||||
<String ID="string_94" HPOS="1666" VPOS="563" WIDTH="145" HEIGHT="33" WC="0.97" CONTENT="sanctus"/><SP WIDTH="15" VPOS="563" HPOS="1811"/>
|
||||
<String ID="string_95" HPOS="1826" VPOS="558" WIDTH="54" HEIGHT="30" WC="0.97" CONTENT="est"/><SP WIDTH="12" VPOS="558" HPOS="1880"/>
|
||||
<String ID="string_96" HPOS="1892" VPOS="552" WIDTH="130" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="552" HPOS="2022"/>
|
||||
<String ID="string_97" HPOS="2037" VPOS="553" WIDTH="119" HEIGHT="37" WC="0.51" CONTENT="Ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="219" VPOS="657" WIDTH="282" HEIGHT="38">
|
||||
<String ID="string_98" HPOS="219" VPOS="658" WIDTH="104" HEIGHT="37" WC="0.97" CONTENT="dolor"/><SP WIDTH="15" VPOS="658" HPOS="323"/>
|
||||
<String ID="string_99" HPOS="338" VPOS="657" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="14" VPOS="657" HPOS="383"/>
|
||||
<String ID="string_100" HPOS="397" VPOS="660" WIDTH="104" HEIGHT="35" WC="0.94" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
Normal file
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
Normal file
Binary file not shown.
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
Normal file
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
Normal file
Binary file not shown.
|
@ -0,0 +1,47 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator></Creator>
|
||||
<Created>2019-07-26T13:59:00</Created>
|
||||
<LastChange>2019-07-26T14:00:29</LastChange></Metadata>
|
||||
<Page imageFilename="lorem-ipsum-scan.tif" imageXResolution="300.00000" imageYResolution="300.00000" imageWidth="2481" imageHeight="3508">
|
||||
<TextRegion id="tempReg357564684568544579089">
|
||||
<Coords points="0,0 1,0 1,1 0,1"/>
|
||||
<TextLine id="l0">
|
||||
<Coords points="228,237 228,295 2216,295 2216,237"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l1">
|
||||
<Coords points="228,298 228,348 2160,348 2160,298"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l2">
|
||||
<Coords points="225,348 225,410 2178,410 2178,348"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="218,413 218,463 2153,463 2153,413"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="225,466 225,522 2153,522 2153,466"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="216,524 216,581 2187,581 2187,524"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="219,584 219,640 542,640 542,584"/>
|
||||
<TextEquiv>
|
||||
<Unicode></Unicode></TextEquiv></TextLine></TextRegion>
|
||||
<TextRegion id="r7" type="paragraph">
|
||||
<Coords points="204,212 204,651 2227,651 2227,212"/>
|
||||
<TextEquiv>
|
||||
<Unicode>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo
|
||||
dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit
|
||||
amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum
|
||||
dolor sit amet.</Unicode></TextEquiv></TextRegion></Page></PcGts>
|
|
@ -0,0 +1,138 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<sourceImageInformation>
|
||||
<fileName> </fileName>
|
||||
</sourceImageInformation>
|
||||
<OCRProcessing ID="OCR_0">
|
||||
<ocrProcessingStep>
|
||||
<processingSoftware>
|
||||
<softwareName>tesseract 4.1.0-rc4</softwareName>
|
||||
</processingSoftware>
|
||||
</ocrProcessingStep>
|
||||
</OCRProcessing>
|
||||
</Description>
|
||||
<Layout>
|
||||
<Page WIDTH="2481" HEIGHT="3508" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="2481" HEIGHT="3508">
|
||||
<TextBlock ID="block_0" HPOS="234" VPOS="244" WIDTH="1966" HEIGHT="387">
|
||||
<TextLine ID="line_0" HPOS="237" VPOS="244" WIDTH="1963" HEIGHT="48">
|
||||
<String ID="string_0" HPOS="237" VPOS="248" WIDTH="133" HEIGHT="34" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="248" HPOS="370"/>
|
||||
<String ID="string_1" HPOS="384" VPOS="247" WIDTH="120" HEIGHT="45" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="247" HPOS="504"/>
|
||||
<String ID="string_2" HPOS="519" VPOS="246" WIDTH="103" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="246" HPOS="622"/>
|
||||
<String ID="string_3" HPOS="636" VPOS="247" WIDTH="46" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="247" HPOS="682"/>
|
||||
<String ID="string_4" HPOS="696" VPOS="252" WIDTH="105" HEIGHT="36" WC="0.97" CONTENT="amet,"/><SP WIDTH="17" VPOS="252" HPOS="801"/>
|
||||
<String ID="string_5" HPOS="818" VPOS="251" WIDTH="202" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="14" VPOS="251" HPOS="1020"/>
|
||||
<String ID="string_6" HPOS="1034" VPOS="244" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="244" HPOS="1241"/>
|
||||
<String ID="string_7" HPOS="1256" VPOS="244" WIDTH="86" HEIGHT="43" WC="0.96" CONTENT="elitr,"/><SP WIDTH="16" VPOS="244" HPOS="1342"/>
|
||||
<String ID="string_8" HPOS="1358" VPOS="244" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="244" HPOS="1423"/>
|
||||
<String ID="string_9" HPOS="1438" VPOS="244" WIDTH="99" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="244" HPOS="1537"/>
|
||||
<String ID="string_10" HPOS="1551" VPOS="255" WIDTH="164" HEIGHT="35" WC="0.97" CONTENT="nonumy"/><SP WIDTH="15" VPOS="255" HPOS="1715"/>
|
||||
<String ID="string_11" HPOS="1730" VPOS="244" WIDTH="139" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="244" HPOS="1869"/>
|
||||
<String ID="string_12" HPOS="1882" VPOS="250" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/><SP WIDTH="13" VPOS="250" HPOS="2022"/>
|
||||
<String ID="string_13" HPOS="2035" VPOS="244" WIDTH="165" HEIGHT="35" WC="0.96" CONTENT="invidunt"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_1" HPOS="237" VPOS="301" WIDTH="1913" HEIGHT="49">
|
||||
<String ID="string_14" HPOS="237" VPOS="310" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="13" VPOS="310" HPOS="276"/>
|
||||
<String ID="string_15" HPOS="289" VPOS="304" WIDTH="123" HEIGHT="44" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="304" HPOS="412"/>
|
||||
<String ID="string_16" HPOS="428" VPOS="310" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="14" VPOS="310" HPOS="462"/>
|
||||
<String ID="string_17" HPOS="476" VPOS="304" WIDTH="123" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="15" VPOS="304" HPOS="599"/>
|
||||
<String ID="string_18" HPOS="614" VPOS="313" WIDTH="133" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="14" VPOS="313" HPOS="747"/>
|
||||
<String ID="string_19" HPOS="761" VPOS="302" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="302" HPOS="944"/>
|
||||
<String ID="string_20" HPOS="959" VPOS="308" WIDTH="81" HEIGHT="36" WC="0.96" CONTENT="erat,"/><SP WIDTH="17" VPOS="308" HPOS="1040"/>
|
||||
<String ID="string_21" HPOS="1057" VPOS="301" WIDTH="65" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="14" VPOS="301" HPOS="1122"/>
|
||||
<String ID="string_22" HPOS="1136" VPOS="301" WIDTH="97" HEIGHT="36" WC="0.95" CONTENT="diam"/><SP WIDTH="13" VPOS="301" HPOS="1233"/>
|
||||
<String ID="string_23" HPOS="1246" VPOS="301" WIDTH="183" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="13" VPOS="301" HPOS="1429"/>
|
||||
<String ID="string_24" HPOS="1442" VPOS="303" WIDTH="51" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="303" HPOS="1493"/>
|
||||
<String ID="string_25" HPOS="1505" VPOS="312" WIDTH="88" HEIGHT="25" WC="0.96" CONTENT="vero"/><SP WIDTH="17" VPOS="312" HPOS="1593"/>
|
||||
<String ID="string_26" HPOS="1610" VPOS="312" WIDTH="64" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="16" VPOS="312" HPOS="1674"/>
|
||||
<String ID="string_27" HPOS="1690" VPOS="308" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="308" HPOS="1725"/>
|
||||
<String ID="string_28" HPOS="1739" VPOS="312" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="15" VPOS="312" HPOS="1907"/>
|
||||
<String ID="string_29" HPOS="1922" VPOS="308" WIDTH="34" HEIGHT="29" WC="0.97" CONTENT="et"/><SP WIDTH="11" VPOS="308" HPOS="1956"/>
|
||||
<String ID="string_30" HPOS="1967" VPOS="302" WIDTH="96" HEIGHT="45" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="302" HPOS="2063"/>
|
||||
<String ID="string_31" HPOS="2079" VPOS="301" WIDTH="71" HEIGHT="36" WC="0.96" CONTENT="duo"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_2" HPOS="238" VPOS="359" WIDTH="1928" HEIGHT="46">
|
||||
<String ID="string_32" HPOS="238" VPOS="361" WIDTH="144" HEIGHT="36" WC="0.96" CONTENT="dolores"/><SP WIDTH="16" VPOS="361" HPOS="382"/>
|
||||
<String ID="string_33" HPOS="398" VPOS="368" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="368" HPOS="432"/>
|
||||
<String ID="string_34" HPOS="447" VPOS="372" WIDTH="41" HEIGHT="25" WC="0.96" CONTENT="ea"/><SP WIDTH="14" VPOS="372" HPOS="488"/>
|
||||
<String ID="string_35" HPOS="502" VPOS="361" WIDTH="136" HEIGHT="36" WC="0.96" CONTENT="rebum."/><SP WIDTH="19" VPOS="361" HPOS="638"/>
|
||||
<String ID="string_36" HPOS="657" VPOS="363" WIDTH="75" HEIGHT="33" WC="0.97" CONTENT="Stet"/><SP WIDTH="14" VPOS="363" HPOS="732"/>
|
||||
<String ID="string_37" HPOS="746" VPOS="360" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="360" HPOS="830"/>
|
||||
<String ID="string_38" HPOS="843" VPOS="359" WIDTH="91" HEIGHT="36" WC="0.96" CONTENT="kasd"/><SP WIDTH="13" VPOS="359" HPOS="934"/>
|
||||
<String ID="string_39" HPOS="947" VPOS="359" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="359" HPOS="1155"/>
|
||||
<String ID="string_40" HPOS="1171" VPOS="370" WIDTH="47" HEIGHT="24" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="370" HPOS="1218"/>
|
||||
<String ID="string_41" HPOS="1234" VPOS="370" WIDTH="61" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="370" HPOS="1295"/>
|
||||
<String ID="string_42" HPOS="1308" VPOS="359" WIDTH="172" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="359" HPOS="1480"/>
|
||||
<String ID="string_43" HPOS="1495" VPOS="365" WIDTH="145" HEIGHT="30" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="365" HPOS="1640"/>
|
||||
<String ID="string_44" HPOS="1656" VPOS="365" WIDTH="55" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="13" VPOS="365" HPOS="1711"/>
|
||||
<String ID="string_45" HPOS="1724" VPOS="361" WIDTH="131" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="15" VPOS="361" HPOS="1855"/>
|
||||
<String ID="string_46" HPOS="1870" VPOS="360" WIDTH="119" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="360" HPOS="1989"/>
|
||||
<String ID="string_47" HPOS="2004" VPOS="359" WIDTH="103" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="359" HPOS="2107"/>
|
||||
<String ID="string_48" HPOS="2121" VPOS="360" WIDTH="45" HEIGHT="34" WC="0.96" CONTENT="sit"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_3" HPOS="238" VPOS="416" WIDTH="1905" HEIGHT="48">
|
||||
<String ID="string_49" HPOS="238" VPOS="425" WIDTH="105" HEIGHT="29" WC="0.96" CONTENT="amet."/><SP WIDTH="16" VPOS="425" HPOS="343"/>
|
||||
<String ID="string_50" HPOS="359" VPOS="421" WIDTH="132" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="13" VPOS="421" HPOS="491"/>
|
||||
<String ID="string_51" HPOS="504" VPOS="420" WIDTH="121" HEIGHT="44" WC="0.96" CONTENT="ipsum"/><SP WIDTH="15" VPOS="420" HPOS="625"/>
|
||||
<String ID="string_52" HPOS="640" VPOS="418" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="dolor"/><SP WIDTH="14" VPOS="418" HPOS="744"/>
|
||||
<String ID="string_53" HPOS="758" VPOS="419" WIDTH="45" HEIGHT="35" WC="0.97" CONTENT="sit"/><SP WIDTH="15" VPOS="419" HPOS="803"/>
|
||||
<String ID="string_54" HPOS="818" VPOS="424" WIDTH="104" HEIGHT="36" WC="0.96" CONTENT="amet,"/><SP WIDTH="17" VPOS="424" HPOS="922"/>
|
||||
<String ID="string_55" HPOS="939" VPOS="422" WIDTH="201" HEIGHT="30" WC="0.96" CONTENT="consetetur"/><SP WIDTH="15" VPOS="422" HPOS="1140"/>
|
||||
<String ID="string_56" HPOS="1155" VPOS="416" WIDTH="207" HEIGHT="46" WC="0.96" CONTENT="sadipscing"/><SP WIDTH="15" VPOS="416" HPOS="1362"/>
|
||||
<String ID="string_57" HPOS="1377" VPOS="417" WIDTH="86" HEIGHT="42" WC="0.96" CONTENT="elitr,"/><SP WIDTH="17" VPOS="417" HPOS="1463"/>
|
||||
<String ID="string_58" HPOS="1480" VPOS="416" WIDTH="66" HEIGHT="36" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="416" HPOS="1546"/>
|
||||
<String ID="string_59" HPOS="1561" VPOS="416" WIDTH="98" HEIGHT="36" WC="0.96" CONTENT="diam"/><SP WIDTH="14" VPOS="416" HPOS="1659"/>
|
||||
<String ID="string_60" HPOS="1673" VPOS="427" WIDTH="163" HEIGHT="35" WC="0.96" CONTENT="nonumy"/><SP WIDTH="16" VPOS="427" HPOS="1836"/>
|
||||
<String ID="string_61" HPOS="1852" VPOS="416" WIDTH="138" HEIGHT="36" WC="0.96" CONTENT="eirmod"/><SP WIDTH="13" VPOS="416" HPOS="1990"/>
|
||||
<String ID="string_62" HPOS="2003" VPOS="422" WIDTH="140" HEIGHT="40" WC="0.96" CONTENT="tempor"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_4" HPOS="236" VPOS="474" WIDTH="1897" HEIGHT="47">
|
||||
<String ID="string_63" HPOS="236" VPOS="476" WIDTH="166" HEIGHT="35" WC="0.96" CONTENT="invidunt"/><SP WIDTH="14" VPOS="476" HPOS="402"/>
|
||||
<String ID="string_64" HPOS="416" VPOS="482" WIDTH="39" HEIGHT="29" WC="0.96" CONTENT="ut"/><SP WIDTH="12" VPOS="482" HPOS="455"/>
|
||||
<String ID="string_65" HPOS="467" VPOS="476" WIDTH="122" HEIGHT="35" WC="0.96" CONTENT="labore"/><SP WIDTH="16" VPOS="476" HPOS="589"/>
|
||||
<String ID="string_66" HPOS="605" VPOS="482" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="15" VPOS="482" HPOS="639"/>
|
||||
<String ID="string_67" HPOS="654" VPOS="475" WIDTH="125" HEIGHT="36" WC="0.96" CONTENT="dolore"/><SP WIDTH="14" VPOS="475" HPOS="779"/>
|
||||
<String ID="string_68" HPOS="793" VPOS="484" WIDTH="131" HEIGHT="37" WC="0.96" CONTENT="magna"/><SP WIDTH="15" VPOS="484" HPOS="924"/>
|
||||
<String ID="string_69" HPOS="939" VPOS="474" WIDTH="182" HEIGHT="45" WC="0.96" CONTENT="aliquyam"/><SP WIDTH="15" VPOS="474" HPOS="1121"/>
|
||||
<String ID="string_70" HPOS="1136" VPOS="480" WIDTH="81" HEIGHT="37" WC="0.96" CONTENT="erat,"/><SP WIDTH="18" VPOS="480" HPOS="1217"/>
|
||||
<String ID="string_71" HPOS="1235" VPOS="474" WIDTH="63" HEIGHT="35" WC="0.96" CONTENT="sed"/><SP WIDTH="15" VPOS="474" HPOS="1298"/>
|
||||
<String ID="string_72" HPOS="1313" VPOS="474" WIDTH="97" HEIGHT="35" WC="0.96" CONTENT="diam"/><SP WIDTH="13" VPOS="474" HPOS="1410"/>
|
||||
<String ID="string_73" HPOS="1423" VPOS="474" WIDTH="186" HEIGHT="46" WC="0.96" CONTENT="voluptua."/><SP WIDTH="14" VPOS="474" HPOS="1609"/>
|
||||
<String ID="string_74" HPOS="1623" VPOS="475" WIDTH="50" HEIGHT="34" WC="0.96" CONTENT="At"/><SP WIDTH="12" VPOS="475" HPOS="1673"/>
|
||||
<String ID="string_75" HPOS="1685" VPOS="485" WIDTH="89" HEIGHT="24" WC="0.96" CONTENT="vero"/><SP WIDTH="16" VPOS="485" HPOS="1774"/>
|
||||
<String ID="string_76" HPOS="1790" VPOS="484" WIDTH="63" HEIGHT="25" WC="0.96" CONTENT="eos"/><SP WIDTH="15" VPOS="484" HPOS="1853"/>
|
||||
<String ID="string_77" HPOS="1868" VPOS="480" WIDTH="34" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="480" HPOS="1902"/>
|
||||
<String ID="string_78" HPOS="1916" VPOS="484" WIDTH="168" HEIGHT="25" WC="0.96" CONTENT="accusam"/><SP WIDTH="16" VPOS="484" HPOS="2084"/>
|
||||
<String ID="string_79" HPOS="2100" VPOS="480" WIDTH="33" HEIGHT="29" WC="0.96" CONTENT="et"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_5" HPOS="234" VPOS="531" WIDTH="1950" HEIGHT="47">
|
||||
<String ID="string_80" HPOS="234" VPOS="534" WIDTH="98" HEIGHT="44" WC="0.97" CONTENT="justo"/><SP WIDTH="16" VPOS="534" HPOS="332"/>
|
||||
<String ID="string_81" HPOS="348" VPOS="533" WIDTH="71" HEIGHT="35" WC="0.96" CONTENT="duo"/><SP WIDTH="16" VPOS="533" HPOS="419"/>
|
||||
<String ID="string_82" HPOS="435" VPOS="533" WIDTH="143" HEIGHT="35" WC="0.96" CONTENT="dolores"/><SP WIDTH="15" VPOS="533" HPOS="578"/>
|
||||
<String ID="string_83" HPOS="593" VPOS="539" WIDTH="35" HEIGHT="29" WC="0.96" CONTENT="et"/><SP WIDTH="14" VPOS="539" HPOS="628"/>
|
||||
<String ID="string_84" HPOS="642" VPOS="543" WIDTH="42" HEIGHT="25" WC="0.97" CONTENT="ea"/><SP WIDTH="14" VPOS="543" HPOS="684"/>
|
||||
<String ID="string_85" HPOS="698" VPOS="533" WIDTH="137" HEIGHT="35" WC="0.96" CONTENT="rebum."/><SP WIDTH="18" VPOS="533" HPOS="835"/>
|
||||
<String ID="string_86" HPOS="853" VPOS="534" WIDTH="74" HEIGHT="34" WC="0.96" CONTENT="Stet"/><SP WIDTH="14" VPOS="534" HPOS="927"/>
|
||||
<String ID="string_87" HPOS="941" VPOS="531" WIDTH="84" HEIGHT="36" WC="0.96" CONTENT="clita"/><SP WIDTH="13" VPOS="531" HPOS="1025"/>
|
||||
<String ID="string_88" HPOS="1038" VPOS="531" WIDTH="89" HEIGHT="35" WC="0.96" CONTENT="kasd"/><SP WIDTH="15" VPOS="531" HPOS="1127"/>
|
||||
<String ID="string_89" HPOS="1142" VPOS="531" WIDTH="208" HEIGHT="46" WC="0.96" CONTENT="gubergren,"/><SP WIDTH="16" VPOS="531" HPOS="1350"/>
|
||||
<String ID="string_90" HPOS="1366" VPOS="542" WIDTH="48" HEIGHT="25" WC="0.96" CONTENT="no"/><SP WIDTH="16" VPOS="542" HPOS="1414"/>
|
||||
<String ID="string_91" HPOS="1430" VPOS="542" WIDTH="62" HEIGHT="25" WC="0.96" CONTENT="sea"/><SP WIDTH="13" VPOS="542" HPOS="1492"/>
|
||||
<String ID="string_92" HPOS="1505" VPOS="531" WIDTH="173" HEIGHT="36" WC="0.96" CONTENT="takimata"/><SP WIDTH="15" VPOS="531" HPOS="1678"/>
|
||||
<String ID="string_93" HPOS="1693" VPOS="538" WIDTH="144" HEIGHT="29" WC="0.96" CONTENT="sanctus"/><SP WIDTH="16" VPOS="538" HPOS="1837"/>
|
||||
<String ID="string_94" HPOS="1853" VPOS="537" WIDTH="53" HEIGHT="29" WC="0.96" CONTENT="est"/><SP WIDTH="14" VPOS="537" HPOS="1906"/>
|
||||
<String ID="string_95" HPOS="1920" VPOS="533" WIDTH="130" HEIGHT="33" WC="0.96" CONTENT="Lorem"/><SP WIDTH="14" VPOS="533" HPOS="2050"/>
|
||||
<String ID="string_96" HPOS="2064" VPOS="532" WIDTH="120" HEIGHT="44" WC="0.95" CONTENT="ipsum"/>
|
||||
</TextLine>
|
||||
<TextLine ID="line_6" HPOS="237" VPOS="590" WIDTH="282" HEIGHT="41">
|
||||
<String ID="string_97" HPOS="237" VPOS="590" WIDTH="104" HEIGHT="35" WC="0.96" CONTENT="dolor"/><SP WIDTH="15" VPOS="590" HPOS="341"/>
|
||||
<String ID="string_98" HPOS="356" VPOS="591" WIDTH="45" HEIGHT="35" WC="0.96" CONTENT="sit"/><SP WIDTH="14" VPOS="591" HPOS="401"/>
|
||||
<String ID="string_99" HPOS="415" VPOS="597" WIDTH="104" HEIGHT="34" WC="0.96" CONTENT="amet."/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
Normal file
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
Normal file
Binary file not shown.
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
Normal file
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
Normal file
Binary file not shown.
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
Normal file
BIN
src/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
Normal file
Binary file not shown.
290
src/dinglehopper/tests/data/mixed-regions.page.xml
Normal file
290
src/dinglehopper/tests/data/mixed-regions.page.xml
Normal file
|
@ -0,0 +1,290 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
|
||||
<pc:Metadata>
|
||||
<pc:Creator>OCR-D/core 1.0.0b19</pc:Creator>
|
||||
<pc:Created>2019-09-26T11:59:19.519140</pc:Created>
|
||||
<pc:LastChange>2019-09-26T11:59:19.519140</pc:LastChange>
|
||||
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-segment-region">
|
||||
<pc:Labels>
|
||||
<pc:Label value="True" type="overwrite_regions"/>
|
||||
<pc:Label value="8" type="padding"/>
|
||||
<pc:Label value="False" type="crop_polygons"/>
|
||||
<pc:Label value="True" type="find_tables"/>
|
||||
</pc:Labels>
|
||||
</pc:MetadataItem>
|
||||
<pc:MetadataItem type="processingStep" name="layout/segmentation/line" value="ocrd-tesserocr-segment-line">
|
||||
<pc:Labels>
|
||||
<pc:Label value="True" type="overwrite_lines"/>
|
||||
</pc:Labels>
|
||||
</pc:MetadataItem>
|
||||
</pc:Metadata>
|
||||
<pc:Page imageFilename="../OCR-D-IMG-BIN/OCR-D-IMG-BIN_0001.png" imageWidth="1832" imageHeight="2408">
|
||||
<pc:ReadingOrder>
|
||||
<pc:OrderedGroup id="reading-order">
|
||||
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
|
||||
<pc:RegionRefIndexed index="1" regionRef="region0001"/>
|
||||
<pc:RegionRefIndexed index="2" regionRef="region0002"/>
|
||||
<pc:RegionRefIndexed index="3" regionRef="region0003"/>
|
||||
</pc:OrderedGroup>
|
||||
</pc:ReadingOrder>
|
||||
<pc:TextRegion id="region0001">
|
||||
<pc:Coords points="184,196 1338,196 1338,1969 184,1969"/>
|
||||
<pc:TextLine id="region0001_line0000">
|
||||
<pc:Coords points="217,204 1324,204 1324,264 217,264"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0001">
|
||||
<pc:Coords points="220,258 1325,258 1325,314 220,314"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>prætextu longarum precationum, propterea maiorẽ</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0002">
|
||||
<pc:Coords points="218,305 1325,305 1325,359 218,359"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>accipieris condemnationem. Ideo enim ꝙ non oratis</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0003">
|
||||
<pc:Coords points="217,354 1325,354 1325,413 217,413"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0004">
|
||||
<pc:Coords points="216,401 1322,401 1322,460 216,460"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>& ueritate ſed iuxta ueſtram propriam conſtitutionẽ,</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0005">
|
||||
<pc:Coords points="219,454 1324,454 1324,505 219,505"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0006">
|
||||
<pc:Coords points="219,501 1326,501 1326,563 219,563"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0007">
|
||||
<pc:Coords points="215,556 1325,556 1325,607 215,607"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>non exaudiam uos. Chriſtiani uero quia orant iuxta</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0008">
|
||||
<pc:Coords points="218,605 1324,605 1324,665 218,665"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0009">
|
||||
<pc:Coords points="217,651 1324,651 1324,707 217,707"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0010">
|
||||
<pc:Coords points="219,705 1322,705 1322,756 219,756"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0011">
|
||||
<pc:Coords points="218,756 1323,756 1323,806 218,806"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>autem hoc tenore orandi contempto, obmur muratis</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0012">
|
||||
<pc:Coords points="218,803 1327,803 1327,854 218,854"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0013">
|
||||
<pc:Coords points="218,852 1324,852 1324,904 218,904"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0014">
|
||||
<pc:Coords points="219,904 1323,904 1323,958 219,958"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0015">
|
||||
<pc:Coords points="218,954 1326,954 1326,1010 218,1010"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0016">
|
||||
<pc:Coords points="192,1002 1324,1002 1324,1052 192,1052"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0017">
|
||||
<pc:Coords points="218,1055 965,1055 965,1101 218,1101"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>in uobis, eas ſine dubio omitteretis.</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0018">
|
||||
<pc:Coords points="325,1103 1323,1103 1323,1160 325,1160"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>De inuocatione diuorum ne apiculus quidem ha</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0019">
|
||||
<pc:Coords points="216,1156 1326,1156 1326,1212 216,1212"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0020">
|
||||
<pc:Coords points="220,1210 1326,1210 1326,1262 220,1262"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>catis ſanctos, cum ex præce pto Dei ne mo inuocandus</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0021">
|
||||
<pc:Coords points="218,1261 1326,1261 1326,1307 218,1307"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0022">
|
||||
<pc:Coords points="222,1305 1324,1305 1324,1354 222,1354"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>nis. & eruam te, & honorificabis me. Et omnis qui⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0023">
|
||||
<pc:Coords points="221,1353 1324,1353 1324,1415 221,1415"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>cumq; inuocauerit nomen domini, ſaluus erit Sed</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0024">
|
||||
<pc:Coords points="220,1404 1321,1404 1321,1465 220,1465"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>quomodo inuocabitis, in quem non credidiſtis? Quo</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0025">
|
||||
<pc:Coords points="221,1456 1325,1456 1325,1508 221,1508"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0026">
|
||||
<pc:Coords points="222,1509 1323,1509 1323,1559 222,1559"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>turis non legitis cõmemorationem uero ſæpe, non ut</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0027">
|
||||
<pc:Coords points="222,1555 1330,1555 1330,1612 222,1612"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>intercedant pro uobis ſancti, ſed nt meminerit Deus</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0028">
|
||||
<pc:Coords points="219,1604 1325,1604 1325,1664 219,1664"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0029">
|
||||
<pc:Coords points="218,1653 1323,1653 1323,1719 218,1719"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>biſcum agat per miſericordiam, quemadmodum cum</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0030">
|
||||
<pc:Coords points="219,1704 1321,1704 1321,1769 219,1769"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0031">
|
||||
<pc:Coords points="222,1758 1322,1758 1322,1817 222,1817"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ſuæ miſericordiæ & promiſsionis admonere Sic pſal</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0032">
|
||||
<pc:Coords points="224,1809 1324,1809 1324,1866 224,1866"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>mographus dicit, Qui paſcis Iſrael attende, qui de⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0033">
|
||||
<pc:Coords points="222,1858 1320,1858 1320,1913 222,1913"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>ducis uelut ouem Iacob Sic & Moſes orat, Memento</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextLine id="region0001_line0034">
|
||||
<pc:Coords points="345,1909 1320,1909 1320,1963 345,1963"/>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>B 3 domi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextLine>
|
||||
<pc:TextEquiv>
|
||||
<pc:Unicode>phariſei hypocritæ, qui comeditis domos uiduarã ſub
|
||||
prætextu longarum precationum, propterea maiorẽ
|
||||
accipieris condemnationem. Ideo enim ꝙ non oratis
|
||||
ſecundum præſeriptum ſacræ ſcripturæ, nec ex ſpiritu
|
||||
& ueritate ſed iuxta ueſtram propriam conſtitutionẽ,
|
||||
orationes ueſtræ nõ ſiunt Deo acceptæ, neq; ab eo ex⸗
|
||||
audiunt᷑ Eſa, Cum multiplicaueritis orationes ueſtras
|
||||
non exaudiam uos. Chriſtiani uero quia orant iuxta
|
||||
tenorem ſcripturæ, & ex ſpiritu & ueritate, ideo eo⸗
|
||||
rum orationes a Deo exaudiuntur, ſuntq; illi grat iſsi⸗
|
||||
mæ, dicunt enim Pater noſter qui es iu cœlis &c. Vos
|
||||
autem hoc tenore orandi contempto, obmur muratis
|
||||
ueſtras Horas canonicas, hoc eſt, diabolicas ab Anti⸗
|
||||
chriſto inſtitutas. Paulus mauult quinq; uerba in Ec⸗
|
||||
cle ſia loqui in ſenſu, qß decem milia uerborum in lin⸗
|
||||
ua, Quibus uerbis adeo dãnat ueſtras prolixas ora⸗
|
||||
tiones, ut ſi ſemiuncia ſanæ mentis uel mica ſidei eſfet
|
||||
in uobis, eas ſine dubio omitteretis.
|
||||
De inuocatione diuorum ne apiculus quidem ha
|
||||
betur in ſacris literis, quare ter ſtulti eſtis quod inuo⸗
|
||||
catis ſanctos, cum ex præce pto Dei ne mo inuocandus
|
||||
ſit niſi ſolus Deus. Inuoca inquit me in die tribulatio⸗
|
||||
nis. & eruam te, & honorificabis me. Et omnis qui⸗
|
||||
cumq; inuocauerit nomen domini, ſaluus erit Sed
|
||||
quomodo inuocabitis, in quem non credidiſtis? Quo
|
||||
modo credetis ſine uerbo ? Inuocationẽ ergo in ſcrip⸗
|
||||
turis non legitis cõmemorationem uero ſæpe, non ut
|
||||
intercedant pro uobis ſancti, ſed nt meminerit Deus
|
||||
Teſtamenti cum patribus ſanctis pacti, ut ſimiliter uo⸗
|
||||
biſcum agat per miſericordiam, quemadmodum cum
|
||||
ilis egit. Atq; hoc non eſt inuocare ſanctos. ſed Deum
|
||||
ſuæ miſericordiæ & promiſsionis admonere Sic pſal
|
||||
mographus dicit, Qui paſcis Iſrael attende, qui de⸗
|
||||
ducis uelut ouem Iacob Sic & Moſes orat, Memento
|
||||
B 3 domi⸗</pc:Unicode>
|
||||
</pc:TextEquiv>
|
||||
</pc:TextRegion>
|
||||
<pc:ImageRegion id="region0000">
|
||||
<pc:Coords points="5,21 1790,21 1790,302 5,302"/>
|
||||
</pc:ImageRegion>
|
||||
<pc:ImageRegion id="region0002">
|
||||
<pc:Coords points="0,1962 1813,1962 1813,2361 0,2361"/>
|
||||
</pc:ImageRegion>
|
||||
<pc:ImageRegion id="region0003">
|
||||
<pc:Coords points="1316,166 1790,166 1790,238 1316,238"/>
|
||||
</pc:ImageRegion>
|
||||
</pc:Page>
|
||||
</pc:PcGts>
|
4204
src/dinglehopper/tests/data/order.page.xml
Normal file
4204
src/dinglehopper/tests/data/order.page.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,121 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-10-28T08:43:47</Created>
|
||||
<LastChange>1970-01-01T00:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
|
||||
<TextRegion id="r1" orientation="0.0">
|
||||
<Coords points="315,437 315,407 339,407 339,437"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="318,434 318,409 337,409 337,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="0.0">
|
||||
<Coords points="425,436 425,406 450,406 450,436"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="429,434 429,410 446,410 446,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r3" orientation="0.0">
|
||||
<Coords points="233,499 233,467 262,467 262,499"/>
|
||||
<TextLine id="l3">
|
||||
<Coords points="237,496 237,468 258,468 258,496"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r4" orientation="0.0">
|
||||
<Coords points="316,497 316,470 340,470 340,497"/>
|
||||
<TextLine id="l4">
|
||||
<Coords points="319,494 319,472 337,472 337,494"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r5" orientation="0.0">
|
||||
<Coords points="423,501 423,468 451,468 451,501"/>
|
||||
<TextLine id="l5">
|
||||
<Coords points="427,497 427,470 447,470 447,497"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r6" orientation="0.0">
|
||||
<Coords points="237,373 237,347 259,347 259,373"/>
|
||||
<TextLine id="l6">
|
||||
<Coords points="240,372 240,349 256,349 256,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r7" orientation="0.0">
|
||||
<Coords points="312,373 312,347 341,347 341,373"/>
|
||||
<TextLine id="l7">
|
||||
<Coords points="318,372 318,350 338,350 338,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r8" orientation="0.0">
|
||||
<Coords points="428,373 428,349 448,349 448,373"/>
|
||||
<TextLine id="l8">
|
||||
<Coords points="430,373 430,349 445,349 445,373"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r9" orientation="0.0">
|
||||
<Coords points="236,438 236,406 261,406 261,438"/>
|
||||
<TextLine id="l9">
|
||||
<Coords points="238,436 238,408 258,408 258,436"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
134
src/dinglehopper/tests/data/table-order/table-order-0001.xml
Normal file
134
src/dinglehopper/tests/data/table-order/table-order-0001.xml
Normal file
|
@ -0,0 +1,134 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-10-28T08:43:47</Created>
|
||||
<LastChange>1970-01-01T00:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="g1">
|
||||
<RegionRefIndexed index="0" regionRef="r6"/>
|
||||
<RegionRefIndexed index="1" regionRef="r7"/>
|
||||
<RegionRefIndexed index="2" regionRef="r8"/>
|
||||
<RegionRefIndexed index="3" regionRef="r9"/>
|
||||
<RegionRefIndexed index="4" regionRef="r1"/>
|
||||
<RegionRefIndexed index="5" regionRef="r2"/>
|
||||
<RegionRefIndexed index="6" regionRef="r3"/>
|
||||
<RegionRefIndexed index="7" regionRef="r4"/>
|
||||
<RegionRefIndexed index="8" regionRef="r5"/>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="r1" orientation="0.0">
|
||||
<Coords points="315,437 315,407 339,407 339,437"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="318,434 318,409 337,409 337,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="0.0">
|
||||
<Coords points="425,436 425,406 450,406 450,436"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="429,434 429,410 446,410 446,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r3" orientation="0.0">
|
||||
<Coords points="233,499 233,467 262,467 262,499"/>
|
||||
<TextLine id="l3">
|
||||
<Coords points="237,496 237,468 258,468 258,496"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r4" orientation="0.0">
|
||||
<Coords points="316,497 316,470 340,470 340,497"/>
|
||||
<TextLine id="l4">
|
||||
<Coords points="319,494 319,472 337,472 337,494"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r5" orientation="0.0">
|
||||
<Coords points="423,501 423,468 451,468 451,501"/>
|
||||
<TextLine id="l5">
|
||||
<Coords points="427,497 427,470 447,470 447,497"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r6" orientation="0.0">
|
||||
<Coords points="237,373 237,347 259,347 259,373"/>
|
||||
<TextLine id="l6">
|
||||
<Coords points="240,372 240,349 256,349 256,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r7" orientation="0.0">
|
||||
<Coords points="312,373 312,347 341,347 341,373"/>
|
||||
<TextLine id="l7">
|
||||
<Coords points="318,372 318,350 338,350 338,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r8" orientation="0.0">
|
||||
<Coords points="428,373 428,349 448,349 448,373"/>
|
||||
<TextLine id="l8">
|
||||
<Coords points="430,373 430,349 445,349 445,373"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r9" orientation="0.0">
|
||||
<Coords points="236,438 236,406 261,406 261,438"/>
|
||||
<TextLine id="l9">
|
||||
<Coords points="238,436 238,408 258,408 258,436"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
134
src/dinglehopper/tests/data/table-order/table-order-0002.xml
Normal file
134
src/dinglehopper/tests/data/table-order/table-order-0002.xml
Normal file
|
@ -0,0 +1,134 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-10-28T08:43:47</Created>
|
||||
<LastChange>1970-01-01T00:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="0002.png" imageHeight="1123" imageWidth="794">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="g1">
|
||||
<RegionRefIndexed index="0" regionRef="r6"/>
|
||||
<RegionRefIndexed index="1" regionRef="r9"/>
|
||||
<RegionRefIndexed index="2" regionRef="r3"/>
|
||||
<RegionRefIndexed index="3" regionRef="r7"/>
|
||||
<RegionRefIndexed index="4" regionRef="r1"/>
|
||||
<RegionRefIndexed index="5" regionRef="r4"/>
|
||||
<RegionRefIndexed index="6" regionRef="r8"/>
|
||||
<RegionRefIndexed index="7" regionRef="r2"/>
|
||||
<RegionRefIndexed index="8" regionRef="r5"/>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="r1" orientation="0.0">
|
||||
<Coords points="315,437 315,407 339,407 339,437"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="318,434 318,409 337,409 337,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="0.0">
|
||||
<Coords points="425,436 425,406 450,406 450,436"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="429,434 429,410 446,410 446,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r3" orientation="0.0">
|
||||
<Coords points="233,499 233,467 262,467 262,499"/>
|
||||
<TextLine id="l3">
|
||||
<Coords points="237,496 237,468 258,468 258,496"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r4" orientation="0.0">
|
||||
<Coords points="316,497 316,470 340,470 340,497"/>
|
||||
<TextLine id="l4">
|
||||
<Coords points="319,494 319,472 337,472 337,494"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r5" orientation="0.0">
|
||||
<Coords points="423,501 423,468 451,468 451,501"/>
|
||||
<TextLine id="l5">
|
||||
<Coords points="427,497 427,470 447,470 447,497"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r6" orientation="0.0">
|
||||
<Coords points="237,373 237,347 259,347 259,373"/>
|
||||
<TextLine id="l6">
|
||||
<Coords points="240,372 240,349 256,349 256,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r7" orientation="0.0">
|
||||
<Coords points="312,373 312,347 341,347 341,373"/>
|
||||
<TextLine id="l7">
|
||||
<Coords points="318,372 318,350 338,350 338,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r8" orientation="0.0">
|
||||
<Coords points="428,373 428,349 448,349 448,373"/>
|
||||
<TextLine id="l8">
|
||||
<Coords points="430,373 430,349 445,349 445,373"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r9" orientation="0.0">
|
||||
<Coords points="236,438 236,406 261,406 261,438"/>
|
||||
<TextLine id="l9">
|
||||
<Coords points="238,436 238,408 258,408 258,436"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
139
src/dinglehopper/tests/data/table-order/table-region.xml
Normal file
139
src/dinglehopper/tests/data/table-order/table-region.xml
Normal file
|
@ -0,0 +1,139 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-10-28T08:43:47</Created>
|
||||
<LastChange>1970-01-01T00:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="g1">
|
||||
<OrderedGroupIndexed id="r0_order" regionRef="r0" index="0">
|
||||
<RegionRefIndexed index="0" regionRef="r6"/>
|
||||
<RegionRefIndexed index="1" regionRef="r7"/>
|
||||
<RegionRefIndexed index="2" regionRef="r8"/>
|
||||
<RegionRefIndexed index="3" regionRef="r9"/>
|
||||
<RegionRefIndexed index="4" regionRef="r1"/>
|
||||
<RegionRefIndexed index="5" regionRef="r2"/>
|
||||
<RegionRefIndexed index="6" regionRef="r3"/>
|
||||
<RegionRefIndexed index="7" regionRef="r4"/>
|
||||
<RegionRefIndexed index="8" regionRef="r5"/>
|
||||
</OrderedGroupIndexed>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TableRegion id="r0">
|
||||
<Coords points="230,530 230,330 460,330 460,530"/>
|
||||
<TextRegion id="r1" orientation="0.0">
|
||||
<Coords points="315,437 315,407 339,407 339,437"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="318,434 318,409 337,409 337,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="0.0">
|
||||
<Coords points="425,436 425,406 450,406 450,436"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="429,434 429,410 446,410 446,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r3" orientation="0.0">
|
||||
<Coords points="233,499 233,467 262,467 262,499"/>
|
||||
<TextLine id="l3">
|
||||
<Coords points="237,496 237,468 258,468 258,496"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r4" orientation="0.0">
|
||||
<Coords points="316,497 316,470 340,470 340,497"/>
|
||||
<TextLine id="l4">
|
||||
<Coords points="319,494 319,472 337,472 337,494"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r5" orientation="0.0">
|
||||
<Coords points="423,501 423,468 451,468 451,501"/>
|
||||
<TextLine id="l5">
|
||||
<Coords points="427,497 427,470 447,470 447,497"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r6" orientation="0.0">
|
||||
<Coords points="237,373 237,347 259,347 259,373"/>
|
||||
<TextLine id="l6">
|
||||
<Coords points="240,372 240,349 256,349 256,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r7" orientation="0.0">
|
||||
<Coords points="312,373 312,347 341,347 341,373"/>
|
||||
<TextLine id="l7">
|
||||
<Coords points="318,372 318,350 338,350 338,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r8" orientation="0.0">
|
||||
<Coords points="428,373 428,349 448,349 448,373"/>
|
||||
<TextLine id="l8">
|
||||
<Coords points="430,373 430,349 445,349 445,373"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r9" orientation="0.0">
|
||||
<Coords points="236,438 236,406 261,406 261,438"/>
|
||||
<TextLine id="l9">
|
||||
<Coords points="238,436 238,408 258,408 258,436"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</TableRegion>
|
||||
</Page>
|
||||
</PcGts>
|
134
src/dinglehopper/tests/data/table-order/table-unordered.xml
Normal file
134
src/dinglehopper/tests/data/table-order/table-unordered.xml
Normal file
|
@ -0,0 +1,134 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-10-28T08:43:47</Created>
|
||||
<LastChange>1970-01-01T00:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
|
||||
<ReadingOrder>
|
||||
<UnorderedGroup id="g1">
|
||||
<RegionRef regionRef="r6"/>
|
||||
<RegionRef regionRef="r7"/>
|
||||
<RegionRef regionRef="r8"/>
|
||||
<RegionRef regionRef="r9"/>
|
||||
<RegionRef regionRef="r1"/>
|
||||
<RegionRef regionRef="r2"/>
|
||||
<RegionRef regionRef="r3"/>
|
||||
<RegionRef regionRef="r4"/>
|
||||
<RegionRef regionRef="r5"/>
|
||||
</UnorderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="r1" orientation="0.0">
|
||||
<Coords points="315,437 315,407 339,407 339,437"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="318,434 318,409 337,409 337,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>5</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="0.0">
|
||||
<Coords points="425,436 425,406 450,406 450,436"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="429,434 429,410 446,410 446,434"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>6</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r3" orientation="0.0">
|
||||
<Coords points="233,499 233,467 262,467 262,499"/>
|
||||
<TextLine id="l3">
|
||||
<Coords points="237,496 237,468 258,468 258,496"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>7</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r4" orientation="0.0">
|
||||
<Coords points="316,497 316,470 340,470 340,497"/>
|
||||
<TextLine id="l4">
|
||||
<Coords points="319,494 319,472 337,472 337,494"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>8</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r5" orientation="0.0">
|
||||
<Coords points="423,501 423,468 451,468 451,501"/>
|
||||
<TextLine id="l5">
|
||||
<Coords points="427,497 427,470 447,470 447,497"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>9</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r6" orientation="0.0">
|
||||
<Coords points="237,373 237,347 259,347 259,373"/>
|
||||
<TextLine id="l6">
|
||||
<Coords points="240,372 240,349 256,349 256,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>1</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r7" orientation="0.0">
|
||||
<Coords points="312,373 312,347 341,347 341,373"/>
|
||||
<TextLine id="l7">
|
||||
<Coords points="318,372 318,350 338,350 338,372"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>2</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r8" orientation="0.0">
|
||||
<Coords points="428,373 428,349 448,349 448,373"/>
|
||||
<TextLine id="l8">
|
||||
<Coords points="430,373 430,349 445,349 445,373"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>3</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r9" orientation="0.0">
|
||||
<Coords points="236,438 236,406 261,406 261,438"/>
|
||||
<TextLine id="l9">
|
||||
<Coords points="238,436 238,408 258,408 258,436"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode>4</Unicode>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
</Page>
|
||||
</PcGts>
|
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
20186
src/dinglehopper/tests/data/test.alto1.xml
Normal file
20186
src/dinglehopper/tests/data/test.alto1.xml
Normal file
File diff suppressed because it is too large
Load diff
64
src/dinglehopper/tests/data/test.alto2.xml
Normal file
64
src/dinglehopper/tests/data/test.alto2.xml
Normal file
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v2# http://www.loc.gov/standards/alto/alto-v2.0.xsd">
|
||||
<Description>
|
||||
<MeasurementUnit>pixel</MeasurementUnit>
|
||||
<OCRProcessing ID="IdOcr"><ocrProcessingStep><processingDateTime>2017-03-27</processingDateTime><processingSoftware><softwareCreator>ABBYY</softwareCreator><softwareName>ABBYY FineReader Engine</softwareName><softwareVersion>11</softwareVersion></processingSoftware></ocrProcessingStep></OCRProcessing>
|
||||
</Description>
|
||||
<Styles><TextStyle ID="font0" FONTFAMILY="Times New Roman" FONTSIZE="7"/><TextStyle ID="font1" FONTFAMILY="Times New Roman" FONTSIZE="11"/>
|
||||
</Styles>
|
||||
<Layout>
|
||||
<Page ID="Page1" PHYSICAL_IMG_NR="1" HEIGHT="2500" WIDTH="1720">
|
||||
<TopMargin HEIGHT="172" WIDTH="1720" VPOS="0" HPOS="0">
|
||||
</TopMargin>
|
||||
<LeftMargin HEIGHT="2016" WIDTH="341" VPOS="172" HPOS="0">
|
||||
</LeftMargin>
|
||||
<RightMargin HEIGHT="2016" WIDTH="111" VPOS="172" HPOS="1609">
|
||||
</RightMargin>
|
||||
<BottomMargin HEIGHT="312" WIDTH="1720" VPOS="2188" HPOS="0">
|
||||
</BottomMargin>
|
||||
<PrintSpace HEIGHT="2016" WIDTH="1268" VPOS="172" HPOS="341">
|
||||
<TextBlock ID="Page1_Block1" HEIGHT="43" WIDTH="72" VPOS="174" HPOS="936" language="de" STYLEREFS="font1">
|
||||
<TextLine HEIGHT="31" WIDTH="60" VPOS="180" HPOS="942"><String STYLE="bold" WC="0.676666677" CONTENT="142" HEIGHT="31" WIDTH="60" VPOS="180" HPOS="942"/></TextLine>
|
||||
</TextBlock>
|
||||
<ComposedBlock ID="Page1_Block2" HEIGHT="1306" WIDTH="1266" VPOS="257" HPOS="341" TYPE="container"><Shape><Polygon POINTS="348,262 1610,262 1610,1564 348,1564 348,262"/></Shape>
|
||||
<TextBlock ID="Page1_Block3" HEIGHT="776" WIDTH="1261" VPOS="257" HPOS="343" language="de" STYLEREFS="font1"><Shape><Polygon POINTS="350,262 1610,262 1610,708 992,708 992,1034 350,1034 350,262"/></Shape>
|
||||
<TextLine HEIGHT="50" WIDTH="1223" VPOS="267" HPOS="363"><String WC="0.6899999976" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="271" HPOS="363"/><SP WIDTH="16" VPOS="272" HPOS="410"/><String WC="0.7875000238" CONTENT="Zugtiere" HEIGHT="44" WIDTH="142" VPOS="270" HPOS="427"/><SP WIDTH="20" VPOS="281" HPOS="570"/><String WC="0.9499999881" CONTENT="eines" HEIGHT="34" WIDTH="82" VPOS="271" HPOS="591"/><SP WIDTH="10" VPOS="272" HPOS="674"/><String WC="0.6349999905" CONTENT="Joches" HEIGHT="42" WIDTH="113" VPOS="272" HPOS="685"/><SP WIDTH="15" VPOS="271" HPOS="799"/><String WC="0.6009091139" CONTENT="(griechisch" HEIGHT="45" WIDTH="161" VPOS="270" HPOS="815"/><SP WIDTH="19" VPOS="271" HPOS="977"/><String WC="0.7699999809" CONTENT="zygos)," HEIGHT="44" WIDTH="126" VPOS="269" HPOS="997"/><SP WIDTH="21" VPOS="272" HPOS="1124"/><String WC="0.7099999785" CONTENT="so" HEIGHT="42" WIDTH="27" VPOS="271" HPOS="1146"/><SP WIDTH="19" VPOS="280" HPOS="1174"/><String WC="0.6679999828" CONTENT="nennt" HEIGHT="32" WIDTH="94" VPOS="272" HPOS="1194"/><SP WIDTH="19" VPOS="272" HPOS="1289"/><String WC="0.4133333266" CONTENT="man" HEIGHT="23" WIDTH="72" VPOS="281" HPOS="1309"/><SP WIDTH="21" VPOS="271" HPOS="1382"/><String WC="0.5099999905" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="271" HPOS="1404"/><SP WIDTH="15" VPOS="272" HPOS="1451"/><String WC="0.8700000048" CONTENT="Zporen" HEIGHT="43" WIDTH="119" VPOS="271" HPOS="1467"/></TextLine>
|
||||
<TextLine HEIGHT="51" WIDTH="1224" VPOS="321" HPOS="363"><String WC="0.8133333325" CONTENT="der" HEIGHT="34" WIDTH="50" VPOS="325" HPOS="363"/><SP WIDTH="24" VPOS="327" HPOS="414"/><String WC="0.8700000048" CONTENT="Tonjugaten" HEIGHT="43" WIDTH="197" VPOS="326" HPOS="439"/><SP WIDTH="32" VPOS="337" HPOS="637"/><String WC="0.6499999762" CONTENT="auch" HEIGHT="43" WIDTH="70" VPOS="326" HPOS="670"/><SP WIDTH="31" VPOS="326" HPOS="741"/><String WC="0.7120000124" CONTENT="Jochsporen" HEIGHT="43" WIDTH="185" VPOS="326" HPOS="773"/><SP WIDTH="37" VPOS="336" HPOS="959"/><String WC="0.9200000167" CONTENT="oder" HEIGHT="32" WIDTH="71" VPOS="327" HPOS="997"/><SP WIDTH="31" VPOS="326" HPOS="1069"/><String WC="0.7072727084" CONTENT="Zpgosporen." HEIGHT="44" WIDTH="203" VPOS="325" HPOS="1101"/><SP WIDTH="53" VPOS="326" HPOS="1305"/><String WC="0.5320000052" CONTENT="Daher" HEIGHT="43" WIDTH="107" VPOS="326" HPOS="1359"/><SP WIDTH="36" VPOS="325" HPOS="1467"/><String WC="0.5720000267" CONTENT="heißt" HEIGHT="43" WIDTH="83" VPOS="325" HPOS="1504"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="655" VPOS="379" HPOS="363"><String WC="0.8650000095" CONTENT="auch" HEIGHT="43" WIDTH="70" VPOS="381" HPOS="363"/><SP WIDTH="29" VPOS="381" HPOS="434"/><String WC="0.6299999952" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="381" HPOS="464"/><SP WIDTH="24" VPOS="392" HPOS="511"/><String WC="0.7699999809" CONTENT="ganze" HEIGHT="33" WIDTH="94" VPOS="391" HPOS="536"/><SP WIDTH="24" VPOS="381" HPOS="631"/><String WC="0.7371428609" CONTENT="Ordnung" HEIGHT="43" WIDTH="154" VPOS="381" HPOS="656"/><SP WIDTH="24" VPOS="382" HPOS="811"/><String WC="0.800999999" CONTENT="Jochalgen." HEIGHT="43" WIDTH="182" VPOS="381" HPOS="836"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1182" VPOS="432" HPOS="406"><String WC="0.3966666758" CONTENT="Wir" HEIGHT="33" WIDTH="69" VPOS="436" HPOS="406"/><SP WIDTH="24" VPOS="446" HPOS="475"/><String WC="0.6949999928" CONTENT="wollen" HEIGHT="33" WIDTH="112" VPOS="436" HPOS="499"/><SP WIDTH="24" VPOS="445" HPOS="611"/><String WC="0.5166666508" CONTENT="nun" HEIGHT="23" WIDTH="65" VPOS="446" HPOS="635"/><SP WIDTH="24" VPOS="446" HPOS="700"/><String WC="0.7570000291" CONTENT="versuchen," HEIGHT="44" WIDTH="166" VPOS="435" HPOS="724"/><SP WIDTH="27" VPOS="446" HPOS="890"/><String WC="0.6733333468" CONTENT="uns" HEIGHT="23" WIDTH="59" VPOS="446" HPOS="917"/><SP WIDTH="25" VPOS="446" HPOS="976"/><String WC="0.6725000143" CONTENT="eine" HEIGHT="33" WIDTH="66" VPOS="436" HPOS="1001"/><SP WIDTH="25" VPOS="436" HPOS="1067"/><String WC="0.6690909266" CONTENT="Vorstellung" HEIGHT="44" WIDTH="192" VPOS="435" HPOS="1092"/><SP WIDTH="25" VPOS="446" HPOS="1284"/><String WC="0.8466666937" CONTENT="von" HEIGHT="23" WIDTH="62" VPOS="446" HPOS="1309"/><SP WIDTH="25" VPOS="436" HPOS="1371"/><String WC="0.5866666436" CONTENT="den" HEIGHT="32" WIDTH="56" VPOS="436" HPOS="1396"/><SP WIDTH="25" VPOS="436" HPOS="1452"/><String WC="0.7366666794" CONTENT="Zchon-" HEIGHT="44" WIDTH="111" VPOS="435" HPOS="1477"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1224" VPOS="486" HPOS="363"><String WC="0.7181817889" CONTENT="heitsformen" HEIGHT="45" WIDTH="199" VPOS="489" HPOS="363"/><SP WIDTH="32" VPOS="490" HPOS="563"/><String WC="0.8633333445" CONTENT="der" HEIGHT="33" WIDTH="50" VPOS="490" HPOS="596"/><SP WIDTH="31" VPOS="491" HPOS="647"/><String WC="0.7749999762" CONTENT="in" HEIGHT="33" WIDTH="30" VPOS="491" HPOS="679"/><SP WIDTH="31" VPOS="501" HPOS="710"/><String WC="0.5479999781" CONTENT="viele" HEIGHT="33" WIDTH="75" VPOS="491" HPOS="742"/><SP WIDTH="32" VPOS="502" HPOS="818"/><String WC="0.7345454693" CONTENT="artenreiche" HEIGHT="44" WIDTH="181" VPOS="490" HPOS="851"/><SP WIDTH="31" VPOS="491" HPOS="1033"/><String WC="0.7277777791" CONTENT="Gattungen" HEIGHT="43" WIDTH="181" VPOS="490" HPOS="1065"/><SP WIDTH="32" VPOS="501" HPOS="1247"/><String WC="0.7766666412" CONTENT="geteilten" HEIGHT="43" WIDTH="140" VPOS="490" HPOS="1280"/><SP WIDTH="32" VPOS="491" HPOS="1421"/><String WC="0.7514285445" CONTENT="Familie" HEIGHT="44" WIDTH="133" VPOS="489" HPOS="1454"/></TextLine>
|
||||
<TextLine HEIGHT="51" WIDTH="1225" VPOS="540" HPOS="362"><String WC="0.7633333206" CONTENT="der" HEIGHT="32" WIDTH="51" VPOS="546" HPOS="362"/><SP WIDTH="24" VPOS="544" HPOS="414"/><String WC="0.4366666675" CONTENT="OesmiäiLLeen" HEIGHT="35" WIDTH="254" VPOS="543" HPOS="439"/><SP WIDTH="29" VPOS="555" HPOS="694"/><String WC="0.8199999928" CONTENT="zu" HEIGHT="31" WIDTH="35" VPOS="556" HPOS="724"/><SP WIDTH="24" VPOS="556" HPOS="760"/><String WC="0.5699999928" CONTENT="machen." HEIGHT="44" WIDTH="131" VPOS="545" HPOS="785"/><SP WIDTH="47" VPOS="546" HPOS="917"/><String WC="0.7466666698" CONTENT="Vas" HEIGHT="33" WIDTH="68" VPOS="546" HPOS="965"/><SP WIDTH="25" VPOS="556" HPOS="1034"/><String WC="0.6685714126" CONTENT="gelingt" HEIGHT="43" WIDTH="116" VPOS="545" HPOS="1060"/><SP WIDTH="24" VPOS="545" HPOS="1177"/><String WC="0.5785714388" CONTENT="leicht," HEIGHT="43" WIDTH="95" VPOS="545" HPOS="1202"/><SP WIDTH="31" VPOS="556" HPOS="1298"/><String WC="0.6675000191" CONTENT="wenn" HEIGHT="23" WIDTH="90" VPOS="556" HPOS="1330"/><SP WIDTH="23" VPOS="556" HPOS="1421"/><String WC="0.5666666627" CONTENT="wir" HEIGHT="35" WIDTH="58" VPOS="544" HPOS="1445"/><SP WIDTH="23" VPOS="555" HPOS="1504"/><String WC="0.8000000119" CONTENT="uns" HEIGHT="23" WIDTH="59" VPOS="555" HPOS="1528"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1225" VPOS="596" HPOS="362"><String WC="0.6399999857" CONTENT="selbst" HEIGHT="42" WIDTH="84" VPOS="600" HPOS="362"/><SP WIDTH="23" VPOS="603" HPOS="447"/><String WC="0.80400002" CONTENT="etwas" HEIGHT="33" WIDTH="98" VPOS="601" HPOS="471"/><SP WIDTH="23" VPOS="601" HPOS="570"/><String WC="0.6587499976" CONTENT="Material" HEIGHT="34" WIDTH="156" VPOS="600" HPOS="594"/><SP WIDTH="24" VPOS="601" HPOS="751"/><String WC="0.7300000191" CONTENT="holen," HEIGHT="44" WIDTH="99" VPOS="600" HPOS="776"/><SP WIDTH="25" VPOS="600" HPOS="876"/><String WC="0.7516666651" CONTENT="höchst" HEIGHT="43" WIDTH="95" VPOS="600" HPOS="902"/><SP WIDTH="22" VPOS="603" HPOS="998"/><String WC="0.5454545617" CONTENT="mangelhaft," HEIGHT="44" WIDTH="206" VPOS="600" HPOS="1021"/><SP WIDTH="25" VPOS="610" HPOS="1228"/><String WC="0.7599999905" CONTENT="wenn" HEIGHT="23" WIDTH="90" VPOS="610" HPOS="1254"/><SP WIDTH="23" VPOS="610" HPOS="1345"/><String WC="0.6299999952" CONTENT="wir" HEIGHT="34" WIDTH="58" VPOS="600" HPOS="1369"/><SP WIDTH="23" VPOS="611" HPOS="1428"/><String WC="0.8100000024" CONTENT="uns" HEIGHT="24" WIDTH="59" VPOS="610" HPOS="1452"/><SP WIDTH="20" VPOS="610" HPOS="1512"/><String WC="0.5966666937" CONTENT="auf" HEIGHT="42" WIDTH="54" VPOS="600" HPOS="1533"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1224" VPOS="651" HPOS="362"><String WC="0.7933333516" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="655" HPOS="362"/><SP WIDTH="23" VPOS="655" HPOS="409"/><String WC="0.8428571224" CONTENT="Lektüre" HEIGHT="35" WIDTH="129" VPOS="654" HPOS="433"/><SP WIDTH="24" VPOS="655" HPOS="563"/><String WC="0.6150000095" CONTENT="dieses" HEIGHT="42" WIDTH="92" VPOS="655" HPOS="588"/><SP WIDTH="23" VPOS="656" HPOS="681"/><String WC="0.8766666651" CONTENT="Buches" HEIGHT="43" WIDTH="115" VPOS="655" HPOS="705"/><SP WIDTH="30" VPOS="655" HPOS="821"/><String WC="0.6575000286" CONTENT="beschränken." HEIGHT="45" WIDTH="211" VPOS="654" HPOS="852"/><SP WIDTH="46" VPOS="656" HPOS="1064"/><String WC="0.5699999928" CONTENT="Das" HEIGHT="34" WIDTH="68" VPOS="655" HPOS="1111"/><SP WIDTH="23" VPOS="656" HPOS="1180"/><String WC="0.7912499905" CONTENT="Material" HEIGHT="33" WIDTH="156" VPOS="655" HPOS="1204"/><SP WIDTH="24" VPOS="655" HPOS="1361"/><String WC="0.8199999928" CONTENT="ist" HEIGHT="42" WIDTH="33" VPOS="655" HPOS="1386"/><SP WIDTH="23" VPOS="655" HPOS="1420"/><String WC="0.6716666818" CONTENT="leicht" HEIGHT="44" WIDTH="83" VPOS="654" HPOS="1444"/><SP WIDTH="22" VPOS="657" HPOS="1528"/><String WC="0.6999999881" CONTENT="zu" HEIGHT="31" WIDTH="35" VPOS="665" HPOS="1551"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="608" VPOS="707" HPOS="361"><String WC="0.6736363769" CONTENT="beschaffen." HEIGHT="43" WIDTH="175" VPOS="709" HPOS="361"/><SP WIDTH="30" VPOS="710" HPOS="537"/><String WC="0.6533333063" CONTENT="Man" HEIGHT="33" WIDTH="84" VPOS="710" HPOS="568"/><SP WIDTH="22" VPOS="710" HPOS="653"/><String WC="0.6228571534" CONTENT="sammelt" HEIGHT="42" WIDTH="137" VPOS="710" HPOS="676"/><SP WIDTH="20" VPOS="712" HPOS="814"/><String WC="0.7666666508" CONTENT="aus" HEIGHT="24" WIDTH="57" VPOS="720" HPOS="835"/><SP WIDTH="20" VPOS="710" HPOS="893"/><String WC="0.5966666937" CONTENT="den" HEIGHT="33" WIDTH="55" VPOS="710" HPOS="914"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="607" VPOS="762" HPOS="364"><String WC="0.7990909219" CONTENT="Torflöchern" HEIGHT="44" WIDTH="195" VPOS="763" HPOS="364"/><SP WIDTH="16" VPOS="764" HPOS="559"/><String WC="0.9300000072" CONTENT="der" HEIGHT="33" WIDTH="52" VPOS="764" HPOS="575"/><SP WIDTH="8" VPOS="764" HPOS="627"/><String WC="0.7636363506" CONTENT="Niedermoore" HEIGHT="34" WIDTH="217" VPOS="764" HPOS="635"/><SP WIDTH="11" VPOS="765" HPOS="852"/><String WC="0.7620000243" CONTENT="Moose" HEIGHT="42" WIDTH="108" VPOS="765" HPOS="863"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="608" VPOS="817" HPOS="363"><String WC="1." CONTENT="oder" HEIGHT="33" WIDTH="70" VPOS="819" HPOS="363"/><SP WIDTH="28" VPOS="819" HPOS="434"/><String WC="0.6233333349" CONTENT="höhere" HEIGHT="45" WIDTH="111" VPOS="818" HPOS="463"/><SP WIDTH="28" VPOS="820" HPOS="575"/><String WC="0.6035714149" CONTENT="Wasserpflanzen" HEIGHT="44" WIDTH="260" VPOS="818" HPOS="604"/><SP WIDTH="29" VPOS="818" HPOS="865"/><String WC="0.7839999795" CONTENT="(sehr" HEIGHT="45" WIDTH="76" VPOS="818" HPOS="895"/></TextLine>
|
||||
<TextLine HEIGHT="46" WIDTH="609" VPOS="872" HPOS="362"><String WC="0.6299999952" CONTENT="ist" HEIGHT="42" WIDTH="35" VPOS="874" HPOS="362"/><SP WIDTH="25" VPOS="875" HPOS="398"/><String WC="0.9666666389" CONTENT="der" HEIGHT="33" WIDTH="51" VPOS="875" HPOS="424"/><SP WIDTH="25" VPOS="875" HPOS="476"/><String WC="0.5278571248" CONTENT="Wasserschlauch" HEIGHT="44" WIDTH="245" VPOS="874" HPOS="502"/><SP WIDTH="25" VPOS="874" HPOS="748"/><String WC="0.8245454431" CONTENT="Utricularia" HEIGHT="36" WIDTH="197" VPOS="873" HPOS="774"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="608" VPOS="927" HPOS="361"><String WC="0.7950000167" CONTENT="zu" HEIGHT="32" WIDTH="36" VPOS="939" HPOS="361"/><SP WIDTH="24" VPOS="939" HPOS="398"/><String WC="0.7300000191" CONTENT="empfehlen)," HEIGHT="44" WIDTH="194" VPOS="928" HPOS="423"/><SP WIDTH="32" VPOS="930" HPOS="618"/><String WC="0.9433333278" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="929" HPOS="651"/><SP WIDTH="29" VPOS="940" HPOS="698"/><String WC="0.5666666627" CONTENT="mit" HEIGHT="33" WIDTH="56" VPOS="930" HPOS="728"/><SP WIDTH="23" VPOS="930" HPOS="785"/><String WC="0.7674999833" CONTENT="braunem," HEIGHT="44" WIDTH="160" VPOS="929" HPOS="809"/></TextLine>
|
||||
<TextLine HEIGHT="49" WIDTH="606" VPOS="980" HPOS="362"><String WC="0.6863636374" CONTENT="schlickigem" HEIGHT="43" WIDTH="176" VPOS="984" HPOS="362"/><SP WIDTH="32" VPOS="981" HPOS="539"/><String WC="0.6887500286" CONTENT="Überzüge" HEIGHT="45" WIDTH="157" VPOS="981" HPOS="572"/><SP WIDTH="31" VPOS="984" HPOS="730"/><String WC="0.5857142806" CONTENT="besetzt" HEIGHT="45" WIDTH="101" VPOS="983" HPOS="762"/><SP WIDTH="32" VPOS="985" HPOS="864"/><String WC="0.8379999995" CONTENT="sind." HEIGHT="42" WIDTH="71" VPOS="984" HPOS="897"/></TextLine>
|
||||
</TextBlock>
|
||||
<Illustration ID="Page1_Block4" HEIGHT="232" WIDTH="604" VPOS="1131" HPOS="374"><Shape><Polygon POINTS="378,1134 982,1134 982,1364 378,1364 378,1134"/></Shape></Illustration>
|
||||
<Illustration ID="Page1_Block5" HEIGHT="664" WIDTH="539" VPOS="732" HPOS="1013"><Shape><Polygon POINTS="1019,737 1556,737 1556,1399 1019,1399 1019,737"/></Shape></Illustration>
|
||||
<TextBlock ID="Page1_Block6" HEIGHT="140" WIDTH="1258" VPOS="1423" HPOS="345" language="de" STYLEREFS="font0"><Shape><Polygon POINTS="348,1428 1606,1428 1606,1564 348,1564 348,1428"/></Shape>
|
||||
<TextLine HEIGHT="32" WIDTH="1225" VPOS="1429" HPOS="362"><String WC="0.4325000048" CONTENT="Fig." HEIGHT="26" WIDTH="46" VPOS="1435" HPOS="362"/><SP WIDTH="22" VPOS="1438" HPOS="409"/><String WC="0.3540000021" CONTENT="J54;." HEIGHT="22" WIDTH="44" VPOS="1438" HPOS="432"/><SP WIDTH="33" VPOS="1434" HPOS="477"/><String WC="0.7620000243" CONTENT="Cosmarium." HEIGHT="22" WIDTH="139" VPOS="1433" HPOS="511"/><SP WIDTH="32" VPOS="1432" HPOS="651"/><String WC="0.4550000131" CONTENT="A." HEIGHT="21" WIDTH="30" VPOS="1432" HPOS="684"/><SP WIDTH="19" VPOS="1432" HPOS="715"/><String WC="0.7699999809" CONTENT="C." HEIGHT="21" WIDTH="25" VPOS="1432" HPOS="735"/><SP WIDTH="23" VPOS="1439" HPOS="761"/><String WC="0.6628571153" CONTENT="margaritaceum," HEIGHT="28" WIDTH="184" VPOS="1431" HPOS="785"/><SP WIDTH="30" VPOS="1432" HPOS="970"/><String WC="0.4524999857" CONTENT="Fig." HEIGHT="27" WIDTH="46" VPOS="1432" HPOS="1001"/><SP WIDTH="15" VPOS="1435" HPOS="1048"/><String WC="0.5400000215" CONTENT="J35." HEIGHT="23" WIDTH="44" VPOS="1435" HPOS="1064"/><SP WIDTH="31" VPOS="1432" HPOS="1109"/><String WC="0.7572727203" CONTENT="Clostcrium." HEIGHT="23" WIDTH="134" VPOS="1430" HPOS="1141"/><SP WIDTH="27" VPOS="1431" HPOS="1276"/><String WC="0.5199999809" CONTENT="A" HEIGHT="19" WIDTH="22" VPOS="1431" HPOS="1304"/><SP WIDTH="18" VPOS="1430" HPOS="1327"/><String WC="0.6366666555" CONTENT="CI." HEIGHT="21" WIDTH="33" VPOS="1430" HPOS="1346"/><SP WIDTH="16" VPOS="1430" HPOS="1380"/><String WC="0.6342856884" CONTENT="lunula," HEIGHT="25" WIDTH="86" VPOS="1430" HPOS="1397"/><SP WIDTH="21" VPOS="1429" HPOS="1484"/><String WC="0.6314285994" CONTENT="Linzel-" HEIGHT="26" WIDTH="81" VPOS="1429" HPOS="1506"/></TextLine>
|
||||
<TextLine HEIGHT="32" WIDTH="1225" VPOS="1461" HPOS="361"><String WC="0.5600000024" CONTENT="a" HEIGHT="13" WIDTH="13" VPOS="1474" HPOS="361"/><SP WIDTH="14" VPOS="1468" HPOS="375"/><String WC="0.5083333254" CONTENT="Lnizelzellp," HEIGHT="26" WIDTH="128" VPOS="1467" HPOS="390"/><SP WIDTH="15" VPOS="1467" HPOS="519"/><String WC="0.25" CONTENT="b" HEIGHT="20" WIDTH="13" VPOS="1466" HPOS="535"/><SP WIDTH="14" VPOS="1466" HPOS="549"/><String WC="0.5822222233" CONTENT="Iochspore" HEIGHT="26" WIDTH="112" VPOS="1465" HPOS="564"/><SP WIDTH="14" VPOS="1471" HPOS="677"/><String WC="0.3700000048" CONTENT="mit" HEIGHT="20" WIDTH="39" VPOS="1465" HPOS="692"/><SP WIDTH="10" VPOS="1465" HPOS="732"/><String WC="0.3100000024" CONTENT="den" HEIGHT="20" WIDTH="37" VPOS="1465" HPOS="743"/><SP WIDTH="13" VPOS="1471" HPOS="781"/><String WC="0.4350000024" CONTENT="entleerten" HEIGHT="21" WIDTH="111" VPOS="1464" HPOS="795"/><SP WIDTH="8" VPOS="1464" HPOS="907"/><String WC="0.7940000296" CONTENT="Zell-" HEIGHT="27" WIDTH="55" VPOS="1464" HPOS="916"/><SP WIDTH="28" VPOS="1471" HPOS="972"/><String WC="0.6333333254" CONTENT="zelle," HEIGHT="25" WIDTH="54" VPOS="1465" HPOS="1001"/><SP WIDTH="15" VPOS="1464" HPOS="1056"/><String WC="0.2800000012" CONTENT="B" HEIGHT="20" WIDTH="18" VPOS="1464" HPOS="1072"/><SP WIDTH="14" VPOS="1464" HPOS="1091"/><String WC="0.9233333468" CONTENT="CI." HEIGHT="21" WIDTH="32" VPOS="1464" HPOS="1106"/><SP WIDTH="15" VPOS="1471" HPOS="1139"/><String WC="0.8188889027" CONTENT="rostratum" HEIGHT="19" WIDTH="111" VPOS="1465" HPOS="1155"/><SP WIDTH="12" VPOS="1463" HPOS="1267"/><String WC="0.2399999946" CONTENT="(nad?" 
HEIGHT="25" WIDTH="62" VPOS="1463" HPOS="1280"/><SP WIDTH="8" VPOS="1464" HPOS="1343"/><String WC="0.2949999869" CONTENT="Präparat" HEIGHT="26" WIDTH="110" VPOS="1463" HPOS="1352"/><SP WIDTH="10" VPOS="1465" HPOS="1463"/><String WC="0.1566666663" CONTENT="uon" HEIGHT="16" WIDTH="41" VPOS="1467" HPOS="1474"/><SP WIDTH="8" VPOS="1463" HPOS="1516"/><String WC="0.3420000076" CONTENT="pvof." HEIGHT="27" WIDTH="61" VPOS="1461" HPOS="1525"/></TextLine>
|
||||
<TextLine HEIGHT="33" WIDTH="1224" VPOS="1493" HPOS="362"><String WC="0.6571428776" CONTENT="häuten." HEIGHT="27" WIDTH="88" VPOS="1499" HPOS="362"/><SP WIDTH="27" VPOS="1499" HPOS="451"/><String WC="0.400000006" CONTENT="B" HEIGHT="20" WIDTH="18" VPOS="1499" HPOS="479"/><SP WIDTH="15" VPOS="1499" HPOS="498"/><String WC="0.6918181777" CONTENT="Linzelzelle" HEIGHT="27" WIDTH="120" VPOS="1497" HPOS="514"/><SP WIDTH="22" VPOS="1503" HPOS="635"/><String WC="0.453333348" CONTENT="von" HEIGHT="14" WIDTH="42" VPOS="1503" HPOS="658"/><SP WIDTH="21" VPOS="1497" HPOS="701"/><String WC="0.9250000119" CONTENT="C." HEIGHT="20" WIDTH="24" VPOS="1497" HPOS="723"/><SP WIDTH="15" VPOS="1497" HPOS="748"/><String WC="0.8562499881" CONTENT="botrytis" HEIGHT="26" WIDTH="89" VPOS="1497" HPOS="764"/><SP WIDTH="18" VPOS="1502" HPOS="854"/><String WC="0.4499999881" CONTENT="mit" HEIGHT="21" WIDTH="40" VPOS="1496" HPOS="873"/><SP WIDTH="19" VPOS="1498" HPOS="914"/><String WC="0.6700000167" CONTENT="un-" HEIGHT="15" WIDTH="38" VPOS="1502" HPOS="934"/><SP WIDTH="29" VPOS="1496" HPOS="973"/><String WC="0.5155555606" CONTENT="Homfeld)," HEIGHT="27" WIDTH="115" VPOS="1496" HPOS="1003"/><SP WIDTH="20" VPOS="1497" HPOS="1119"/><String WC="0.3355555534" CONTENT=")ochspore" HEIGHT="28" WIDTH="112" VPOS="1495" HPOS="1140"/><SP WIDTH="14" VPOS="1501" HPOS="1253"/><String WC="0.853333354" CONTENT="mit" HEIGHT="20" WIDTH="39" VPOS="1495" HPOS="1268"/><SP WIDTH="13" VPOS="1495" HPOS="1308"/><String WC="0.5233333111" CONTENT="den" HEIGHT="20" WIDTH="37" VPOS="1495" HPOS="1322"/><SP WIDTH="13" VPOS="1494" HPOS="1360"/><String WC="0.4783333242" CONTENT="leeren" HEIGHT="22" WIDTH="65" VPOS="1494" HPOS="1374"/><SP WIDTH="10" VPOS="1494" HPOS="1440"/><String WC="0.6600000262" CONTENT="Zellhäuten," HEIGHT="28" WIDTH="135" VPOS="1493" HPOS="1451"/></TextLine>
|
||||
<TextLine HEIGHT="29" WIDTH="839" VPOS="1527" HPOS="568"><String WC="0.4187499881" CONTENT="gleichen" HEIGHT="27" WIDTH="90" VPOS="1529" HPOS="568"/><SP WIDTH="14" VPOS="1529" HPOS="659"/><String WC="0.6687499881" CONTENT="Hälften." HEIGHT="27" WIDTH="97" VPOS="1529" HPOS="674"/><SP WIDTH="411" VPOS="1527" HPOS="772"/><String WC="0.7599999905" CONTENT="in" HEIGHT="21" WIDTH="22" VPOS="1527" HPOS="1184"/><SP WIDTH="13" VPOS="1534" HPOS="1207"/><String WC="0.4300000072" CONTENT="zwei" HEIGHT="26" WIDTH="50" VPOS="1527" HPOS="1221"/><SP WIDTH="15" VPOS="1527" HPOS="1272"/><String WC="0.6629999876" CONTENT="Ansichten." HEIGHT="26" WIDTH="119" VPOS="1527" HPOS="1288"/></TextLine>
|
||||
</TextBlock></ComposedBlock>
|
||||
<TextBlock ID="Page1_Block7" HEIGHT="610" WIDTH="1241" VPOS="1578" HPOS="354" language="de" STYLEREFS="font1"><Shape><Polygon POINTS="357,1583 1596,1583 1596,2189 357,2189 357,1583"/></Shape>
|
||||
<TextLine HEIGHT="49" WIDTH="1224" VPOS="1583" HPOS="363"><String WC="0.6650000215" CONTENT="Zu" HEIGHT="34" WIDTH="45" VPOS="1589" HPOS="363"/><SP WIDTH="37" VPOS="1590" HPOS="409"/><String WC="0.7360000014" CONTENT="hause" HEIGHT="43" WIDTH="97" VPOS="1589" HPOS="447"/><SP WIDTH="37" VPOS="1588" HPOS="545"/><String WC="0.7419999838" CONTENT="spült" HEIGHT="43" WIDTH="77" VPOS="1587" HPOS="583"/><SP WIDTH="32" VPOS="1589" HPOS="661"/><String WC="0.6266666651" CONTENT="man" HEIGHT="24" WIDTH="75" VPOS="1597" HPOS="694"/><SP WIDTH="37" VPOS="1587" HPOS="770"/><String WC="0.9300000072" CONTENT="die" HEIGHT="34" WIDTH="46" VPOS="1587" HPOS="808"/><SP WIDTH="36" VPOS="1596" HPOS="855"/><String WC="0.8169230819" CONTENT="mitgenommenen" HEIGHT="43" WIDTH="280" VPOS="1586" HPOS="892"/><SP WIDTH="38" VPOS="1586" HPOS="1173"/><String WC="0.7077777982" CONTENT="Pröbchen," HEIGHT="43" WIDTH="172" VPOS="1585" HPOS="1212"/><SP WIDTH="39" VPOS="1584" HPOS="1385"/><String WC="0.5366666913" CONTENT="die" HEIGHT="35" WIDTH="46" VPOS="1584" HPOS="1425"/><SP WIDTH="40" VPOS="1594" HPOS="1472"/><String WC="0.6233333349" CONTENT="man" HEIGHT="24" WIDTH="74" VPOS="1594" HPOS="1513"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1639" HPOS="363"><String WC="0.6377778053" CONTENT="natürlich" HEIGHT="43" WIDTH="148" VPOS="1644" HPOS="363"/><SP WIDTH="43" VPOS="1643" HPOS="512"/><String WC="0.5960000157" CONTENT="nicht" HEIGHT="43" WIDTH="75" VPOS="1642" HPOS="556"/><SP WIDTH="41" VPOS="1642" HPOS="632"/><String WC="0.7549999952" CONTENT="literweise" HEIGHT="43" WIDTH="157" VPOS="1642" HPOS="674"/><SP WIDTH="42" VPOS="1642" HPOS="832"/><String WC="0.6299999952" CONTENT="sammelt," HEIGHT="43" WIDTH="156" VPOS="1641" HPOS="875"/><SP WIDTH="43" VPOS="1641" HPOS="1032"/><String WC="1." CONTENT="in" HEIGHT="34" WIDTH="30" VPOS="1641" HPOS="1076"/><SP WIDTH="41" VPOS="1651" HPOS="1107"/><String WC="0.6600000262" CONTENT="wenig" HEIGHT="44" WIDTH="102" VPOS="1640" HPOS="1149"/><SP WIDTH="37" VPOS="1641" HPOS="1252"/><String WC="0.6949999928" CONTENT="Wasser" HEIGHT="42" WIDTH="118" VPOS="1640" HPOS="1290"/><SP WIDTH="37" VPOS="1650" HPOS="1409"/><String WC="0.8700000048" CONTENT="ab" HEIGHT="33" WIDTH="39" VPOS="1640" HPOS="1447"/><SP WIDTH="38" VPOS="1639" HPOS="1487"/><String WC="0.3733333349" CONTENT="und" HEIGHT="33" WIDTH="61" VPOS="1639" HPOS="1526"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1226" VPOS="1693" HPOS="362"><String WC="0.7250000238" CONTENT="bringt" HEIGHT="42" WIDTH="107" VPOS="1699" HPOS="362"/><SP WIDTH="43" VPOS="1700" HPOS="469"/><String WC="0.6857143044" CONTENT="winzige" HEIGHT="44" WIDTH="131" VPOS="1697" HPOS="512"/><SP WIDTH="36" VPOS="1698" HPOS="643"/><String WC="0.7214285731" CONTENT="Partien" HEIGHT="43" WIDTH="129" VPOS="1697" HPOS="679"/><SP WIDTH="46" VPOS="1697" HPOS="808"/><String WC="0.7133333087" CONTENT="des" HEIGHT="35" WIDTH="53" VPOS="1696" HPOS="854"/><SP WIDTH="46" VPOS="1706" HPOS="907"/><String WC="0.7216666937" CONTENT="abgeklopften" HEIGHT="43" WIDTH="222" VPOS="1696" HPOS="953"/><SP WIDTH="38" VPOS="1696" HPOS="1175"/><String WC="0.5181818008" CONTENT="Scf]lid?es-" HEIGHT="43" WIDTH="151" VPOS="1695" HPOS="1213"/><SP WIDTH="32" VPOS="1705" HPOS="1364"/><String WC="0.7933333516" CONTENT="mit" HEIGHT="35" WIDTH="57" VPOS="1694" HPOS="1396"/><SP WIDTH="37" VPOS="1696" HPOS="1453"/><String WC="0.7400000095" CONTENT="einem" HEIGHT="35" WIDTH="98" VPOS="1694" HPOS="1490"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="1224" VPOS="1749" HPOS="363"><String WC="0.7430769205" CONTENT="Wassertropfen" HEIGHT="43" WIDTH="240" VPOS="1753" HPOS="363"/><SP WIDTH="32" VPOS="1763" HPOS="604"/><String WC="0.6000000238" CONTENT="auf" HEIGHT="42" WIDTH="55" VPOS="1752" HPOS="637"/><SP WIDTH="29" VPOS="1752" HPOS="693"/><String WC="0.6359999776" CONTENT="einen" HEIGHT="34" WIDTH="87" VPOS="1752" HPOS="723"/><SP WIDTH="31" VPOS="1753" HPOS="811"/><String WC="0.7069230676" CONTENT="Objektträger." HEIGHT="44" WIDTH="233" VPOS="1751" HPOS="843"/><SP WIDTH="51" VPOS="1752" HPOS="1077"/><String WC="0.6866666675" CONTENT="Mit" HEIGHT="35" WIDTH="65" VPOS="1750" HPOS="1129"/><SP WIDTH="29" VPOS="1752" HPOS="1195"/><String WC="0.6750000119" CONTENT="zwei" HEIGHT="42" WIDTH="75" VPOS="1750" HPOS="1225"/><SP WIDTH="30" VPOS="1750" HPOS="1301"/><String WC="0.7866666913" CONTENT="feinen" HEIGHT="42" WIDTH="101" VPOS="1750" HPOS="1332"/><SP WIDTH="30" VPOS="1751" HPOS="1434"/><String WC="0.6683333516" CONTENT="Nadeln" HEIGHT="35" WIDTH="122" VPOS="1749" HPOS="1465"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1804" HPOS="363"><String WC="0.7785714269" CONTENT="breitet" HEIGHT="33" WIDTH="109" VPOS="1809" HPOS="363"/><SP WIDTH="23" VPOS="1810" HPOS="473"/><String WC="0.4099999964" CONTENT="man" HEIGHT="24" WIDTH="74" VPOS="1818" HPOS="497"/><SP WIDTH="24" VPOS="1808" HPOS="572"/><String WC="0.8100000024" CONTENT="das" HEIGHT="33" WIDTH="56" VPOS="1808" HPOS="597"/><SP WIDTH="19" VPOS="1808" HPOS="654"/><String WC="0.7633333206" CONTENT="Klümpchen" HEIGHT="43" WIDTH="186" VPOS="1807" HPOS="674"/><SP WIDTH="24" VPOS="1817" HPOS="861"/><String WC="0.678888917" CONTENT="möglichst" HEIGHT="44" WIDTH="151" VPOS="1806" HPOS="886"/><SP WIDTH="23" VPOS="1809" HPOS="1038"/><String WC="0.6850000024" CONTENT="weit" HEIGHT="34" WIDTH="71" VPOS="1806" HPOS="1062"/><SP WIDTH="23" VPOS="1809" HPOS="1134"/><String WC="0.6025000215" CONTENT="aus," HEIGHT="33" WIDTH="68" VPOS="1816" HPOS="1158"/><SP WIDTH="25" VPOS="1805" HPOS="1227"/><String WC="0.7080000043" CONTENT="damit" HEIGHT="34" WIDTH="98" VPOS="1805" HPOS="1253"/><SP WIDTH="23" VPOS="1807" HPOS="1352"/><String WC="1." CONTENT="es" HEIGHT="24" WIDTH="31" VPOS="1815" HPOS="1376"/><SP WIDTH="25" VPOS="1807" HPOS="1408"/><String WC="0.8366666436" CONTENT="übersicht" HEIGHT="44" WIDTH="140" VPOS="1804" HPOS="1434" SUBS_TYPE="HypPart1" SUBS_CONTENT="übersichtlich"/><HYP CONTENT=""/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1224" VPOS="1859" HPOS="363"><String WC="0.6650000215" CONTENT="lich" HEIGHT="43" WIDTH="52" VPOS="1864" HPOS="363" SUBS_TYPE="HypPart2" SUBS_CONTENT="übersichtlich"/><SP WIDTH="31" VPOS="1864" HPOS="416"/><String WC="0.5849999785" CONTENT="wird" HEIGHT="33" WIDTH="76" VPOS="1864" HPOS="448"/><SP WIDTH="31" VPOS="1863" HPOS="525"/><String WC="0.9066666961" CONTENT="und" HEIGHT="34" WIDTH="61" VPOS="1862" HPOS="557"/><SP WIDTH="31" VPOS="1862" HPOS="619"/><String WC="0.8728571534" CONTENT="bedeckt" HEIGHT="34" WIDTH="119" VPOS="1862" HPOS="651"/><SP WIDTH="30" VPOS="1863" HPOS="771"/><String WC="0.7833333611" CONTENT="das" HEIGHT="33" WIDTH="57" VPOS="1863" HPOS="802"/><SP WIDTH="24" VPOS="1862" HPOS="860"/><String WC="0.7537500262" CONTENT="Präparat" HEIGHT="43" WIDTH="161" VPOS="1862" HPOS="885"/><SP WIDTH="27" VPOS="1863" HPOS="1047"/><String WC="0.7566666603" CONTENT="mit" HEIGHT="34" WIDTH="56" VPOS="1861" HPOS="1075"/><SP WIDTH="24" VPOS="1863" HPOS="1132"/><String WC="0.7179999948" CONTENT="einem" HEIGHT="34" WIDTH="96" VPOS="1861" HPOS="1157"/><SP WIDTH="24" VPOS="1861" HPOS="1254"/><String WC="0.6629999876" CONTENT="veckglase." HEIGHT="42" WIDTH="171" VPOS="1861" HPOS="1279"/><SP WIDTH="47" VPOS="1860" HPOS="1451"/><String WC="1." CONTENT="Beim" HEIGHT="34" WIDTH="88" VPOS="1859" HPOS="1499"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1223" VPOS="1914" HPOS="364"><String WC="0.5649999976" CONTENT="Züchen" HEIGHT="43" WIDTH="115" VPOS="1919" HPOS="364"/><SP WIDTH="24" VPOS="1929" HPOS="480"/><String WC="0.8666666746" CONTENT="mit" HEIGHT="35" WIDTH="55" VPOS="1918" HPOS="505"/><SP WIDTH="22" VPOS="1919" HPOS="561"/><String WC="0.8566666842" CONTENT="mittlerer" HEIGHT="33" WIDTH="148" VPOS="1918" HPOS="584"/><SP WIDTH="24" VPOS="1918" HPOS="733"/><String WC="0.6583333611" CONTENT="Vergrößerung" HEIGHT="44" WIDTH="238" VPOS="1917" HPOS="758"/><SP WIDTH="31" VPOS="1927" HPOS="997"/><String WC="0.4524999857" CONTENT="wird" HEIGHT="34" WIDTH="78" VPOS="1916" HPOS="1029"/><SP WIDTH="24" VPOS="1917" HPOS="1108"/><String WC="0.6800000072" CONTENT="man" HEIGHT="25" WIDTH="73" VPOS="1926" HPOS="1133"/><SP WIDTH="25" VPOS="1916" HPOS="1207"/><String WC="0.6316666603" CONTENT="Formen" HEIGHT="42" WIDTH="132" VPOS="1916" HPOS="1233"/><SP WIDTH="24" VPOS="1915" HPOS="1366"/><String WC="0.7300000191" CONTENT="finden," HEIGHT="43" WIDTH="116" VPOS="1914" HPOS="1391"/><SP WIDTH="32" VPOS="1915" HPOS="1508"/><String WC="0.8633333445" CONTENT="die" HEIGHT="33" WIDTH="46" VPOS="1915" HPOS="1541"/></TextLine>
|
||||
<TextLine HEIGHT="48" WIDTH="1222" VPOS="1969" HPOS="365"><String WC="0.6333333254" CONTENT="aus" HEIGHT="23" WIDTH="58" VPOS="1984" HPOS="365"/><SP WIDTH="29" VPOS="1984" HPOS="424"/><String WC="0.7825000286" CONTENT="zwei" HEIGHT="41" WIDTH="74" VPOS="1974" HPOS="454"/><SP WIDTH="31" VPOS="1973" HPOS="529"/><String WC="0.6437500119" CONTENT="einander" HEIGHT="34" WIDTH="147" VPOS="1972" HPOS="561"/><SP WIDTH="30" VPOS="1983" HPOS="709"/><String WC="0.6938889027" CONTENT="gegenüberstehenden" HEIGHT="44" WIDTH="336" VPOS="1972" HPOS="740"/><SP WIDTH="32" VPOS="1972" HPOS="1077"/><String WC="0.5190908909" CONTENT="Halbkreisen" HEIGHT="45" WIDTH="198" VPOS="1970" HPOS="1110"/><SP WIDTH="33" VPOS="1970" HPOS="1309"/><String WC="0.2849999964" CONTENT="in" HEIGHT="34" WIDTH="31" VPOS="1970" HPOS="1343"/><SP WIDTH="33" VPOS="1970" HPOS="1375"/><String WC="0.8033333421" CONTENT="der" HEIGHT="33" WIDTH="51" VPOS="1970" HPOS="1409"/><SP WIDTH="30" VPOS="1971" HPOS="1461"/><String WC="0.8820000291" CONTENT="Mitte" HEIGHT="35" WIDTH="95" VPOS="1969" HPOS="1492"/></TextLine>
|
||||
<TextLine HEIGHT="49" WIDTH="1227" VPOS="2023" HPOS="361"><String WC="0.6323529482" CONTENT="zusammengewachsen" HEIGHT="43" WIDTH="350" VPOS="2028" HPOS="361"/><SP WIDTH="32" VPOS="2038" HPOS="711"/><String WC="0.7599999905" CONTENT="erscheinen" HEIGHT="44" WIDTH="163" VPOS="2027" HPOS="743"/><SP WIDTH="26" VPOS="2025" HPOS="906"/><String WC="0.8854545355" CONTENT="(Cosmarium," HEIGHT="44" WIDTH="238" VPOS="2025" HPOS="932"/><SP WIDTH="31" VPOS="2026" HPOS="1170"/><String WC="0.7774999738" CONTENT="Fig." HEIGHT="42" WIDTH="68" VPOS="2026" HPOS="1201"/><SP WIDTH="30" VPOS="2029" HPOS="1269"/><String WC="0.6140000224" CONTENT="134)," HEIGHT="43" WIDTH="84" VPOS="2024" HPOS="1299"/><SP WIDTH="34" VPOS="2035" HPOS="1383"/><String WC="0.6825000048" CONTENT="oder" HEIGHT="33" WIDTH="72" VPOS="2025" HPOS="1417"/><SP WIDTH="24" VPOS="2034" HPOS="1489"/><String WC="0.6833333373" CONTENT="man" HEIGHT="24" WIDTH="75" VPOS="2034" HPOS="1513"/></TextLine>
|
||||
<TextLine HEIGHT="47" WIDTH="1223" VPOS="2079" HPOS="365"><String WC="0.7799999714" CONTENT="findet" HEIGHT="41" WIDTH="94" VPOS="2083" HPOS="365"/><SP WIDTH="18" VPOS="2083" HPOS="460"/><String WC="0.8355555534" CONTENT="türkische" HEIGHT="44" WIDTH="142" VPOS="2082" HPOS="479"/><SP WIDTH="15" VPOS="2083" HPOS="622"/><String WC="0.6140000224" CONTENT="Halbmonde," HEIGHT="43" WIDTH="203" VPOS="2082" HPOS="638"/><SP WIDTH="20" VPOS="2083" HPOS="842"/><String WC="0.7233333588" CONTENT="die" HEIGHT="34" WIDTH="46" VPOS="2082" HPOS="863"/><SP WIDTH="21" VPOS="2092" HPOS="910"/><String WC="0.5899999738" CONTENT="genau" HEIGHT="33" WIDTH="101" VPOS="2091" HPOS="932"/><SP WIDTH="17" VPOS="2081" HPOS="1034"/><String WC="0.6620000005" CONTENT="durch" HEIGHT="43" WIDTH="86" VPOS="2081" HPOS="1052"/><SP WIDTH="20" VPOS="2081" HPOS="1139"/><String WC="0.6340000033" CONTENT="einen" HEIGHT="35" WIDTH="87" VPOS="2080" HPOS="1160"/><SP WIDTH="15" VPOS="2081" HPOS="1248"/><String WC="0.7910000086" CONTENT="Ouerstrich" HEIGHT="43" WIDTH="168" VPOS="2080" HPOS="1264"/><SP WIDTH="21" VPOS="2080" HPOS="1433"/><String WC="0.5950000286" CONTENT="halbiert" HEIGHT="44" WIDTH="133" VPOS="2079" HPOS="1455"/></TextLine>
|
||||
<TextLine HEIGHT="50" WIDTH="1222" VPOS="2133" HPOS="365"><String WC="0.5674999952" CONTENT="sind" HEIGHT="43" WIDTH="62" VPOS="2137" HPOS="365"/><SP WIDTH="37" VPOS="2137" HPOS="428"/><String WC="0.8000000119" CONTENT="und" HEIGHT="34" WIDTH="61" VPOS="2137" HPOS="466"/><SP WIDTH="38" VPOS="2136" HPOS="528"/><String WC="0.6499999762" CONTENT="an" HEIGHT="24" WIDTH="40" VPOS="2147" HPOS="567"/><SP WIDTH="33" VPOS="2137" HPOS="608"/><String WC="0.8183333278" CONTENT="beiden" HEIGHT="35" WIDTH="107" VPOS="2137" HPOS="642"/><SP WIDTH="34" VPOS="2138" HPOS="750"/><String WC="0.4499999881" CONTENT="Enden" HEIGHT="34" WIDTH="106" VPOS="2137" HPOS="785"/><SP WIDTH="36" VPOS="2137" HPOS="892"/><String WC="0.8600000143" CONTENT="je" HEIGHT="44" WIDTH="27" VPOS="2137" HPOS="929"/><SP WIDTH="34" VPOS="2146" HPOS="957"/><String WC="0.7225000262" CONTENT="eine" HEIGHT="34" WIDTH="64" VPOS="2136" HPOS="992"/><SP WIDTH="33" VPOS="2136" HPOS="1057"/><String WC="0.9139999747" CONTENT="kreisrunde" HEIGHT="35" WIDTH="180" VPOS="2135" HPOS="1091"/><SP WIDTH="33" VPOS="2135" HPOS="1272"/><String WC="0.6079999804" CONTENT="Blase" HEIGHT="43" WIDTH="89" VPOS="2135" HPOS="1306"/><SP WIDTH="33" VPOS="2145" HPOS="1396"/><String WC="0.7266666889" CONTENT="enthalten" HEIGHT="46" WIDTH="157" VPOS="2133" HPOS="1430"/></TextLine>
|
||||
</TextBlock><GraphicalElement ID="Page1_Block8" HEIGHT="184" WIDTH="8" VPOS="900" HPOS="1258"/><GraphicalElement ID="Page1_Block9" HEIGHT="90" WIDTH="3" VPOS="896" HPOS="1427"/><GraphicalElement ID="Page1_Block10" HEIGHT="146" WIDTH="7" VPOS="885" HPOS="1544"/>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
37
src/dinglehopper/tests/data/test.alto3.xml
Normal file
37
src/dinglehopper/tests/data/test.alto3.xml
Normal file
|
@ -0,0 +1,37 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#">
|
||||
<Layout>
|
||||
<Page WIDTH="1148" HEIGHT="1852" PHYSICAL_IMG_NR="0" ID="page_0">
|
||||
<PrintSpace HPOS="0" VPOS="0" WIDTH="1148" HEIGHT="1852">
|
||||
<TextBlock ID="block_3" HPOS="135" VPOS="251" WIDTH="741" HEIGHT="47">
|
||||
<TextLine ID="line_3" HPOS="135" VPOS="251" WIDTH="741" HEIGHT="47">
|
||||
<String ID="string_5" HPOS="135" VPOS="251" WIDTH="65" HEIGHT="34" WC="0.89" CONTENT="über"/><SP WIDTH="19" VPOS="251" HPOS="200"/>
|
||||
<String ID="string_6" HPOS="219" VPOS="256" WIDTH="41" HEIGHT="31" WC="0.96" CONTENT="die"/><SP WIDTH="23" VPOS="256" HPOS="260"/>
|
||||
<String ID="string_7" HPOS="283" VPOS="258" WIDTH="87" HEIGHT="30" WC="0.87" CONTENT="vielen"/><SP WIDTH="16" VPOS="258" HPOS="370"/>
|
||||
<String ID="string_8" HPOS="386" VPOS="259" WIDTH="118" HEIGHT="37" WC="0.96" CONTENT="Sorgen"/><SP WIDTH="14" VPOS="259" HPOS="504"/>
|
||||
<String ID="string_9" HPOS="518" VPOS="265" WIDTH="90" HEIGHT="32" WC="0.21" CONTENT="wegen"/><SP WIDTH="12" VPOS="265" HPOS="608"/>
|
||||
<String ID="string_10" HPOS="620" VPOS="254" WIDTH="130" HEIGHT="42" WC="0.21" CONTENT="deſſelben"/><SP WIDTH="24" VPOS="254" HPOS="750"/>
|
||||
<String ID="string_11" HPOS="774" VPOS="255" WIDTH="102" HEIGHT="43" WC="0.74" CONTENT="vergaß"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
<TextBlock ID="block_4" HPOS="134" VPOS="304" WIDTH="740" HEIGHT="40">
|
||||
<TextLine ID="line_4" HPOS="134" VPOS="304" WIDTH="740" HEIGHT="40">
|
||||
<String ID="string_12" HPOS="134" VPOS="304" WIDTH="203" HEIGHT="40" WC="0.75" CONTENT="Hartkopf,"/><SP WIDTH="30" VPOS="304" HPOS="337"/>
|
||||
<String ID="string_13" HPOS="367" VPOS="310" WIDTH="45" HEIGHT="27" WC="0.93" CONTENT="der"/><SP WIDTH="24" VPOS="310" HPOS="412"/>
|
||||
<String ID="string_14" HPOS="436" VPOS="309" WIDTH="74" HEIGHT="35" WC="0.59" CONTENT="Frau"/><SP WIDTH="22" VPOS="309" HPOS="510"/>
|
||||
<String ID="string_15" HPOS="532" VPOS="306" WIDTH="189" HEIGHT="36" WC="0.23" CONTENT="Amtmännin"/><SP WIDTH="16" VPOS="306" HPOS="721"/>
|
||||
<String ID="string_16" HPOS="737" VPOS="307" WIDTH="66" HEIGHT="34" WC="0.52" CONTENT="das"/><SP WIDTH="16" VPOS="307" HPOS="803"/>
|
||||
<String ID="string_17" HPOS="819" VPOS="318" WIDTH="55" HEIGHT="24" WC="0.0" CONTENT="ver-"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
<TextBlock ID="block_5" HPOS="134" VPOS="356" WIDTH="761" HEIGHT="46">
|
||||
<TextLine ID="line_5" HPOS="134" VPOS="356" WIDTH="761" HEIGHT="46">
|
||||
<String ID="string_18" HPOS="134" VPOS="356" WIDTH="137" HEIGHT="37" WC="0.92" CONTENT="ſprochene"/><SP WIDTH="31" VPOS="356" HPOS="271"/>
|
||||
<String ID="string_19" HPOS="302" VPOS="365" WIDTH="32" HEIGHT="30" WC="0.73" CONTENT="zu"/><SP WIDTH="29" VPOS="365" HPOS="334"/>
|
||||
<String ID="string_20" HPOS="363" VPOS="356" WIDTH="170" HEIGHT="39" WC="0.52" CONTENT="überliefern."/><SP WIDTH="28" VPOS="356" HPOS="533"/>
|
||||
</TextLine>
|
||||
</TextBlock>
|
||||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
1
src/dinglehopper/tests/data/test.txt
Normal file
1
src/dinglehopper/tests/data/test.txt
Normal file
|
@ -0,0 +1 @@
|
|||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
Binary file not shown.
After Width: | Height: | Size: 426 KiB |
|
@ -0,0 +1,348 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd">
|
||||
<Metadata>
|
||||
<Creator/>
|
||||
<Created>2020-09-16T15:51:31</Created>
|
||||
<LastChange>1970-01-01T01:00:00</LastChange>
|
||||
<Comments/>
|
||||
</Metadata>
|
||||
<Page imageFilename="462875_0008.jpg" imageHeight="2396" imageWidth="1504">
|
||||
<ReadingOrder>
|
||||
<OrderedGroup id="g1">
|
||||
<RegionRefIndexed index="0" regionRef="r1"/>
|
||||
<RegionRefIndexed index="1" regionRef="r2"/>
|
||||
<RegionRefIndexed index="2" regionRef="r3"/>
|
||||
</OrderedGroup>
|
||||
</ReadingOrder>
|
||||
<TextRegion id="r1" orientation="0.114332257404385">
|
||||
<Coords points="729,187 952,186 952,218 729,219"/>
|
||||
<TextLine id="l1">
|
||||
<Coords points="729,187 952,186 952,218 729,219"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>— Vl —</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>— Vl —</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode/>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<TextRegion id="r2" orientation="-0.010667742595615">
|
||||
<Coords points="296,269 1390,267 1393,2064 299,2066"/>
|
||||
<TextLine id="l2">
|
||||
<Coords points="301,270 1389,270 1389,306 301,306"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>sondere Schrift daraus zu machen. Locke scheint fort-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>gondere Schrift daraus zu machen. LDocke scheint fort—-</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l3">
|
||||
<Coords points="301,322 1386,322 1386,366 301,366"/>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobach—</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l4">
|
||||
<Coords points="301,375 1387,375 1387,419 301,419"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>tungen zu derselben niederzuschreiben, je nachdem sich</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>tungen zu derselben niederzuschreiben, je nachdem sich</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l5">
|
||||
<Coords points="299,428 1385,428 1385,471 299,471"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>ihm dieselben aufdrängten. Der Tod rief ihn im Jahre</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>ibm dicselben aufdrängten. Der Tod rief ihn im Jahre</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l6">
|
||||
<Coords points="300,482 1385,482 1385,526 300,526"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>1704 ab, bevor die Schrift vollendet war. Somit lagen</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>1704 ab, bevor die Schrift vollendet war. Somit lagen</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l7">
|
||||
<Coords points="301,535 1388,535 1388,578 301,578"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>dem Herausgeber der hinterlassenen Schriften nur mehr</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>dem ILerausgeber der hinterlassenen Schriften nur melr</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l8">
|
||||
<Coords points="301,589 1389,589 1389,633 301,633"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>oder weniger zusammenhängende Fragmente zu dieser</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>oder weniger zusammenhängende Pragmente zu dieser</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l9">
|
||||
<Coords points="301,642 1387,642 1387,685 301,685"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Schrift vor; manche Wiederholungen und Unfertigkeiten</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Schrift vor; manche wiederholungen und Unfertigkeiten</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l10">
|
||||
<Coords points="302,695 1386,695 1386,737 302,737"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>finden ihre Erklärung in diesem Umstande. Trotzdem</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>finden ihre Erklärung in diesem Umsſtande. Trotzdem</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l11">
|
||||
<Coords points="302,747 1210,747 1210,791 302,791"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>ist klar, was Locke in der Hauptsache wollte.</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>ist kKlar, was Locke in der Hauptsache wollte.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l12">
|
||||
<Coords points="388,800 1387,800 1387,844 388,844"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Der Philosoph ging von der Klage darüber aus, dass</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Der Philosoph ging von der Klage darüber aus, dass</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l13">
|
||||
<Coords points="300,852 1388,852 1388,894 300,894"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>der Mensch bei der Leitung seiner Selbst die rechte</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>der Mensch bei der Leitung seiner Selbst die rechitée</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l14">
|
||||
<Coords points="302,905 1389,905 1389,950 302,950"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Hülfe so selten bei seinem Verstande sucht; er wollte</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>IIülfe so selten bei seinem Verstande sucht; er wollte</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l15">
|
||||
<Coords points="301,959 1388,959 1388,1002 301,1002"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>die Thorheit dieser Vernachlässigung in ihren schlimmen</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>die Thorbeit dieser Vernachlässigung in ihren schlimmen</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l16">
|
||||
<Coords points="301,1012 1385,1012 1385,1057 301,1057"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Folgen für die Handlung und Stellung des Menschen im</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Folgen für die Iandlung und Stellung des Menschen im</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l17">
|
||||
<Coords points="299,1066 1386,1066 1386,1110 299,1110"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Leben darlegen. Besonders auf deren Fehlgriffe in der</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Leben darlegen. Besonders auf deren PFeblgriſſe in der</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l18">
|
||||
<Coords points="299,1119 1383,1119 1383,1162 299,1162"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Bechandlung und Verwendung des Verstandes wies er hin.</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Bechandlung und Verwendung des Verstandes wies er hin.</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l19">
|
||||
<Coords points="298,1172 298,1215 1384,1215 1384,1172 298,1172"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Als ersten Fehlgriff rügte er die Weise Derer, die über-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Als ersten Fehlgriff rügte cr die Weise Derer, die über—</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l20">
|
||||
<Coords points="300,1223 1387,1223 1387,1267 300,1267"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>haupt ihren eigenen Verstand selten gebrauchen, vielmehr</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>haupt ilren cigenen Verstand seclten gebrauchen, vielmelir</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l21">
|
||||
<Coords points="300,1268 1386,1268 1386,1318 300,1318"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>nach dem Verstande ihrer Eltern, Nachbarn oder Vor-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>nach dem Veorstande ihrer Eltern, Nachbarn odor Vor—</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l22">
|
||||
<Coords points="300,1329 1388,1329 1388,1374 300,1374"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>gesetzten urtheilen, um der Mühe und Sorge eigenen</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>gesetaten urtheilen, um der Mühe und dorge eigenen</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l23">
|
||||
<Coords points="302,1381 1389,1381 1389,1419 302,1419"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Denkens und Urtheilens überhoben zu sein. Als zweiten</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Denkens und Urtheilens überhboben zu sein. Als 2weiten</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l24">
|
||||
<Coords points="300,1434 1389,1434 1389,1477 300,1477"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Fehlgriſf hob er das häufige Herbeiziehen von Leiden-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Fehlgriſf hob er das häuſige DIerbeiziehen von Leiden—</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l25">
|
||||
<Coords points="300,1488 1388,1488 1388,1533 300,1533"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>schaft der eigenen Wünsche oder des Parteigeistes an</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>schaft der eigenen Wünsche oder des Parteigeistes an</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l26">
|
||||
<Coords points="299,1533 1390,1533 1390,1576 299,1576"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Stelle der Vernunft hervor. Und als dritten Fehler be-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Stelle der Vernunft hervor. Und als dritten Fohler be—</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l27">
|
||||
<Coords points="302,1592 1389,1592 1389,1635 302,1635"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>zeichnete er die ebenfalls nicht seltene Einseitigkeit und</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>zeichnete exr die ebenfalls nicht seltene Einscitigkeit und</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l28">
|
||||
<Coords points="306,1645 1392,1645 1391,1692 305,1692"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>Kurzsichtigkeit der Verstandesbildung. Um diesen Fehlern</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>Kurzsichtigkeit der Verstandesbildung. Um diesen Fohlern</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l29">
|
||||
<Coords points="307,1698 1390,1698 1390,1742 307,1742"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>und den durch sie herbeigeführten Vorurtheilen, welche</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>und den durch sie herbeigeführten Vorurtheilen, welche</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l30">
|
||||
<Coords points="307,1751 1388,1751 1388,1795 307,1795"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>die Erkenntniss der Wahrheit hindern, thunlichst vor-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>die Erkenntniss der Wahrheit hindern, thunlichst vor—</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l31">
|
||||
<Coords points="305,1804 1390,1804 1390,1850 305,1850"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>zubeugen, verlangt Locke mit grossem Nachdrucke und</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>zubeugen, verlangt Locke mit grossem Nachdrucke und</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l32">
|
||||
<Coords points="305,1857 305,1905 1393,1905 1393,1857 305,1857"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>mit der vollen Wärmee eines aufrichtigen Wahrheitsfreundes,</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>mit der vollen Wärmeo eines aufrichtigen Wahrheitsſreundes,</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l33">
|
||||
<Coords points="304,1910 304,1957 1393,1957 1393,1910 304,1910"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>dass der Mensch sich um die rechte Bildung und Leitung</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>dass der Mensch sich um die rechte Bildung und Leitung</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l34">
|
||||
<Coords points="304,1963 1392,1963 1392,2011 304,2011"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>seines Verstandes die gleiche Mühe geben soll, wie er</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>seines Verstandes die gleiche Mühe geben soll, wie er</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextLine id="l35">
|
||||
<Coords points="303,2018 303,2065 846,2065 1393,2064 1393,2018 303,2018"/>
|
||||
<TextEquiv index="0">
|
||||
<Unicode>auf sein leibliches Wohlergehen zu verwenden pflegt. Es</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>auſf sein leibliches Wohlergehen zu verwenden pflegt. LEs</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
<TextEquiv>
|
||||
<Unicode/>
|
||||
</TextEquiv>
|
||||
</TextRegion>
|
||||
<ImageRegion id="r3">
|
||||
<Coords points="0,0 0,2396 1504,2396 1504,0"/>
|
||||
</ImageRegion>
|
||||
</Page>
|
||||
</PcGts>
|
File diff suppressed because it is too large
Load diff
145
src/dinglehopper/tests/extracted_text_test.py
Normal file
145
src/dinglehopper/tests/extracted_text_test.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
import logging
|
||||
import unicodedata
|
||||
from collections import namedtuple
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import seq_align, ExtractedText
|
||||
|
||||
|
||||
def test_text():
    """Check the joined text and the position-to-segment lookup of a nested ExtractedText."""
    children = [
        ExtractedText(segment_id, None, None, segment_text)
        for segment_id, segment_text in (("s0", "foo"), ("s1", "bar"), ("s2", "bazinga"))
    ]
    joined = ExtractedText(None, children, " ", None)

    assert joined.text == "foo bar bazinga"
    assert joined.segment_id_for_pos(0) == "s0"
    # Position 3 is the joiner between "foo" and "bar"; no segment id is reported for it.
    assert joined.segment_id_for_pos(3) is None
    assert joined.segment_id_for_pos(10) == "s2"
|
||||
|
||||
|
||||
def test_normalization_check():
    """ExtractedText raises ValueError for NFD text and accepts the NFC form."""
    word = "Schlyñ"
    decomposed = unicodedata.normalize("NFD", word)
    composed = unicodedata.normalize("NFC", word)

    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
        ExtractedText("foo", None, None, decomposed)
    assert ExtractedText("foo", None, None, composed)
|
||||
|
||||
|
||||
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
|
||||
|
||||
|
||||
def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    def build(segments):
        # Parent text whose child segments are joined by a single space.
        children = [
            ExtractedText(segment_id, None, None, segment_text)
            for segment_id, segment_text in segments
        ]
        return ExtractedText(None, children, " ", None)

    test1 = build([("s0", "foo"), ("s1", "bar"), ("s2", "batzinga")])
    test2 = build(
        [
            ("x0", "foo"),
            ("x1", "bar"),
            # extra .
            ("x2", "."),
            # deletion + different grapheme cluster, m̃ also is two Python characters
            ("x3", "bazim̃ga"),
        ]
    )

    # Offsets are counted in Python characters, because segment_id_for_pos()
    # is queried with positions into .text, while alignment works on clusters.
    left_offset = 0
    right_offset = 0
    alignment = []
    pairs = seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text))
    for left, right in pairs:
        left_id = None if left is None else test1.segment_id_for_pos(left_offset)
        right_id = None if right is None else test2.segment_id_for_pos(right_offset)
        alignment.append(AlignmentElement(left, right, left_id, right_id))

        left_offset += 0 if left is None else len(left)
        right_offset += 0 if right is None else len(right)

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    assert alignment[0] == ("f", "f", "s0", "x0")
    assert alignment[8] == (None, ".", None, "x2")
    assert alignment[12] == ("t", None, "s2", None)
    assert alignment[15] == ("n", "m̃", "s2", "x3")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "attributes,expected_index,expected_log",
    [
        ([], None, None),
        (['index="0"'], 0, None),
        ([""], 0, None),
        (['conf="0.5"'], 0, None),
        (['index="1"', 'index="0"'], 1, None),
        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
        (
            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
            2,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (['index="0"', ""], 0, "TextEquiv without index"),
        (
            ["", 'conf="0.4"'],
            1,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (["", ""], 0, "No index attributes, use first TextEquiv"),
    ],
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Check which TextEquiv of a PAGE TextLine is used, and what gets logged.

    Each entry of ``attributes`` becomes the attribute string of one TextEquiv
    element; ``expected_index`` is the position of the TextEquiv whose text
    should be extracted (None when no text is expected).
    """
    caplog.set_level(logging.INFO)

    namespace = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    candidates = ["Text {0}".format(n) for n in range(len(attributes) + 1)]

    # zip() stops at the shorter sequence, so every attribute string gets the
    # candidate text at the same position.
    equivs = "".join(
        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, content)
        for attr, content in zip(attributes, candidates)
    )
    document = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'.format(
        '<?xml version="1.0"?>', namespace, equivs
    )

    segment = ExtractedText.from_text_segment(
        ET.fromstring(document), {"page": namespace}, textequiv_level="line"
    )
    extracted = segment.text

    if expected_index is not None:
        assert extracted == candidates[expected_index]
    else:
        assert not extracted

    if expected_log is not None:
        assert expected_log in caplog.text
    else:
        # NOTE(review): none of the expected log messages in the parameter list
        # contains "no_index"; confirm that this check can actually fail.
        assert "no_index" not in caplog.text
|
183
src/dinglehopper/tests/test_align.py
Normal file
183
src/dinglehopper/tests/test_align.py
Normal file
|
@ -0,0 +1,183 @@
|
|||
import pytest
|
||||
from .util import unzip
|
||||
from .. import align, seq_align, distance
|
||||
|
||||
|
||||
def test_left_empty():
    """With an empty left text, every right character is paired with None."""
    assert list(align("", "foo")) == [(None, "f"), (None, "o"), (None, "o")]
|
||||
|
||||
|
||||
def test_right_empty():
    """With an empty right text, every left character is paired with None."""
    assert list(align("foo", "")) == [("f", None), ("o", None), ("o", None)]
|
||||
|
||||
|
||||
def test_left_longer():
    """A surplus trailing character on the left is paired with None."""
    aligned = list(align("food", "foo"))
    assert aligned == [("f", "f"), ("o", "o"), ("o", "o"), ("d", None)]
|
||||
|
||||
|
||||
def test_right_longer():
    """A surplus trailing character on the right is paired with None."""
    aligned = list(align("foo", "food"))
    assert aligned == [("f", "f"), ("o", "o"), ("o", "o"), (None, "d")]
|
||||
|
||||
|
||||
def test_some_diff():
    """Substitutions plus one trailing insertion are aligned column by column."""
    pairs = list(align("abcde", "aaadef"))
    left_column, right_column = unzip(pairs)

    assert list(left_column) == ["a", "b", "c", "d", "e", None]
    assert list(right_column) == ["a", "a", "a", "d", "e", "f"]
|
||||
|
||||
|
||||
def test_longer():
    """Align two sentences that differ by a deletion, an insertion and a substitution."""
    s1 = "Dies ist eine Tst!"
    s2 = "Dies ist ein Test."

    def same(chars):
        # Pairs for a run of characters that is identical on both sides.
        return [(c, c) for c in chars]

    expected = (
        same("Dies ist ein")
        + [("e", None)]  # "eine" vs. "ein"
        + same(" T")
        + [(None, "e")]  # "Tst" vs. "Test"
        + same("st")
        + [("!", ".")]
    )

    result = list(align(s1, s2))  # ; diffprint(*unzip(result))
    assert result == expected
|
||||
|
||||
|
||||
def test_completely_different():
    """Two equally long texts without common characters align to five pairs."""
    pairs = list(align("abcde", "fghij"))
    assert len(pairs) == 5
|
||||
|
||||
|
||||
def test_with_some_fake_ocr_errors():
    """Junk prepended on the OCR side is aligned against None on the GT side."""
    gt_text = "Über die vielen Sorgen wegen desselben vergaß"
    ocr_text = "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"
    junk_prefix = "SomeJunk MoreJunk "
    junk_len = len(junk_prefix)  # 18 characters

    left, right = unzip(list(align(gt_text, ocr_text)))

    # Beginning
    assert list(left[:junk_len]) == [None] * junk_len
    assert list(right[:junk_len]) == list(junk_prefix)

    # End
    assert list(left[-1:]) == ["ß"]
    assert list(right[-1:]) == ["b"]
|
||||
|
||||
|
||||
def test_lines():
    """Test comparing list of lines.

    This mainly serves as documentation for comparing lists of lines.
    """
    first = "This is a line."
    second = "This is another"
    last = "And the last line"
    junk = "J u n k"

    result = list(seq_align([first, second, last], [first, second, junk, last]))
    left, right = unzip(result)

    # The junk line exists only on the right, so the left side has None there.
    assert list(left) == [first, second, None, last]
    assert list(right) == [first, second, junk, last]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This fails with rapidfuzz <2.6 and is flawed anyway")
# FIXME
# This was based on our own implementation that used __eq__ and not __hash__ as
# rapidfuzz does. Need to review this.
def test_lines_similar():
    """
    Test comparing list of lines while using a "weaker equivalence".

    This mainly serves as documentation.
    """

    class SimilarString:
        """String wrapper that compares equal when the edit distance is small."""

        def __init__(self, string):
            self._string = string

        def __eq__(self, other):
            # Just an example!
            shorter = min(len(self._string), len(other._string))
            if shorter <= 0:
                # If either string is empty, the two never count as similar.
                return False
            return distance(self._string, other._string) / shorter < 0.1

        def __ne__(self, other):
            return not self.__eq__(other)

        def __repr__(self):
            return "SimilarString('%s')" % self._string

        def __hash__(self):
            # Hash of the exact string: similar but non-identical strings
            # hash differently even though __eq__ treats them as equal.
            return hash(self._string)

    def wrap(lines):
        return [SimilarString(line) for line in lines]

    gt_lines = ["This is a line.", "This is another", "And the last line"]
    ocr_lines = ["This is a ljne.", "This is another", "J u n k", "And the last line"]

    result = list(seq_align(wrap(gt_lines), wrap(ocr_lines)))
    left, right = unzip(result)

    # The junk line has no counterpart on the left side.
    expected_left = wrap(gt_lines)
    expected_left.insert(2, None)
    assert list(left) == expected_left
    assert list(right) == wrap(ocr_lines)

    # Test __eq__ (i.e. is it a substitution or a similar string?)
    assert list(left)[0] == list(right)[0]
|
41
src/dinglehopper/tests/test_character_error_rate.py
Normal file
41
src/dinglehopper/tests/test_character_error_rate.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import math
|
||||
import unicodedata
|
||||
|
||||
from .. import character_error_rate
|
||||
|
||||
|
||||
def test_character_error_rate():
    """CER of simple string pairs, including empty reference and compared texts."""

    def check(cases):
        for reference, compared, expected in cases:
            assert character_error_rate(reference, compared) == expected

    check(
        [
            ("a", "a", 0),
            ("a", "b", 1 / 1),
            ("Foo", "Bar", 3 / 3),
            ("Foo", "", 3 / 3),
            ("", "", 0),
        ]
    )

    # Empty reference but non-empty compared text: the rate is infinite.
    assert math.isinf(character_error_rate("", "Foo"))

    check(
        [
            ("Foo", "Food", 1 / 3),
            ("Fnord", "Food", 2 / 5),
            ("Müll", "Mull", 1 / 4),
            ("Abstand", "Sand", 4 / 7),
        ]
    )
|
||||
|
||||
|
||||
def test_character_error_rate_hard():
    """CER must count grapheme clusters, not code points."""
    composed = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
    decomposed = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!")  # Different, decomposed!
    assert character_error_rate(composed, decomposed) == 1 / 19

    # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
    with_n_tilde = "Schlyñ"
    assert len(with_n_tilde) == 6
    # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
    with_m_tilde = "Schlym̃"
    assert len(with_m_tilde) == 7

    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
    assert character_error_rate(with_m_tilde, with_n_tilde) == 1 / 6
    assert character_error_rate(with_n_tilde, with_m_tilde) == 1 / 6
|
24
src/dinglehopper/tests/test_edit_distance.py
Normal file
24
src/dinglehopper/tests/test_edit_distance.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
|
||||
from .. import distance
|
||||
|
||||
|
||||
def test_distance():
    """Edit distance on plain words and on words that differ only in Unicode form."""
    for left, right, expected in (("Fnord", "Food", 2), ("Müll", "Mull", 1)):
        assert distance(left, right) == expected

    composed = unicodedata.normalize("NFC", "Schlyñ")
    decomposed = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
    assert distance(composed, decomposed) == 0

    # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
    with_n_tilde = "Schlyñ"
    assert len(with_n_tilde) == 6
    # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
    with_m_tilde = "Schlym̃"
    assert len(with_m_tilde) == 7
    assert distance(with_n_tilde, with_m_tilde) == 1
|
23
src/dinglehopper/tests/test_editops.py
Normal file
23
src/dinglehopper/tests/test_editops.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
import unicodedata
|
||||
|
||||
from .. import editops
|
||||
|
||||
|
||||
def test_editops():
    """Test editops() in cases where dealing with grapheme clusters matters"""

    # In these cases, one of the words has a composed form, the other one does not.
    # (Also, note that old terminal emulators might not render the combining characters
    # correctly, be sure to read in an editor.)
    cases = [
        ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
        ("oͤde", "öde", [("replace", 0, 0)]),
    ]
    for left, right, expected_ops in cases:
        assert editops(left, right) == expected_ops
|
||||
|
||||
|
||||
def test_editops_canonically_equivalent():
    """Canonically equivalent strings must yield no edit operations."""
    decomposed_names = ("LATIN SMALL LETTER N", "COMBINING TILDE")
    left = "".join(unicodedata.lookup(name) for name in decomposed_names)
    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")

    # Different code point sequences, but identical after NFC normalization.
    assert left != right
    nfc_left = unicodedata.normalize("NFC", left)
    nfc_right = unicodedata.normalize("NFC", right)
    assert nfc_left == nfc_right

    assert editops(left, right) == []
|
27
src/dinglehopper/tests/test_integ_align.py
Normal file
27
src/dinglehopper/tests/test_integ_align.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import align, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_align_page_files():
    """Align a ground truth PAGE file against a fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # → 2 elements in the alignment should be different, the ligature is
    # (currently) not counted due to normalization.
    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.

    gt_path = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_path = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))

    mismatches = [(left, right) for left, right in align(gt, ocr) if left != right]
    for left, right in mismatches:
        print(left, right)
    assert len(mismatches) == 2
|
29
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
29
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_bigger_texts():
    """Compute the CER for a larger GT/OCR pair as a smoke test."""
    sample_dir = os.path.join(data_dir, "bigger-texts", "00008228")
    gt = page_text(ET.parse(os.path.join(sample_dir, "00008228.gt.xml")))
    ocr = alto_text(ET.parse(os.path.join(sample_dir, "00008228-00236534.gt4hist.xml")))

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    cer = character_error_rate(gt, ocr)
    assert cer >= 0.0
|
|
@ -0,0 +1,59 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_character_error_rate_between_page_files():
    """CER between the ground truth PAGE file and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # The fi ligature does not count.
    gt_path = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_path = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))

    # The error count is normalized by the GT length in grapheme clusters.
    cluster_count = sum(1 for _ in grapheme_clusters(gt))

    assert character_error_rate(gt, ocr) == 2 / cluster_count
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_character_error_rate_between_page_alto():
    """A PAGE ground truth and an ALTO OCR file with identical text give CER 0."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt_tree = ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan.gt.page.xml"))
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, "lorem-ipsum-scan.ocr.tesseract.alto.xml")
    )
    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    assert gt == ocr
    assert character_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_character_error_rate_between_page_alto_2():
    """CER between the "bad" lorem-ipsum PAGE ground truth and ALTO OCR result."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt_tree = ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan-bad.gt.page.xml"))
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml")
    )
    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    assert character_error_rate(gt, ocr) == 8 / 591  # Manually verified
|
41
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
41
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
import os
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
from dinglehopper.cli import process_dir
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Test that the cli/process_dir() processes a directory of files and
    yields JSON and HTML reports.
    """

    initLogging()
    test_dir = os.path.join(data_dir, "directory-test")
    process_dir(
        os.path.join(test_dir, "gt"),
        os.path.join(test_dir, "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )

    expected_reports = (
        "reports/1.xml-report.json",
        "reports/1.xml-report.html",
        "reports/2.xml-report.json",
        "reports/2.xml-report.html",
    )
    for relative_path in expected_reports:
        assert os.path.exists(tmp_path / relative_path)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    Test that the cli/process_dir skips a file if there is no corresponding file
    in the other directory.
    """

    initLogging()
    test_dir = os.path.join(data_dir, "directory-test")
    process_dir(
        os.path.join(test_dir, "gt"),
        os.path.join(test_dir, "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )

    # Exactly four report files are expected in the output directory.
    written_reports = os.listdir(tmp_path / "reports")
    assert len(written_reports) == 2 * 2
|
42
src/dinglehopper/tests/test_integ_cli_valid_json.py
Normal file
42
src/dinglehopper/tests/test_integ_cli_valid_json.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
import json
|
||||
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
|
||||
from ..cli import process
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_json(tmp_path):
    """Test that the cli/process() yields a loadable JSON report"""

    inputs = {"gt.txt": "AAAAA", "ocr.txt": "AAAAB"}

    with working_directory(tmp_path):
        for filename, content in inputs.items():
            with open(filename, "w") as handle:
                handle.write(content)

        with open("gt.txt", "r") as gt_file:
            print(gt_file.read())
        process("gt.txt", "ocr.txt", "report")

        # Echo the raw report first, then parse it in a second pass.
        with open("report.json", "r") as report_file:
            print(report_file.read())
        with open("report.json", "r") as report_file:
            report = json.load(report_file)
        assert report["cer"] == pytest.approx(0.2)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_json_cer_is_infinity(tmp_path):
    """Test that the cli/process() yields a loadable JSON report when CER == inf"""

    inputs = {
        "gt.txt": "",  # Empty to yield CER == inf
        "ocr.txt": "Not important",
    }

    with working_directory(tmp_path):
        for filename, content in inputs.items():
            with open(filename, "w") as handle:
                handle.write(content)

        process("gt.txt", "ocr.txt", "report")

        with open("report.json", "r") as report_file:
            report = json.load(report_file)
        assert report["cer"] == pytest.approx(float("inf"))
|
27
src/dinglehopper/tests/test_integ_differences.py
Normal file
27
src/dinglehopper/tests/test_integ_differences.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
from dinglehopper.cli import process
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_differences(tmp_path):
    """Test that the cli/process() yields a JSON report that includes
    the differences found between the GT and OCR text"""

    initLogging()
    gt_path = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_path = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    process(gt_path, ocr_path, "report", tmp_path, differences=True)

    report_path = tmp_path / "report.json"
    assert os.path.exists(report_path)

    with open(report_path, "r") as report_file:
        report = json.load(report_file)

    expected_differences = {
        "character_level": {"n :: m": 1, "ſ :: f": 1},
        "word_level": {
            "Augenblick :: Augemblick": 1,
            "Verſprochene :: Verfprochene": 1,
        },
    }
    assert report["differences"] == expected_differences
|
55
src/dinglehopper/tests/test_integ_edit_distance_ocr.py
Normal file
55
src/dinglehopper/tests/test_integ_edit_distance_ocr.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import distance, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_distance_between_page_files():
    """Edit distance between the ground truth PAGE file and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # Due to normalization, we don't count the ligature.
    # → 2 differences
    gt_path = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_path = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))
    assert distance(gt, ocr) == 2
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_distance_between_page_alto():
    """A PAGE ground truth and an ALTO OCR file with identical text have distance 0."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt_tree = ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan.gt.page.xml"))
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, "lorem-ipsum-scan.ocr.tesseract.alto.xml")
    )
    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    assert gt == ocr
    assert distance(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_distance_between_page_alto_2():
    """Edit distance between the "bad" lorem-ipsum PAGE ground truth and ALTO OCR result."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt_tree = ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan-bad.gt.page.xml"))
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml")
    )
    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    assert distance(gt, ocr) == 8  # Manually verified
|
44
src/dinglehopper/tests/test_integ_ocrd_cli.py
Normal file
44
src/dinglehopper/tests/test_integ_ocrd_cli.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
import os
|
||||
import shutil
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
from .util import working_directory
|
||||
|
||||
|
||||
from ..ocrd_cli import ocrd_dinglehopper
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
def test_ocrd_cli(tmp_path):
    """Test OCR-D interface

    Copies the test workspace to ``tmp_path``, runs the OCR-D processor on it
    through click's CliRunner and checks the CER of the first JSON report
    found in the output file group.
    """

    # Copy test workspace
    test_workspace_dir_source = Path(data_dir) / "actevedef_718448162"
    test_workspace_dir = tmp_path / "test_ocrd_cli"
    shutil.copytree(test_workspace_dir_source, test_workspace_dir)

    # Run through the OCR-D interface
    with working_directory(test_workspace_dir):
        runner = CliRunner()
        args = [
            "-m",
            "mets.xml",
            "-I",
            "OCR-D-GT-PAGE,OCR-D-OCR-CALAMARI",
            "-O",
            "OCR-D-OCR-CALAMARI-EVAL",
        ]
        # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments.
        # The original argv is restored afterwards so the hack does not leak
        # into tests that run later in the same process.
        saved_argv = sys.argv[:]
        sys.argv[1:] = args
        try:
            result = runner.invoke(ocrd_dinglehopper, args)
        finally:
            sys.argv[:] = saved_argv
        assert result.exit_code == 0

        result_json = list(
            (test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json")
        )
        # Use a context manager so the report file is closed deterministically.
        with open(str(result_json[0])) as report_file:
            report = json.load(report_file)
        assert report["cer"] < 0.03
|
101
src/dinglehopper/tests/test_integ_summarize.py
Normal file
101
src/dinglehopper/tests/test_integ_summarize.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
from .. import cli_summarize
|
||||
|
||||
expected_cer_avg = (0.05 + 0.10) / 2
|
||||
expected_wer_avg = (0.15 + 0.20) / 2
|
||||
expected_diff_c = {"a": 30, "b": 50}
|
||||
expected_diff_w = {"c": 70, "d": 90}
|
||||
|
||||
|
||||
@pytest.fixture
def create_summaries(tmp_path):
    """Create two summary reports with mock data"""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    reports = {
        "report1.json": {
            "cer": 0.05,
            "wer": 0.15,
            "differences": {
                "character_level": {"a": 10, "b": 20},
                "word_level": {"c": 30, "d": 40},
            },
        },
        "report2.json": {
            "cer": 0.10,
            "wer": 0.20,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        },
    }

    for filename, report in reports.items():
        with open(os.path.join(reports_dirname, filename), "w") as f:
            json.dump(report, f)

    # The directory is handed to the tests as a plain string.
    return str(reports_dirname)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that the cli/process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        summary_path = os.path.join(reports_dirname, "summary.json")
        with open(summary_path, "r") as f:
            summary_data = json.load(f)

        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg

        differences = summary_data["differences"]
        assert differences["character_level"] == expected_diff_c
        assert differences["word_level"] == expected_diff_w
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that the cli/process() yields an HTML report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert len(contents) > 0
        expected_snippets = (
            "Number of reports: 2",
            f"Average CER: {round(expected_cer_avg, 4)}",
            f"Average WER: {round(expected_wer_avg, 4)}",
        )
        for snippet in expected_snippets:
            assert snippet in contents
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that the cli/process() does not include reports that are missing a WER value.
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries

        # This third report has no WER value and should not be included in the summary
        incomplete_report = {
            "cer": 0.10,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        }
        incomplete_path = os.path.join(reports_dirname, "report3-missing-wer.json")
        with open(incomplete_path, "w") as f:
            json.dump(incomplete_report, f)

        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert "Number of reports: 2" in contents  # report3 is not included
|
25
src/dinglehopper/tests/test_integ_table_extraction.py
Normal file
25
src/dinglehopper/tests/test_integ_table_extraction.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import page_text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file,expected_text",
    [
        ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
        ("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
        ("table-unordered.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
    ],
)
@pytest.mark.integration
def test_reading_order_settings(file, expected_text):
    """Text extracted from each table PAGE file must come out in the expected order."""
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    table_dir = os.path.join(tests_dir, "data", "table-order")

    extracted = page_text(ET.parse(os.path.join(table_dir, file)))
    assert extracted == expected_text
|
68
src/dinglehopper/tests/test_integ_word_error_rate_ocr.py
Normal file
68
src/dinglehopper/tests/test_integ_word_error_rate_ocr.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import word_error_rate, words, page_text, alto_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_word_error_rate_between_page_files():
    """WER between the ground truth PAGE file and the fake OCR PAGE file."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
    # the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))

    # Manually verified word count per line
    words_per_line = [7, 6, 5, 8, 7, 6, 7, 8, 6, 7, 7, 5, 6, 8, 8, 7, 7, 6, 5, 4]
    gt_word_count = sum(words_per_line)
    assert len(list(words(gt))) == gt_word_count

    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
    assert word_error_rate(gt, ocr) == 2 / gt_word_count
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_word_error_rate_between_page_alto():
    """A PAGE ground truth and an ALTO OCR file with identical text give WER 0."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt_tree = ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan.gt.page.xml"))
    ocr_tree = ET.parse(
        os.path.join(lorem_dir, "lorem-ipsum-scan.ocr.tesseract.alto.xml")
    )
    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    assert gt == ocr
    assert word_error_rate(gt, ocr) == 0
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_word_error_rate_between_page_alto_2():
    """WER between the "bad" lorem-ipsum PAGE ground truth and ALTO OCR result."""
    lorem_dir = os.path.join(data_dir, "lorem-ipsum")
    gt = page_text(
        ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan-bad.gt.page.xml"))
    )

    # Manually verified word count per line
    words_per_line = [14, 18, 17, 14, 17, 17, 3]
    gt_word_count = sum(words_per_line)
    assert len(list(words(gt))) == gt_word_count

    ocr = alto_text(
        ET.parse(os.path.join(lorem_dir, "lorem-ipsum-scan-bad.ocr.tesseract.alto.xml"))
    )

    # Manually verified, 6 words are wrong, 1 got split (=2 errors)
    assert word_error_rate(gt, ocr) == 7 / gt_word_count
|
185
src/dinglehopper/tests/test_ocr_files.py
Normal file
185
src/dinglehopper/tests/test_ocr_files.py
Normal file
|
@ -0,0 +1,185 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
import lxml.etree as ET
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
from .util import working_directory
|
||||
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
def test_alto_namespace():
    """alto_namespace() returns the namespace URI of the ALTO sample file."""
    alto_path = os.path.join(data_dir, "test.alto3.xml")
    namespace = alto_namespace(ET.parse(alto_path))
    assert namespace == "http://www.loc.gov/standards/alto/ns-v3#"
|
||||
|
||||
|
||||
def test_alto_text():
    """alto_text() extracts the full text of the ALTO sample, line by line."""
    alto_path = os.path.join(data_dir, "test.alto3.xml")
    result = alto_text(ET.parse(alto_path))

    # One entry per expected text line; lines are joined without a trailing
    # newline.
    expected_lines = [
        "über die vielen Sorgen wegen deſſelben vergaß",
        "Hartkopf, der Frau Amtmännin das ver-",
        "ſprochene zu überliefern.",
    ]
    expected = "\n".join(expected_lines)
    assert result == expected
|
||||
|
||||
|
||||
def test_alto_text_ALTO1():
    """alto_text() finds a known phrase in the test.alto1.xml sample."""
    alto_path = os.path.join(data_dir, "test.alto1.xml")
    extracted = alto_text(ET.parse(alto_path))
    assert "being erected at the Broadway stock" in extracted
|
||||
|
||||
|
||||
def test_alto_text_ALTO2():
    """alto_text() finds a known two-line phrase in the test.alto2.xml sample."""
    alto_path = os.path.join(data_dir, "test.alto2.xml")
    extracted = alto_text(ET.parse(alto_path))

    # The phrase spans a line break, so the newline is part of the needle.
    needle = (
        "Halbmonde, die genau durch einen Ouerstrich halbiert\nsind und an beiden Enden"
    )
    assert needle in extracted
|
||||
|
||||
|
||||
def test_alto_text_ALTO3():
    """alto_text() finds a known phrase in the test.alto3.xml sample."""
    alto_path = os.path.join(data_dir, "test.alto3.xml")
    extracted = alto_text(ET.parse(alto_path))
    assert "über die vielen Sorgen wegen deſſelben vergaß" in extracted
|
||||
|
||||
|
||||
def test_page_namespace():
    """page_namespace() returns the namespace URI of the PAGE sample file."""
    page_path = os.path.join(data_dir, "test.page2018.xml")
    namespace = page_namespace(ET.parse(page_path))

    expected = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    assert namespace == expected
|
||||
|
||||
|
||||
def test_page_test():
|
||||
tree = ET.parse(os.path.join(data_dir, "test.page2018.xml"))
|
||||
result = page_text(tree)
|
||||
|
||||
# We are currently normalizing on extraction, so the text is normalized.
|
||||
#
|
||||
# expected = textwrap.dedent("""\
|
||||
# ber die vielen Sorgen wegen deelben vergaß
|
||||
# Hartkopf, der Frau Amtmnnin das ver⸗
|
||||
# ſproene zu berliefern. — Ein Erpreer
|
||||
# wurde an ihn abgeſit, um ihn ums Him⸗
|
||||
# melswien zu ſagen, daß er das Verſproene
|
||||
# glei den Augenbli berbringen mte, die
|
||||
# Frau Amtmnnin htte auf ihn verlaen,
|
||||
# und nun wßte e nit, was e anfangen
|
||||
# ſote. Den Augenbli ſote er kommen,
|
||||
# ſon vergieng e in ihrer Ang. — Die
|
||||
# Ge wren ſon angekommen, und es fehlte
|
||||
# ihr do no an aem. —
|
||||
# Hartkopf mußte er bennen, und
|
||||
# endli na langem Nadenken fiel es ihm er
|
||||
# wieder ein. — Er langte den Zettel aus dem
|
||||
# Accisbue heraus, und ſagte ſeiner Frau, daß
|
||||
# e das, was da wre, herbeyſaffen mte.
|
||||
# Jndeß mangelten do einige Generalia, die
|
||||
# alſo wegfielen. — Hartkopf gieng ſelb
|
||||
# mit und berbrate es. —""")
|
||||
expected = textwrap.dedent(
|
||||
"""\
|
||||
über die vielen Sorgen wegen deſſelben vergaß
|
||||
Hartkopf, der Frau Amtmännin das ver-
|
||||
ſprochene zu überliefern. – Ein Erpreſſer
|
||||
wurde an ihn abgeſchickt, um ihn ums Him-
|
||||
melswillen zu ſagen, daß er das Verſprochene
|
||||
gleich den Augenblick überbringen möchte, die
|
||||
Frau Amtmännin hätte ſich auf ihn verlaſſen,
|
||||
und nun wüßte ſie nicht, was ſie anfangen
|
||||
ſollte. Den Augenblick ſollte er kommen,
|
||||
ſonſt vergieng ſie in ihrer Angſt. – Die
|
||||
Gäſte wären ſchon angekommen, und es fehlte
|
||||
ihr doch noch an allem. –
|
||||
Hartkopf mußte ſich erſt beſinnen, und
|
||||
endlich nach langem Nachdenken fiel es ihm erſt
|
||||
wieder ein. – Er langte den Zettel aus dem
|
||||
Accisbuche heraus, und ſagte ſeiner Frau, daß
|
||||
ſie das, was da wäre, herbeyſchaffen möchte.
|
||||
Jndeß mangelten doch einige Generalia, die
|
||||
alſo wegfielen. – Hartkopf gieng ſelbſt
|
||||
mit und überbrachte es. –"""
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_page_with_empty_region():
    """page_text() still yields text when the file has an empty TextRegion."""
    # The sample file contains this empty TextRegion:
    #
    #   <TextRegion id="region0000">
    #     <Coords points="488,133 1197,133 1197,193 488,193"/>
    #     <TextEquiv>
    #       <Unicode></Unicode>
    #     </TextEquiv>
    #   </TextRegion>
    page_path = os.path.join(
        data_dir, "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml"
    )
    extracted = page_text(ET.parse(page_path))
    assert extracted
|
||||
|
||||
|
||||
def test_page_order():
    """page_text() returns the regions in reading order, not file order."""
    # In this file the TextRegions are stored in a different order than the
    # reading order.
    page_path = os.path.join(data_dir, "order.page.xml")
    result = page_text(ET.parse(page_path))

    print(result)

    # DOTALL lets ".*" span the line breaks between the expected fragments.
    in_reading_order = re.compile(
        r"Herr Konfrater.*75.*Etwas f.r Wittwen.*Ein gewi.{1,2}er Lord.*76\. Die",
        re.DOTALL,
    )
    assert in_reading_order.search(result)
|
||||
|
||||
|
||||
def test_page_mixed_regions():
    """page_text() copes with ImageRegions mixed into the ReadingOrder."""
    # The ReadingOrder of this file references both ImageRegions and
    # TextRegions.
    page_path = os.path.join(data_dir, "mixed-regions.page.xml")
    extracted = page_text(ET.parse(page_path))

    assert "non exaudiam uos. Chriſtiani uero quia orant iuxta" in extracted
|
||||
|
||||
|
||||
def test_page_level():
    """page_text() honours textequiv_level when region and line texts differ."""
    # The TextRegion and TextLine texts in this file are inconsistent, so the
    # extracted text reveals which level was read. The file is parsed afresh
    # for every call.
    page_path = os.path.join(data_dir, "levels-are-different.page.xml")

    # TextRegion: both the default and the explicit "region" level
    region_text = "Inconsistent dummy region text"
    assert page_text(ET.parse(page_path)) == region_text
    assert page_text(ET.parse(page_path), textequiv_level="region") == region_text

    # TextLine
    line_text = page_text(ET.parse(page_path), textequiv_level="line")
    expected_line_text = "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
    assert line_text == expected_line_text
|
||||
|
||||
|
||||
def test_text():
    """text() extracts text from ALTO, PAGE and plain text files alike."""
    # (file name, phrase that must occur in the extracted text)
    samples = (
        ("test.alto1.xml", "being erected at the Broadway stock"),
        ("test.page2018.xml", "wieder ein. – Er langte den Zettel aus dem"),
        ("test.txt", "Lorem ipsum"),
    )
    for file_name, phrase in samples:
        assert phrase in text(os.path.join(data_dir, file_name))
|
||||
|
||||
|
||||
def test_plain(tmp_path):
    """plain_text() returns the content of a plain text file unchanged."""
    content = "AAAAB"

    # The file is addressed by a relative name, so it has to be written and
    # read while tmp_path is the working directory.
    with working_directory(tmp_path):
        with open("ocr.txt", "w") as ocrf:
            ocrf.write(content)

        result = plain_text("ocr.txt")

    assert result == content
|
86
src/dinglehopper/tests/test_word_error_rate.py
Normal file
86
src/dinglehopper/tests/test_word_error_rate.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import math
|
||||
|
||||
from .. import word_error_rate, words
|
||||
|
||||
|
||||
def test_words():
    """words() drops brackets, quotes and punctuation but keeps "3,14" whole."""
    sentence = "Der schnelle [„braune“] Fuchs kann keine 3,14 Meter springen, oder?"
    result = list(words(sentence))

    expected = "Der schnelle braune Fuchs kann keine 3,14 Meter springen oder".split()
    assert result == expected
|
||||
|
||||
|
||||
def test_words_private_use_area():
|
||||
result = list(
|
||||
words(
|
||||
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
|
||||
"ſproene zu berliefern."
|
||||
)
|
||||
)
|
||||
expected = [
|
||||
"ber",
|
||||
"die",
|
||||
"vielen",
|
||||
"Sorgen",
|
||||
"wegen",
|
||||
"deelben",
|
||||
"vergaß",
|
||||
"Hartkopf",
|
||||
"der",
|
||||
"Frau",
|
||||
"Amtmnnin",
|
||||
"das",
|
||||
"ver",
|
||||
"ſproene",
|
||||
"zu",
|
||||
"berliefern",
|
||||
]
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_word_error_rate():
|
||||
assert (
|
||||
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsatz!") == 0
|
||||
)
|
||||
assert (
|
||||
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz!")
|
||||
== 0
|
||||
)
|
||||
assert (
|
||||
word_error_rate("Dies. ist ein Beispielsatz!", "Dies ist ein Beispielsatz.")
|
||||
== 0
|
||||
)
|
||||
|
||||
assert (
|
||||
word_error_rate("Dies ist ein Beispielsatz!", "Dies ist ein Beispielsarz:")
|
||||
== 1 / 4
|
||||
)
|
||||
assert (
|
||||
word_error_rate("Dies ist ein Beispielsatz!", "Dies ein ist Beispielsatz!")
|
||||
== 2 / 4
|
||||
)
|
||||
|
||||
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
|
||||
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
|
||||
assert word_error_rate("", "") == 0
|
||||
|
||||
assert (
|
||||
word_error_rate(
|
||||
"Schlyñ lorem ipsum dolor sit amet,", "Schlym̃ lorem ipsum dolor sit amet."
|
||||
)
|
||||
== 1 / 6
|
||||
)
|
39
src/dinglehopper/tests/util.py
Normal file
39
src/dinglehopper/tests/util.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
from itertools import zip_longest
|
||||
from typing import Iterable
|
||||
|
||||
import colorama
|
||||
import os
|
||||
|
||||
|
||||
def diffprint(x, y):
    """Print x and y side by side, coloring pairs that differ in red.

    If x is iterable, x and y are walked in parallel (the shorter one is
    padded with None) and every pair is printed on its own line. Otherwise
    x and y are printed as a single pair.
    """

    def show_pair(left, right):
        differs = left != right
        if differs:
            print(colorama.Fore.RED, left, right, colorama.Fore.RESET)
            return
        print(left, right)

    if not isinstance(x, Iterable):
        show_pair(x, y)
        return

    for left, right in zip_longest(x, y):
        show_pair(left, right)
|
||||
|
||||
|
||||
def unzip(an_iterable_of_tuples):
    """Transpose an iterable of tuples: the inverse of zip()."""
    rows = an_iterable_of_tuples
    return zip(*rows)
|
||||
|
||||
|
||||
class working_directory:
    """Context manager to temporarily change the working directory"""

    def __init__(self, wd):
        # Directory to switch to while the context is active.
        self.wd = wd

    def __enter__(self):
        # Remember where we came from so __exit__ can switch back.
        previous = os.getcwd()
        self.old_wd = previous
        os.chdir(self.wd)

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Always restore the previous directory; exceptions are not suppressed.
        os.chdir(self.old_wd)
|
111
src/dinglehopper/word_error_rate.py
Normal file
111
src/dinglehopper/word_error_rate.py
Normal file
|
@ -0,0 +1,111 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
from typing import Tuple, Iterable
|
||||
from multimethod import multimethod
|
||||
|
||||
import uniseg.wordbreak
|
||||
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
from . import ExtractedText
|
||||
|
||||
|
||||
# Did we patch uniseg.wordbreak.word_break already?
|
||||
word_break_patched = False
|
||||
|
||||
|
||||
def patch_word_break():
    """
    Patch uniseg.wordbreak.word_break to deal with our private use characters.

    The replacement reports "ALetter" for code points in the range
    U+E000..U+F8FF (Private Use Area) and delegates everything else to the
    original function.

    Calling this function again after a successful patch is a no-op, so the
    original function is only ever wrapped once.

    See also
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
    global word_break_patched
    if word_break_patched:
        return

    old_word_break = uniseg.wordbreak.word_break

    def new_word_break(c, index=0):
        # Look at c[index], i.e. the character the wrapped function is asked
        # about, so that a non-zero index or a longer string is handled
        # instead of failing in ord().
        if 0xE000 <= ord(c[index]) <= 0xF8FF:  # Private Use Area
            return "ALetter"
        return old_word_break(c, index)

    uniseg.wordbreak.word_break = new_word_break
    word_break_patched = True
|
||||
|
||||
|
||||
@multimethod
def words(s: str):
    """Extract words from a string"""

    # Make sure uniseg knows about our private use characters before it is
    # asked to segment anything.
    if not word_break_patched:
        patch_word_break()

    def is_noise(ch):
        """Tell whether ch is whitespace, punctuation, or a similar character."""
        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        general = unicodedata.category(ch)
        return general[0] in ("O", "M", "P", "Z", "S") or general in ("Cc", "Cf")

    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here:
    # split on word boundaries using uniseg.wordbreak.words() and keep only
    # those "words" that have at least one character which is not whitespace,
    # punctuation "or similar characters".
    for token in uniseg.wordbreak.words(s):
        if any(not is_noise(ch) for ch in token):
            yield token
|
||||
|
||||
|
||||
@multimethod
def words(s: ExtractedText):
    """Extract words from the text of an ExtractedText"""
    plain = s.text
    return words(plain)
|
||||
|
||||
|
||||
@multimethod
def words_normalized(s: str):
    """Extract words from a string after normalizing it to NFC"""
    nfc = unicodedata.normalize("NFC", s)
    return words(nfc)
|
||||
|
||||
|
||||
@multimethod
def words_normalized(s: ExtractedText):
    """Extract NFC-normalized words from the text of an ExtractedText"""
    plain = s.text
    return words_normalized(plain)
|
||||
|
||||
|
||||
@multimethod
def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
    """Compute the word error rate of two strings.

    Both strings are split into NFC-normalized words first; the resulting
    word lists are then handed to the sequence variant of this function.

    :return: tuple of word error rate and number of words in the reference
    """
    reference_words = [*words_normalized(reference)]
    compared_words = [*words_normalized(compared)]
    return word_error_rate_n(reference_words, compared_words)
|
||||
|
||||
|
||||
@multimethod
def word_error_rate_n(
    reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
    """Compute the word error rate of the texts of two ExtractedText objects.

    :return: tuple of word error rate and number of words in the reference
    """
    reference_text, compared_text = reference.text, compared.text
    return word_error_rate_n(reference_text, compared_text)
|
||||
|
||||
|
||||
@multimethod
def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
    """Compute the word error rate of two word sequences.

    The rate is the Levenshtein distance between the sequences divided by the
    length of the reference. Identical sequences give 0; an empty reference
    compared against a non-empty sequence gives infinity.

    :return: tuple of word error rate and number of words in the reference
    """
    reference_words = list(reference)
    compared_words = list(compared)

    word_count = len(reference_words)
    edits = Levenshtein.distance(reference_words, compared_words)

    if not edits:
        return 0, word_count

    # With at least one edit and an empty reference the rate is unbounded.
    rate = edits / word_count if word_count else float("inf")
    return rate, word_count
|
||||
|
||||
|
||||
def word_error_rate(reference, compared) -> float:
    """Return only the word error rate, dropping the reference word count."""
    rate_and_count = word_error_rate_n(reference, compared)
    return rate_and_count[0]
|
Loading…
Add table
Add a link
Reference in a new issue