From 207804e6a6b52f4c28c638bd3439263b33beacf6 Mon Sep 17 00:00:00 2001 From: Ruud de Jong Date: Fri, 12 May 2023 09:55:00 +0200 Subject: [PATCH] Add batch processing and report summaries --- README.md | 58 +- dinglehopper/cli.py | 88 +- dinglehopper/cli_summarize.py | 101 + dinglehopper/templates/report.html.j2 | 42 + dinglehopper/templates/report.html.js | 24 + dinglehopper/templates/report.json.j2 | 6 + dinglehopper/templates/summary.html.j2 | 136 + dinglehopper/templates/summary.json.j2 | 15 + .../tests/data/directory-test/gt/1.xml | 3394 +++++++++++++++++ .../tests/data/directory-test/gt/2.xml | 3394 +++++++++++++++++ .../tests/data/directory-test/ocr/1.xml | 3394 +++++++++++++++++ .../tests/data/directory-test/ocr/2.xml | 3394 +++++++++++++++++ .../data/directory-test/ocr/3-has-no-gt.xml | 3394 +++++++++++++++++ dinglehopper/tests/test_integ_cli_dir.py | 41 + dinglehopper/tests/test_integ_differences.py | 27 + dinglehopper/tests/test_integ_summarize.py | 101 + setup.py | 1 + 17 files changed, 17584 insertions(+), 26 deletions(-) create mode 100644 dinglehopper/cli_summarize.py create mode 100644 dinglehopper/templates/summary.html.j2 create mode 100644 dinglehopper/templates/summary.json.j2 create mode 100644 dinglehopper/tests/data/directory-test/gt/1.xml create mode 100644 dinglehopper/tests/data/directory-test/gt/2.xml create mode 100644 dinglehopper/tests/data/directory-test/ocr/1.xml create mode 100644 dinglehopper/tests/data/directory-test/ocr/2.xml create mode 100644 dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml create mode 100644 dinglehopper/tests/test_integ_cli_dir.py create mode 100644 dinglehopper/tests/test_integ_differences.py create mode 100644 dinglehopper/tests/test_integ_summarize.py diff --git a/README.md b/README.md index e7b3c7b..240f017 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), 
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It compares a ground truth (GT) document page with a OCR result page to compute -metrics and a word/character differences report. +metrics and a word/character differences report. It also supports batch processing by +generating, aggregating and summarizing multiple reports. [![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper) @@ -27,7 +28,7 @@ sudo pip install . Usage ----- ~~~ -Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] +Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER] Compare the PAGE/ALTO/text document GT against the document OCR. @@ -35,19 +36,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] their text and falls back to plain text if no ALTO or PAGE is detected. The files GT and OCR are usually a ground truth document and the result of - an OCR software, but you may use dinglehopper to compare two OCR results. - In that case, use --no-metrics to disable the then meaningless metrics and - also change the color scheme from green/red to blue. + an OCR software, but you may use dinglehopper to compare two OCR results. In + that case, use --no-metrics to disable the then meaningless metrics and also + change the color scheme from green/red to blue. - The comparison report will be written to $REPORT_PREFIX.{html,json}, where - $REPORT_PREFIX defaults to "report". The reports include the character - error rate (CER) and the word error rate (WER). + The comparison report will be written to + $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults + to the current working directory and $REPORT_PREFIX defaults to "report". + The reports include the character error rate (CER) and the word error rate + (WER). By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. 
Options: --metrics / --no-metrics Enable/disable metrics and green/red + --differences BOOLEAN Enable reporting character and word level + differences --textequiv-level LEVEL PAGE TextEquiv level to extract text from --progress Show progress bar --help Show this message and exit. @@ -61,6 +66,43 @@ This generates `report.html` and `report.json`. ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) +Batch comparison between folders of GT and OCR files can be done by simply providing +folders: +~~~ +dinglehopper gt/ ocr/ report output_folder/ +~~~ +This assumes that you have files with the same name in both folders, e.g. +`gt/00000001.page.xml` and `ocr/00000001.alto.xml`. + +The example generates reports for each set of files, with the prefix `report`, in the +(automatically created) folder `output_folder/`. + +By default, the JSON report does not contain the character and word differences, only +the calculated metrics. If you want to include the differences, set the +`--differences` option to `True`: + +~~~ +dinglehopper gt/ ocr/ report output_folder/ --differences True +~~~ + +### dinglehopper-summarize +A set of (JSON) reports can be summarized into a single set of +reports. This is useful after having generated reports in batch. +Example: +~~~ +dinglehopper-summarize output_folder/ +~~~ +This generates `summary.html` and `summary.json` in the same `output_folder`. + +If you are summarizing many reports and have used the `--differences` flag while +generating them, it may be useful to limit the number of differences reported by using +the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML +report, making it easier to open and navigate. Note that the JSON report will still +contain all differences. Example: +~~~ +dinglehopper-summarize output_folder/ --occurrences-threshold 10 +~~~ + ### dinglehopper-line-dirs You also may want to compare a directory of GT text files (i.e.
`gt/line0001.gt.txt`) with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate diff --git a/dinglehopper/cli.py b/dinglehopper/cli.py index 72d428d..b22aafc 100644 --- a/dinglehopper/cli.py +++ b/dinglehopper/cli.py @@ -1,4 +1,5 @@ import os +from collections import Counter import click from jinja2 import Environment, FileSystemLoader @@ -6,15 +7,15 @@ from markupsafe import escape from uniseg.graphemecluster import grapheme_clusters from ocrd_utils import initLogging -from .character_error_rate import character_error_rate_n -from .word_error_rate import word_error_rate_n, words_normalized -from .align import seq_align -from .extracted_text import ExtractedText -from .ocr_files import extract -from .config import Config +from dinglehopper.character_error_rate import character_error_rate_n +from dinglehopper.word_error_rate import word_error_rate_n, words_normalized +from dinglehopper.align import seq_align +from dinglehopper.extracted_text import ExtractedText +from dinglehopper.ocr_files import extract +from dinglehopper.config import Config -def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False): gtx = "" ocrx = "" @@ -54,6 +55,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): g_pos = 0 o_pos = 0 + found_differences = [] + for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): css_classes = None gt_id = None @@ -66,6 +69,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): # Deletions and inserts only produce one id + None, UI must # support this, i.e. 
display for the one id produced + if differences: + found_differences.append(f'{g} :: {o}') + gtx += joiner + format_thing(g, css_classes, gt_id) ocrx += joiner + format_thing(o, css_classes, ocr_id) @@ -74,6 +80,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): if o is not None: o_pos += len(o) + found_differences = dict(Counter(elem for elem in found_differences)) + return """
{}
@@ -81,7 +89,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
""".format( gtx, ocrx - ) + ), found_differences def json_float(value): @@ -97,7 +105,8 @@ def json_float(value): return str(value) -def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): +def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True, + differences=False, textequiv_level="region"): """Check OCR result against GT. The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use @@ -110,14 +119,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report = gen_diff_report( - gt_text, ocr_text, css_prefix="c", joiner="", none="·" - ) + char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c", + joiner="", + none="·", differences=differences) gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report( - gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" + word_diff_report, diff_w = gen_diff_report( + gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", + differences=differences ) env = Environment( @@ -129,7 +139,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): for report_suffix in (".html", ".json"): template_fn = "report" + report_suffix + ".j2" - out_fn = report_prefix + report_suffix + + if not os.path.isdir(reports_folder): + os.mkdir(reports_folder) + + out_fn = os.path.join(reports_folder, report_prefix + report_suffix) template = env.get_template(template_fn) template.stream( @@ -142,16 +156,42 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): char_diff_report=char_diff_report, word_diff_report=word_diff_report, metrics=metrics, + differences=differences, + diff_c=diff_c, + diff_w=diff_w, ).dump(out_fn) +def process_dir(gt, ocr, report_prefix, 
reports_folder, metrics, differences, + textequiv_level): + for gt_file in os.listdir(gt): + gt_file_path = os.path.join(gt, gt_file) + ocr_file_path = os.path.join(ocr, gt_file) + + if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path): + process(gt_file_path, ocr_file_path, + f"{gt_file}-{report_prefix}", + reports_folder=reports_folder, + metrics=metrics, + differences=differences, + textequiv_level=textequiv_level) + else: + print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) + + @click.command() @click.argument("gt", type=click.Path(exists=True)) @click.argument("ocr", type=click.Path(exists=True)) @click.argument("report_prefix", type=click.Path(), default="report") +@click.argument("reports_folder", type=click.Path(), default=".") @click.option( "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" ) +@click.option( + "--differences", + default=False, + help="Enable reporting character and word level differences" +) @click.option( "--textequiv-level", default="region", @@ -159,7 +199,8 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): metavar="LEVEL", ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") -def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): +def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level, + progress): """ Compare the PAGE/ALTO/text document GT against the document OCR. @@ -171,7 +212,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. - The comparison report will be written to $REPORT_PREFIX.{html,json}, where + The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, + where $REPORTS_FOLDER defaults to the current working directory and $REPORT_PREFIX defaults to "report". 
The reports include the character error rate (CER) and the word error rate (WER). @@ -180,7 +222,17 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): """ initLogging() Config.progress = progress - process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) + if os.path.isdir(gt): + if not os.path.isdir(ocr): + raise click.BadParameter( + "OCR must be a directory if GT is a directory", param_hint="ocr" + ) + else: + process_dir(gt, ocr, report_prefix, reports_folder, metrics, + differences, textequiv_level) + else: + process(gt, ocr, report_prefix, reports_folder, metrics=metrics, + differences=differences, textequiv_level=textequiv_level) if __name__ == "__main__": diff --git a/dinglehopper/cli_summarize.py b/dinglehopper/cli_summarize.py new file mode 100644 index 0000000..1cf1c91 --- /dev/null +++ b/dinglehopper/cli_summarize.py @@ -0,0 +1,101 @@ +import json +import os + +import click +from ocrd_utils import initLogging +from jinja2 import Environment, FileSystemLoader + +from dinglehopper.cli import json_float + + +def process(reports_folder, occurrences_threshold=1): + cer_list = [] + wer_list = [] + cer_sum = 0 + wer_sum = 0 + diff_c = {} + diff_w = {} + + for report in os.listdir(reports_folder): + if report.endswith(".json"): + with open(os.path.join(reports_folder, report), "r") as f: + report_data = json.load(f) + + if "cer" not in report_data or "wer" not in report_data: + click.echo( + f"Skipping {report} because it does not contain CER and WER") + continue + + cer = report_data["cer"] + wer = report_data["wer"] + cer_list.append(cer) + wer_list.append(wer) + cer_sum += cer + wer_sum += wer + + for key, value in report_data["differences"]["character_level"].items(): + diff_c[key] = diff_c.get(key, 0) + value + for key, value in report_data["differences"]["word_level"].items(): + diff_w[key] = diff_w.get(key, 0) + value + + if len(cer_list) == 0: + click.echo(f"No reports found in folder 
'{os.path.abspath(reports_folder)}'") + return + + cer_avg = cer_sum / len(cer_list) + wer_avg = wer_sum / len(wer_list) + + print(f"Number of reports: {len(cer_list)}") + print(f"Average CER: {cer_avg}") + print(f"Average WER: {wer_avg}") + print(f"Sum of common mistakes: {cer_sum}") + print(f"Sum of common mistakes: {wer_sum}") + + env = Environment( + loader=FileSystemLoader( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") + ) + ) + env.filters["json_float"] = json_float + for report_suffix in (".html", ".json"): + template_fn = "summary" + report_suffix + ".j2" + + out_fn = os.path.join(reports_folder, 'summary' + report_suffix) + template = env.get_template(template_fn) + template.stream( + num_reports=len(cer_list), + cer_avg=cer_avg, + wer_avg=wer_avg, + diff_c=diff_c, + diff_w=diff_w, + occurrences_threshold=occurrences_threshold, + ).dump(out_fn) + + +@click.command() +@click.argument("reports_folder", + type=click.Path(exists=True), + default="./reports" + ) +@click.option("--occurrences-threshold", + type=int, + default=1, + help="Only show differences that occur at least this many times.") +def main(reports_folder, occurrences_threshold): + """ + Summarize the results from multiple reports generated earlier by dinglehopper. + It calculates the average CER and WER, as well as a sum of common mistakes. + Reports include lists of mistakes and their occurrences. + + You may use a threshold to reduce the file size of the HTML report by only showing + mistakes whose number of occurrences is above the threshold. The JSON report will + always contain all mistakes. + + All JSON files in the provided folder will be gathered and summarized. 
+ """ + initLogging() + process(reports_folder, occurrences_threshold) + + +if __name__ == "__main__": + main() diff --git a/dinglehopper/templates/report.html.j2 b/dinglehopper/templates/report.html.j2 index 0c2f464..435b98a 100644 --- a/dinglehopper/templates/report.html.j2 +++ b/dinglehopper/templates/report.html.j2 @@ -26,6 +26,22 @@ border: 2px solid; border-radius: 5px; } + + .row { + margin-bottom: 20px; + } + + table { + width: 100%; + } + + th { + cursor: pointer; + } + + th:hover { + background-color: #eee; + } @@ -50,6 +66,32 @@

Word differences

{{ word_diff_report }} +{%- if differences %} +{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %} + +
+{% for section in sections %} +
+

{{ section['title'] }}

+ + + + + + + + {% for gt_ocr, occurrences in section['data'].items() %} + + + + + + {% endfor %} +
GTOCROccurrences
{{ gt_ocr.split("::")[0] }}{{ gt_ocr.split("::")[1] }}{{ occurrences }}
+
+{% endfor %} +
+{%- endif %} diff --git a/dinglehopper/templates/report.html.js b/dinglehopper/templates/report.html.js index 4c2ba28..f47cee7 100644 --- a/dinglehopper/templates/report.html.js +++ b/dinglehopper/templates/report.html.js @@ -12,4 +12,28 @@ $(document).ready(function() { $('.diff').mouseout(function() { find_diff_class($(this).attr('class')).removeClass('diff-highlight'); }); + + /* Sort this column of the table */ + $('th').click(function () { + var table = $(this).closest('table'); + var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index())); + this.asc = !this.asc; + if (!this.asc) { + rows = rows.reverse(); + } + for (var i = 0; i < rows.length; i++) { + table.children('tbody').append(rows[i]); + } + }); + + function compareRows(index) { + return function (row1, row2) { + var cell1 = $(row1).children('td').eq(index).text().toLowerCase(); + var cell2 = $(row2).children('td').eq(index).text().toLowerCase(); + return cell1.localeCompare(cell2, undefined, { + numeric: true, + sensitivity: 'base' + }); + } + } }); diff --git a/dinglehopper/templates/report.json.j2 b/dinglehopper/templates/report.json.j2 index 0e8af03..64dd8d4 100644 --- a/dinglehopper/templates/report.json.j2 +++ b/dinglehopper/templates/report.json.j2 @@ -4,6 +4,12 @@ {% if metrics %} "cer": {{ cer|json_float }}, "wer": {{ wer|json_float }}, +{% endif %} +{% if differences %} + "differences": { + "character_level": {{ diff_c|tojson }}, + "word_level": {{ diff_w|tojson }} + }, {% endif %} "n_characters": {{ n_characters }}, "n_words": {{ n_words }} diff --git a/dinglehopper/templates/summary.html.j2 b/dinglehopper/templates/summary.html.j2 new file mode 100644 index 0000000..e61e808 --- /dev/null +++ b/dinglehopper/templates/summary.html.j2 @@ -0,0 +1,136 @@ + + + + + + + + + + + +
+ +
+

Summary of all reports

+
+ +
+

Number of reports: {{ num_reports }}

+
+ +{% if cer_avg and wer_avg -%} +
+

Metrics

+
+ +
+

Average CER: {{ cer_avg|round(4) }}

+

Average WER: {{ wer_avg|round(4) }}

+
+{% endif %} + +{%- if diff_c and diff_w %} +{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %} + +
+{%- for section in sections %} +
+

{{ section['title'] }}

+ + + + + {%- set num_omitted = namespace(value=0) -%} + {% for gt_ocr, occurrences in section['data'].items() -%} + {% if occurrences < occurrences_threshold -%} + {%- set num_omitted.value = num_omitted.value + 1 %} + {%- else -%} + {%- set gt = gt_ocr.split(" :: ")[0] %} + {%- set ocr = gt_ocr.split(" :: ")[1] %} + + {# display the unicode character #} + + + + {%- endif %} + {%- endfor %} + + {% if num_omitted.value > 0 and occurrences_threshold > 1 -%} +

Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.

+ {%- set num_omitted.value = 0 %} + {%- endif %} +
GTOCROccurrences
{{ gt }}{{ ocr }}{{ occurrences }}
+
+{%- endfor %} +
+{%- endif %} + +
+ + + + + + + + + + + + diff --git a/dinglehopper/templates/summary.json.j2 b/dinglehopper/templates/summary.json.j2 new file mode 100644 index 0000000..bb45f4e --- /dev/null +++ b/dinglehopper/templates/summary.json.j2 @@ -0,0 +1,15 @@ +{ +"num_reports": {{ num_reports}} +{%- if cer_avg and wer_avg %} + , + "cer_avg": {{ cer_avg|json_float }}, + "wer_avg": {{ wer_avg|json_float }} +{%- endif %} +{%- if diff_c and wer_avg %} + , + "differences": { + "character_level": {{ diff_c|tojson }}, + "word_level": {{ diff_w|tojson }} + } +{%- endif %} +} diff --git a/dinglehopper/tests/data/directory-test/gt/1.xml b/dinglehopper/tests/data/directory-test/gt/1.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/dinglehopper/tests/data/directory-test/gt/1.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/dinglehopper/tests/data/directory-test/gt/2.xml b/dinglehopper/tests/data/directory-test/gt/2.xml new file mode 100644 index 0000000..c0dc183 --- /dev/null +++ b/dinglehopper/tests/data/directory-test/gt/2.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verſproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augenbli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/dinglehopper/tests/data/directory-test/ocr/1.xml b/dinglehopper/tests/data/directory-test/ocr/1.xml new file mode 100644 index 0000000..0e62647 --- /dev/null +++ b/dinglehopper/tests/data/directory-test/ocr/1.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verfproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augembli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/dinglehopper/tests/data/directory-test/ocr/2.xml b/dinglehopper/tests/data/directory-test/ocr/2.xml new file mode 100644 index 0000000..0e62647 --- /dev/null +++ b/dinglehopper/tests/data/directory-test/ocr/2.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verfproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augembli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml b/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml new file mode 100644 index 0000000..0e62647 --- /dev/null +++ b/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml @@ -0,0 +1,3394 @@ + + + + doculibtopagexml + 2019-01-08T10:25:36 + 2019-04-26T07:11:05 + + + + + + + + + + + + + + + + + + + + + + + b + + + + e + + + + r + + ber + + + + + + + d + + + + i + + + + e + + die + + + + + + + v + + + + i + + + + e + + + + l + + + + e + + + + n + + vielen + + + + + + + S + + + + o + + + + r + + + + g + + + + e + + + + n + + Sorgen + + + + + + + w + + + + e + + + + g + + + + e + + + + n + + wegen + + + + + + + d + + + + e + + + + + + + + e + + + + l + + + + b + + + + e + + + + n + + deelben + + + + + + + v + + + + e + + + + r + + + + g + + + + a + + + + ß + + vergaß + + + ber die vielen Sorgen wegen deelben vergaß + + + + + + + + i + + + + h + + + + r + + ihr + + + + + + + d + + + + o + + + + + + do + + + + + + + n + + + + o + + + + + + no + + + + + + + a + + + + n + + an + + + + + + + a + + + + + + + + e + + + + m + + + + . + + aem. + + + + + + + + + + + + ihr do no an aem. 
— + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + + + , + + Hartkopf, + + + + + + + d + + + + e + + + + r + + der + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + d + + + + a + + + + s + + das + + + + + + + v + + + + e + + + + r + + + + + + ver⸗ + + + Hartkopf, der Frau Amtmnnin das ver⸗ + + + + + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + ſproene + + + + + + + z + + + + u + + zu + + + + + + + + + + + b + + + + e + + + + r + + + + l + + + + i + + + + e + + + + f + + + + e + + + + r + + + + n + + + + . + + berliefern. + + + + + + + + + + + + + + + + E + + + + i + + + + n + + Ein + + + + + + + E + + + + r + + + + p + + + + + + + + e + + + + r + + + + r + + + + e + + Erpreer + + + ſproene zu berliefern. — Ein Erpreer + + + + + + + + w + + + + d + + + + e + + + + u + + + + r + + wurde + + + + + + + a + + + + n + + an + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + a + + + + b + + + + g + + + + e + + + + ſ + + + + + + + + i + + + + + + + + t + + + + , + + abgeſit, + + + + + + + u + + + + m + + um + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + u + + + + m + + + + s + + ums + + + + + + + H + + + + i + + + + m + + + + + + Him⸗ + + + wurde an ihn abgeſit, um ihn ums Him⸗ + + + + + + + + m + + + + e + + + + l + + + + s + + + + w + + + + i + + + + + + + + e + + + + n + + melswien + + + + + + + z + + + + u + + zu + + + + + + + ſ + + + + a + + + + g + + + + e + + + + n + + + + , + + ſagen, + + + + + + + d + + + + a + + + + ß + + daß + + + + + + + e + + + + r + + er + + + + + + + d + + + + a + + + + s + + das + + + + + + + V + + + + e + + + + r + + + + ſ + + + + p + + + + r + + + + o + + + + + + + + e + + + + n + + + + e + + Verſproene + + + melswien zu ſagen, daß er das Verſproene + + + + + + + + g + + + + l + + + + e + + + + i + + + + + 
+ glei + + + + + + + d + + + + e + + + + n + + den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + i + + + + n + + + + g + + + + e + + + + n + + berbringen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + , + + mte, + + + + + + + d + + + + i + + + + e + + die + + + glei den Augenbli berbringen mte, die + + + + + + + + F + + + + r + + + + a + + + + u + + Frau + + + + + + + A + + + + m + + + + t + + + + m + + + + + + + + n + + + + n + + + + i + + + + n + + Amtmnnin + + + + + + + h + + + + + + + + t + + + + t + + + + e + + htte + + + + + + + + + + + + +  + + + + + + + a + + + + u + + + + f + + auf + + + + + + + i + + + + h + + + + n + + ihn + + + + + + + v + + + + e + + + + r + + + + l + + + + a + + + + + + + + e + + + + n + + + + , + + verlaen, + + + Frau Amtmnnin htte  auf ihn verlaen, + + + + + + + + u + + + + n + + + + d + + und + + + + + + + n + + + + u + + + + n + + nun + + + + + + + w + + + + + + + + ß + + + + t + + + + e + + wßte + + + + + + + + + + + e + + e + + + + + + + n + + + + i + + + + + + + + t + + + + , + + nit, + + + + + + + w + + + + a + + + + s + + was + + + + + + + + + + + e + + e + + + + + + + a + + + + n + + + + f + + + + a + + + + n + + + + g + + + + e + + + + n + + anfangen + + + und nun wßte e nit, was e anfangen + + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + + + . + + ſote. + + + + + + + D + + + + e + + + + n + + Den + + + + + + + A + + + + u + + + + g + + + + e + + + + n + + + + b + + + + l + + + + i + + + + + + Augenbli + + + + + + + ſ + + + + o + + + + + + + + t + + + + e + + ſote + + + + + + + e + + + + r + + er + + + + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + kommen, + + + ſote. 
Den Augenbli ſote er kommen, + + + + + + + + + + + + e + + e + + + + + + + i + + + + n + + in + + + + + + + i + + + + h + + + + r + + + + e + + + + r + + ihrer + + + + + + + A + + + + n + + + + g + + + + + + + + . + + Ang. + + + + + + + + + + + + + + + + D + + + + i + + + + e + + Die + + + + + + + ſ + + + + o + + + + n + + + + + + ſon + + + + + + + v + + + + e + + + + r + + + + g + + + + i + + + + e + + + + n + + + + g + + vergieng + + + ſon vergieng e in ihrer Ang. — Die + + + + + + + + G + + + + + + + + + + + + e + + Ge + + + + + + + w + + + + + + + + r + + + + e + + + + n + + wren + + + + + + + ſ + + + + + + + + o + + + + n + + ſon + + + + + + + a + + + + n + + + + g + + + + e + + + + k + + + + o + + + + m + + + + m + + + + e + + + + n + + + + , + + angekommen, + + + + + + + u + + + + n + + + + d + + und + + + + + + + e + + + + s + + es + + + + + + + f + + + + e + + + + h + + + + l + + + + t + + + + e + + fehlte + + + Ge wren ſon angekommen, und es fehlte + + ber die vielen Sorgen wegen deelben vergaß +Hartkopf, der Frau Amtmnnin das ver⸗ +ſproene zu berliefern. — Ein Erpreer +wurde an ihn abgeſit, um ihn ums Him⸗ +melswien zu ſagen, daß er das Verfproene +glei den Augenbli berbringen mte, die +Frau Amtmnnin htte  auf ihn verlaen, +und nun wßte e nit, was e anfangen +ſote. Den Augembli ſote er kommen, +ſon vergieng e in ihrer Ang. — Die +Ge wren ſon angekommen, und es fehlte +ihr do no an aem. 
— + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + f + + + + p + + Hartkopf + + + + + + + m + + + + u + + + + ß + + + + t + + + + e + + mußte + + + + + + + + + + + + +  + + + + + + + e + + + + r + + + + + + er + + + + + + + b + + + + e + + + + + + + + n + + + + n + + + + e + + + + n + + + + , + + bennen, + + + + + + + u + + + + n + + + + d + + und + + + Hartkopf mußte  er bennen, und + + + + + + + + m + + + + i + + + + t + + mit + + + + + + + u + + + + n + + + + d + + und + + + + + + + + + + + b + + + + e + + + + r + + + + b + + + + r + + + + a + + + + + + + + t + + + + e + + berbrate + + + + + + + e + + + + s + + + + . + + es. + + + + + + + + + + + + mit und berbrate es. — + + + + + + + + l + + + + a + + + + n + + + + g + + + + e + + + + m + + langem + + + + + + + N + + + + a + + + + + + + + d + + + + e + + + + n + + + + k + + + + e + + + + n + + Nadenken + + + + + + + + + + + e + + + + l + + fiel + + + + + + + e + + + + s + + es + + + + + + + i + + + + h + + + + m + + ihm + + + + + + + e + + + + r + + + + + + er + + + + + + + e + + + + n + + + + d + + + + l + + + + i + + + + + + endli + + + + + + + n + + + + a + + + + + + na + + + endli na langem Nadenken fiel es ihm er + + + + + + + + w + + + + i + + + + e + + + + d + + + + e + + + + r + + wieder + + + + + + + e + + + + i + + + + n + + + + . + + ein. + + + + + + + + + + + + + + + + E + + + + r + + Er + + + + + + + l + + + + a + + + + n + + + + g + + + + t + + + + e + + langte + + + + + + + d + + + + e + + + + n + + den + + + + + + + Z + + + + e + + + + t + + + + t + + + + e + + + + l + + Zettel + + + + + + + a + + + + u + + + + s + + aus + + + + + + + d + + + + e + + + + m + + dem + + + wieder ein. 
— Er langte den Zettel aus dem + + + + + + + + A + + + + c + + + + c + + + + i + + + + + + + + e + + + + s + + + + b + + + + u + + Accisbue + + + + + + + h + + + + e + + + + r + + + + a + + + + u + + + + s + + + + , + + heraus, + + + + + + + u + + + + n + + + + d + + und + + + + + + + ſ + + + + a + + + + g + + + + t + + + + e + + ſagte + + + + + + + ſ + + + + e + + + + i + + + + n + + + + e + + + + r + + ſeiner + + + + + + + F + + + + r + + + + a + + + + u + + + + , + + Frau, + + + + + + + d + + + + a + + + + ß + + daß + + + Accisbue heraus, und ſagte ſeiner Frau, daß + + + + + + + + + + + + e + + e + + + + + + + d + + + + a + + + + s + + + + , + + das, + + + + + + + w + + + + a + + + + s + + was + + + + + + + d + + + + a + + da + + + + + + + w + + + + + + + + r + + + + e + + + + , + + wre, + + + + + + + h + + + + e + + + + r + + + + b + + + + e + + + + y + + + + ſ + + + + + + + + a + + + + + + + + e + + + + n + + herbeyſaffen + + + + + + + m + + + + + + + + + + + + t + + + + e + + + + . + + mte. + + + e das, was da wre, herbeyſaffen mte. + + + + + + + + J + + + + n + + + + d + + + + e + + + + ß + + Jndeß + + + + + + + m + + + + a + + + + n + + + + g + + + + e + + + + l + + + + t + + + + e + + + + n + + mangelten + + + + + + + d + + + + i + + + + e + + die + + + + + + + d + + + + o + + + + + + do + + + + + + + e + + + + i + + + + n + + + + i + + + + g + + + + e + + einige + + + + + + + G + + + + e + + + + n + + + + e + + + + l + + + + i + + + + a + + + + , + + + + r + + + + a + + Generalia, + + + Jndeß mangelten do einige Generalia, die + + + + + + + + a + + + + l + + + + ſ + + + + o + + alſo + + + + + + + w + + + + e + + + + g + + + + + + + + e + + + + l + + + + e + + + + n + + + + . + + wegfielen. 
+ + + + + + + + + + + + + + + + H + + + + a + + + + r + + + + t + + + + k + + + + o + + + + p + + + + f + + Hartkopf + + + + + + + g + + + + i + + + + e + + + + n + + + + g + + gieng + + + + + + + ſ + + + + e + + + + l + + + + b + + + + + + ſelb + + + alſo wegfielen. — Hartkopf gieng ſelb + + Hartkopf mußte  er bennen, und +endli na langem Nadenken fiel es ihm er +wieder ein. — Er langte den Zettel aus dem +Accisbue heraus, und ſagte ſeiner Frau, daß +e das, was da wre, herbeyſaffen mte. +Jndeß mangelten do einige Generalia, die +alſo wegfielen. — Hartkopf gieng ſelb +mit und berbrate es. — + + + + diff --git a/dinglehopper/tests/test_integ_cli_dir.py b/dinglehopper/tests/test_integ_cli_dir.py new file mode 100644 index 0000000..435b452 --- /dev/null +++ b/dinglehopper/tests/test_integ_cli_dir.py @@ -0,0 +1,41 @@ +import os +import pytest +from ocrd_utils import initLogging +from dinglehopper.cli import process_dir + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + + +@pytest.mark.integration +def test_cli_directory(tmp_path): + """ + Test that the cli/process_dir() processes a directory of files and + yields JSON and HTML reports. + """ + + initLogging() + process_dir(os.path.join(data_dir, "directory-test", "gt"), + os.path.join(data_dir, "directory-test", "ocr"), + "report", str(tmp_path / "reports"), False, True, + "line") + + assert os.path.exists(tmp_path / "reports/1.xml-report.json") + assert os.path.exists(tmp_path / "reports/1.xml-report.html") + assert os.path.exists(tmp_path / "reports/2.xml-report.json") + assert os.path.exists(tmp_path / "reports/2.xml-report.html") + + +@pytest.mark.integration +def test_cli_fail_without_gt(tmp_path): + """ + Test that the cli/process_dir skips a file if there is no corresponding file + in the other directory. 
+ """ + + initLogging() + process_dir(os.path.join(data_dir, "directory-test", "gt"), + os.path.join(data_dir, "directory-test", "ocr"), + "report", str(tmp_path / "reports"), False, True, + "line") + + assert len(os.listdir(tmp_path / "reports")) == 2 * 2 diff --git a/dinglehopper/tests/test_integ_differences.py b/dinglehopper/tests/test_integ_differences.py new file mode 100644 index 0000000..3590317 --- /dev/null +++ b/dinglehopper/tests/test_integ_differences.py @@ -0,0 +1,27 @@ +import json +import os +import pytest +from ocrd_utils import initLogging +from dinglehopper.cli import process + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + + +@pytest.mark.integration +def test_cli_differences(tmp_path): + """Test that the cli/process() yields a JSON report that includes + the differences found between the GT and OCR text""" + + initLogging() + process(os.path.join(data_dir, "test-gt.page2018.xml"), + os.path.join(data_dir, "test-fake-ocr.page2018.xml"), + "report", tmp_path, differences=True) + + assert os.path.exists(tmp_path / "report.json") + + with open(tmp_path / "report.json", "r") as jsonf: + j = json.load(jsonf) + + assert j["differences"] == {"character_level": {'n :: m': 1, 'ſ :: f': 1}, + "word_level": {'Augenblick :: Augemblick': 1, + 'Verſprochene :: Verfprochene': 1}} diff --git a/dinglehopper/tests/test_integ_summarize.py b/dinglehopper/tests/test_integ_summarize.py new file mode 100644 index 0000000..0908152 --- /dev/null +++ b/dinglehopper/tests/test_integ_summarize.py @@ -0,0 +1,101 @@ +import json +import os +import pytest +from .util import working_directory +from .. 
import cli_summarize + +expected_cer_avg = (0.05 + 0.10) / 2 +expected_wer_avg = (0.15 + 0.20) / 2 +expected_diff_c = {"a": 30, "b": 50} +expected_diff_w = {"c": 70, "d": 90} + + +@pytest.fixture +def create_summaries(tmp_path): + """Create two summary reports with mock data""" + reports_dirname = tmp_path / "reports" + reports_dirname.mkdir() + + report1 = {"cer": 0.05, "wer": 0.15, + "differences": { + "character_level": {"a": 10, "b": 20}, + "word_level": {"c": 30, "d": 40} + }} + report2 = {"cer": 0.10, "wer": 0.20, + "differences": { + "character_level": {"a": 20, "b": 30}, + "word_level": {"c": 40, "d": 50} + }} + + with open(os.path.join(reports_dirname, "report1.json"), "w") as f: + json.dump(report1, f) + with open(os.path.join(reports_dirname, "report2.json"), "w") as f: + json.dump(report2, f) + + return str(reports_dirname) + + +@pytest.mark.integration +def test_cli_summarize_json(tmp_path, create_summaries): + """Test that the cli/process() yields a summarized JSON report""" + with working_directory(tmp_path): + reports_dirname = create_summaries + cli_summarize.process(reports_dirname) + + with open(os.path.join(reports_dirname, "summary.json"), "r") as f: + summary_data = json.load(f) + + + assert summary_data["num_reports"] == 2 + assert summary_data["cer_avg"] == expected_cer_avg + assert summary_data["wer_avg"] == expected_wer_avg + assert summary_data["differences"]["character_level"] == expected_diff_c + assert summary_data["differences"]["word_level"] == expected_diff_w + + +@pytest.mark.integration +def test_cli_summarize_html(tmp_path, create_summaries): + """Test that the cli/process() yields an HTML report""" + with working_directory(tmp_path): + reports_dirname = create_summaries + cli_summarize.process(reports_dirname) + + html_file = os.path.join(reports_dirname, "summary.html") + assert os.path.isfile(html_file) + + with open(html_file, "r") as f: + contents = f.read() + + assert len(contents) > 0 + assert "Number of reports: 2" in 
contents + assert f"Average CER: {round(expected_cer_avg, 4)}" in contents + assert f"Average WER: {round(expected_wer_avg, 4)}" in contents + + +@pytest.mark.integration +def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries): + """ + Test that the cli/process() does not include reports that are missing a WER value. + """ + with working_directory(tmp_path): + reports_dirname = create_summaries + + # This third report has no WER value and should not be included in the summary + report3 = {"cer": 0.10, + "differences": { + "character_level": {"a": 20, "b": 30}, + "word_level": {"c": 40, "d": 50} + }} + + with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f: + json.dump(report3, f) + + cli_summarize.process(reports_dirname) + + html_file = os.path.join(reports_dirname, "summary.html") + assert os.path.isfile(html_file) + + with open(html_file, "r") as f: + contents = f.read() + + assert "Number of reports: 2" in contents # report3 is not included diff --git a/setup.py b/setup.py index d7a3776..476ec8f 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ setup( "dinglehopper=dinglehopper.cli:main", "dinglehopper-line-dirs=dinglehopper.cli_line_dirs:main", "dinglehopper-extract=dinglehopper.cli_extract:main", + "dinglehopper-summarize=dinglehopper.cli_summarize:main", "ocrd-dinglehopper=dinglehopper.ocrd_cli:ocrd_dinglehopper", ] },