Merge commit '7930ecd42868cb6785a58f8ee95b05882704621d'

pull/29/head
Gerber, Mike 4 years ago
commit 717801bdbb

7
.gitignore vendored

@ -1,3 +1,10 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# Distribution / packaging
*.egg-info/
# User-specific stuff # User-specific stuff
.idea/**/workspace.xml .idea/**/workspace.xml
.idea/**/tasks.xml .idea/**/tasks.xml

@ -0,0 +1,5 @@
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
</state>
</component>

@ -1,7 +1,11 @@
dinglehopper dinglehopper
============ ============
dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. dinglehopper is an OCR evaluation tool and reads
[ALTO](https://github.com/altoxml),
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
compares a ground truth (GT) document page with an OCR result page to compute
metrics and a word/character differences report.
[![Build Status](https://travis-ci.org/qurator-spk/dinglehopper.svg?branch=master)](https://travis-ci.org/qurator-spk/dinglehopper) [![Build Status](https://travis-ci.org/qurator-spk/dinglehopper.svg?branch=master)](https://travis-ci.org/qurator-spk/dinglehopper)
@ -23,6 +27,22 @@ sudo pip install .
Usage Usage
----- -----
~~~ ~~~
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
Compare the PAGE/ALTO/text document GT against the document OCR.
The files GT and OCR are usually a ground truth document and the result of
an OCR software, but you may use dinglehopper to compare two OCR results.
In that case, use --no-metrics to disable the then meaningless metrics and
also change the color scheme from green/red to blue.
Options:
--metrics / --no-metrics Enable/disable metrics and green/red
--help Show this message and exit.
~~~
For example:
~~~
dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
~~~ ~~~
This generates `report.html` and `report.json`. This generates `report.html` and `report.json`.
@ -30,13 +50,20 @@ This generates `report.html` and `report.json`.
As an OCR-D processor: As an OCR-D processor:
~~~ ~~~
ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
~~~ ~~~
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup. This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
You may also want to disable metrics and the green-red color scheme by
parameter:
~~~
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -p '{"metrics": false}'
~~~
Testing Testing
------- -------
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests): Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):

@ -44,7 +44,7 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
'''.format(gtx, ocrx) '''.format(gtx, ocrx)
def process(gt, ocr, report_prefix): def process(gt, ocr, report_prefix, *, metrics=True):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@ -91,7 +91,8 @@ def process(gt, ocr, report_prefix):
cer=cer, n_characters=n_characters, cer=cer, n_characters=n_characters,
wer=wer, n_words=n_words, wer=wer, n_words=n_words,
char_diff_report=char_diff_report, char_diff_report=char_diff_report,
word_diff_report=word_diff_report word_diff_report=word_diff_report,
metrics=metrics,
).dump(out_fn) ).dump(out_fn)
@ -99,8 +100,17 @@ def process(gt, ocr, report_prefix):
@click.argument('gt', type=click.Path(exists=True)) @click.argument('gt', type=click.Path(exists=True))
@click.argument('ocr', type=click.Path(exists=True)) @click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report') @click.argument('report_prefix', type=click.Path(), default='report')
def main(gt, ocr, report_prefix): @click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
process(gt, ocr, report_prefix) def main(gt, ocr, report_prefix, metrics):
"""
Compare the PAGE/ALTO/text document GT against the document OCR.
The files GT and OCR are usually a ground truth document and the result of
an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --no-metrics to disable the then meaningless metrics and also
change the color scheme from green/red to blue.
"""
process(gt, ocr, report_prefix, metrics=metrics)
if __name__ == '__main__': if __name__ == '__main__':

@ -16,7 +16,14 @@
], ],
"steps": [ "steps": [
"recognition/text-recognition" "recognition/text-recognition"
] ],
"parameters": {
"metrics": {
"type": "boolean",
"default": true,
"description": "Enable/disable metrics and green/red"
}
}
} }
} }
} }

@ -4,7 +4,7 @@ import os
import click import click
from ocrd import Processor from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import concat_padded, getLogger from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
from pkg_resources import resource_string from pkg_resources import resource_string
from qurator.dinglehopper.cli import process as cli_process from qurator.dinglehopper.cli import process as cli_process
@ -27,20 +27,20 @@ class OcrdDinglehopperEvaluate(Processor):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def _make_file_id(self, input_file, input_file_grp, n):
file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
return file_id
def process(self): def process(self):
assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
assert_file_grp_cardinality(self.output_file_grp, 1)
metrics = self.parameter['metrics']
gt_grp, ocr_grp = self.input_file_grp.split(',') gt_grp, ocr_grp = self.input_file_grp.split(',')
for n, page_id in enumerate(self.workspace.mets.physical_pages): for n, page_id in enumerate(self.workspace.mets.physical_pages):
gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0] gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0]
ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0] ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0]
gt_file = self.workspace.download_file(gt_file)
ocr_file = self.workspace.download_file(ocr_file)
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file) log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file)
file_id = self._make_file_id(ocr_file, ocr_grp, n) file_id = make_file_id(ocr_file, self.output_file_grp)
report_prefix = os.path.join(self.output_file_grp, file_id) report_prefix = os.path.join(self.output_file_grp, file_id)
# Process the files # Process the files
@ -48,7 +48,12 @@ class OcrdDinglehopperEvaluate(Processor):
os.mkdir(self.output_file_grp) os.mkdir(self.output_file_grp)
except FileExistsError: except FileExistsError:
pass pass
cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix) cli_process(
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
metrics=metrics
)
# Add reports to the workspace # Add reports to the workspace
for report_suffix, mimetype in \ for report_suffix, mimetype in \

@ -6,12 +6,18 @@
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<style type="text/css"> <style type="text/css">
{% if metrics %}
.gt .diff { .gt .diff {
color: green; color: green;
} }
.ocr .diff { .ocr .diff {
color: red; color: red;
} }
{% else %}
.gt .diff, .ocr .diff {
color: blue;
}
{% endif %}
.ellipsis { .ellipsis {
opacity: 0.5; opacity: 0.5;
font-style: italic; font-style: italic;
@ -32,9 +38,11 @@
{{ ocr }} {{ ocr }}
{% if metrics %}
<h2>Metrics</h2> <h2>Metrics</h2>
<p>CER: {{ cer|round(4) }}</p> <p>CER: {{ cer|round(4) }}</p>
<p>WER: {{ wer|round(4) }}</p> <p>WER: {{ wer|round(4) }}</p>
{% endif %}
<h2>Character differences</h2> <h2>Character differences</h2>
{{ char_diff_report }} {{ char_diff_report }}

@ -4,4 +4,5 @@ lxml
uniseg uniseg
numpy numpy
colorama colorama
ocrd >= 1.0.0b15 MarkupSafe
ocrd >= 2.13.1

@ -0,0 +1,2 @@
[flake8]
max-line-length = 90
Loading…
Cancel
Save