mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 14:40:00 +02:00
Merge commit '7930ecd428
'
This commit is contained in:
commit
717801bdbb
9 changed files with 89 additions and 17 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
@ -1,3 +1,10 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
# Distribution / packaging
|
||||
*.egg-info/
|
||||
|
||||
# User-specific stuff
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
|
|
5
.idea/codeStyles/codeStyleConfig.xml
generated
Normal file
5
.idea/codeStyles/codeStyleConfig.xml
generated
Normal file
|
@ -0,0 +1,5 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<state>
|
||||
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
|
||||
</state>
|
||||
</component>
|
31
README.md
31
README.md
|
@ -1,7 +1,11 @@
|
|||
dinglehopper
|
||||
============
|
||||
|
||||
dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.
|
||||
dinglehopper is an OCR evaluation tool and reads
|
||||
[ALTO](https://github.com/altoxml),
|
||||
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
||||
compares a ground truth (GT) document page with an OCR result page to compute
|
||||
metrics and a word/character differences report.
|
||||
|
||||
[Build Status](https://travis-ci.org/qurator-spk/dinglehopper)
|
||||
|
||||
|
@ -23,6 +27,22 @@ sudo pip install .
|
|||
Usage
|
||||
-----
|
||||
~~~
|
||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
The files GT and OCR are usually a ground truth document and the result of
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results.
|
||||
In that case, use --no-metrics to disable the then meaningless metrics and
|
||||
also change the color scheme from green/red to blue.
|
||||
|
||||
Options:
|
||||
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||
--help Show this message and exit.
|
||||
~~~
|
||||
|
||||
For example:
|
||||
~~~
|
||||
dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
|
||||
~~~
|
||||
This generates `report.html` and `report.json`.
|
||||
|
@ -30,13 +50,20 @@ This generates `report.html` and `report.json`.
|
|||
|
||||
As an OCR-D processor:
|
||||
~~~
|
||||
ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
|
||||
ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
|
||||
~~~
|
||||
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
|
||||
|
||||
|
||||

|
||||
|
||||
You may also want to disable metrics and the green-red color scheme by
|
||||
parameter:
|
||||
|
||||
~~~
|
||||
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -p '{"metrics": false}'
|
||||
~~~
|
||||
|
||||
Testing
|
||||
-------
|
||||
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
|
||||
|
|
|
@ -44,7 +44,7 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
|||
'''.format(gtx, ocrx)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix):
|
||||
def process(gt, ocr, report_prefix, *, metrics=True):
|
||||
"""Check OCR result against GT.
|
||||
|
||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
||||
|
@ -91,7 +91,8 @@ def process(gt, ocr, report_prefix):
|
|||
cer=cer, n_characters=n_characters,
|
||||
wer=wer, n_words=n_words,
|
||||
char_diff_report=char_diff_report,
|
||||
word_diff_report=word_diff_report
|
||||
word_diff_report=word_diff_report,
|
||||
metrics=metrics,
|
||||
).dump(out_fn)
|
||||
|
||||
|
||||
|
@ -99,8 +100,17 @@ def process(gt, ocr, report_prefix):
|
|||
@click.argument('gt', type=click.Path(exists=True))
|
||||
@click.argument('ocr', type=click.Path(exists=True))
|
||||
@click.argument('report_prefix', type=click.Path(), default='report')
|
||||
def main(gt, ocr, report_prefix):
|
||||
process(gt, ocr, report_prefix)
|
||||
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
|
||||
def main(gt, ocr, report_prefix, metrics):
|
||||
"""
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
The files GT and OCR are usually a ground truth document and the result of
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
"""
|
||||
process(gt, ocr, report_prefix, metrics=metrics)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -16,7 +16,14 @@
|
|||
],
|
||||
"steps": [
|
||||
"recognition/text-recognition"
|
||||
]
|
||||
],
|
||||
"parameters": {
|
||||
"metrics": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable/disable metrics and green/red"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ import os
|
|||
import click
|
||||
from ocrd import Processor
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
from ocrd_utils import concat_padded, getLogger
|
||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
|
||||
from pkg_resources import resource_string
|
||||
|
||||
from qurator.dinglehopper.cli import process as cli_process
|
||||
|
@ -27,20 +27,20 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
|
||||
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
|
||||
|
||||
def _make_file_id(self, input_file, input_file_grp, n):
|
||||
file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
|
||||
if file_id == input_file.ID:
|
||||
file_id = concat_padded(self.output_file_grp, n)
|
||||
return file_id
|
||||
|
||||
def process(self):
|
||||
assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
|
||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||
|
||||
metrics = self.parameter['metrics']
|
||||
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
||||
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
||||
gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0]
|
||||
ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0]
|
||||
gt_file = self.workspace.download_file(gt_file)
|
||||
ocr_file = self.workspace.download_file(ocr_file)
|
||||
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
||||
|
||||
file_id = self._make_file_id(ocr_file, ocr_grp, n)
|
||||
file_id = make_file_id(ocr_file, self.output_file_grp)
|
||||
report_prefix = os.path.join(self.output_file_grp, file_id)
|
||||
|
||||
# Process the files
|
||||
|
@ -48,7 +48,12 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
os.mkdir(self.output_file_grp)
|
||||
except FileExistsError:
|
||||
pass
|
||||
cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix)
|
||||
cli_process(
|
||||
gt_file.local_filename,
|
||||
ocr_file.local_filename,
|
||||
report_prefix,
|
||||
metrics=metrics
|
||||
)
|
||||
|
||||
# Add reports to the workspace
|
||||
for report_suffix, mimetype in \
|
||||
|
|
|
@ -6,12 +6,18 @@
|
|||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
{% if metrics %}
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
{% else %}
|
||||
.gt .diff, .ocr .diff {
|
||||
color: blue;
|
||||
}
|
||||
{% endif %}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
|
@ -32,9 +38,11 @@
|
|||
{{ ocr }}
|
||||
|
||||
|
||||
{% if metrics %}
|
||||
<h2>Metrics</h2>
|
||||
<p>CER: {{ cer|round(4) }}</p>
|
||||
<p>WER: {{ wer|round(4) }}</p>
|
||||
{% endif %}
|
||||
|
||||
<h2>Character differences</h2>
|
||||
{{ char_diff_report }}
|
||||
|
|
|
@ -4,4 +4,5 @@ lxml
|
|||
uniseg
|
||||
numpy
|
||||
colorama
|
||||
ocrd >= 1.0.0b15
|
||||
MarkupSafe
|
||||
ocrd >= 2.13.1
|
||||
|
|
2
setup.cfg
Normal file
2
setup.cfg
Normal file
|
@ -0,0 +1,2 @@
|
|||
[flake8]
|
||||
max-line-length = 90
|
Loading…
Add table
Add a link
Reference in a new issue