mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
Merge commit '7930ecd428'
This commit is contained in:
commit
717801bdbb
9 changed files with 89 additions and 17 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
@ -1,3 +1,10 @@
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
*.egg-info/
|
||||||
|
|
||||||
# User-specific stuff
|
# User-specific stuff
|
||||||
.idea/**/workspace.xml
|
.idea/**/workspace.xml
|
||||||
.idea/**/tasks.xml
|
.idea/**/tasks.xml
|
||||||
|
|
5
.idea/codeStyles/codeStyleConfig.xml
generated
Normal file
5
.idea/codeStyles/codeStyleConfig.xml
generated
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
<component name="ProjectCodeStyleConfiguration">
|
||||||
|
<state>
|
||||||
|
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
|
||||||
|
</state>
|
||||||
|
</component>
|
31
README.md
31
README.md
|
@ -1,7 +1,11 @@
|
||||||
dinglehopper
|
dinglehopper
|
||||||
============
|
============
|
||||||
|
|
||||||
dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.
|
dinglehopper is an OCR evaluation tool and reads
|
||||||
|
[ALTO](https://github.com/altoxml),
|
||||||
|
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
||||||
|
compares a ground truth (GT) document page with an OCR result page to compute
|
||||||
|
metrics and a word/character differences report.
|
||||||
|
|
||||||
[Build Status](https://travis-ci.org/qurator-spk/dinglehopper)
|
[Build Status](https://travis-ci.org/qurator-spk/dinglehopper)
|
||||||
|
|
||||||
|
@ -23,6 +27,22 @@ sudo pip install .
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
~~~
|
~~~
|
||||||
|
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||||
|
|
||||||
|
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||||
|
|
||||||
|
The files GT and OCR are usually a ground truth document and the result of
|
||||||
|
an OCR software, but you may use dinglehopper to compare two OCR results.
|
||||||
|
In that case, use --no-metrics to disable the then meaningless metrics and
|
||||||
|
also change the color scheme from green/red to blue.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||||
|
--help Show this message and exit.
|
||||||
|
~~~
|
||||||
|
|
||||||
|
For example:
|
||||||
|
~~~
|
||||||
dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
|
dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml
|
||||||
~~~
|
~~~
|
||||||
This generates `report.html` and `report.json`.
|
This generates `report.html` and `report.json`.
|
||||||
|
@ -30,13 +50,20 @@ This generates `report.html` and `report.json`.
|
||||||
|
|
||||||
As an OCR-D processor:
|
As an OCR-D processor:
|
||||||
~~~
|
~~~
|
||||||
ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
|
ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL
|
||||||
~~~
|
~~~
|
||||||
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
|
This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup.
|
||||||
|
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
You may also want to disable metrics and the green-red color scheme by
|
||||||
|
parameter:
|
||||||
|
|
||||||
|
~~~
|
||||||
|
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -p '{"metrics": false}'
|
||||||
|
~~~
|
||||||
|
|
||||||
Testing
|
Testing
|
||||||
-------
|
-------
|
||||||
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
|
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
|
||||||
|
|
|
@ -44,7 +44,7 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
||||||
'''.format(gtx, ocrx)
|
'''.format(gtx, ocrx)
|
||||||
|
|
||||||
|
|
||||||
def process(gt, ocr, report_prefix):
|
def process(gt, ocr, report_prefix, *, metrics=True):
|
||||||
"""Check OCR result against GT.
|
"""Check OCR result against GT.
|
||||||
|
|
||||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
||||||
|
@ -91,7 +91,8 @@ def process(gt, ocr, report_prefix):
|
||||||
cer=cer, n_characters=n_characters,
|
cer=cer, n_characters=n_characters,
|
||||||
wer=wer, n_words=n_words,
|
wer=wer, n_words=n_words,
|
||||||
char_diff_report=char_diff_report,
|
char_diff_report=char_diff_report,
|
||||||
word_diff_report=word_diff_report
|
word_diff_report=word_diff_report,
|
||||||
|
metrics=metrics,
|
||||||
).dump(out_fn)
|
).dump(out_fn)
|
||||||
|
|
||||||
|
|
||||||
|
@ -99,8 +100,17 @@ def process(gt, ocr, report_prefix):
|
||||||
@click.argument('gt', type=click.Path(exists=True))
|
@click.argument('gt', type=click.Path(exists=True))
|
||||||
@click.argument('ocr', type=click.Path(exists=True))
|
@click.argument('ocr', type=click.Path(exists=True))
|
||||||
@click.argument('report_prefix', type=click.Path(), default='report')
|
@click.argument('report_prefix', type=click.Path(), default='report')
|
||||||
def main(gt, ocr, report_prefix):
|
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
|
||||||
process(gt, ocr, report_prefix)
|
def main(gt, ocr, report_prefix, metrics):
|
||||||
|
"""
|
||||||
|
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||||
|
|
||||||
|
The files GT and OCR are usually a ground truth document and the result of
|
||||||
|
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||||
|
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||||
|
change the color scheme from green/red to blue.
|
||||||
|
"""
|
||||||
|
process(gt, ocr, report_prefix, metrics=metrics)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -16,7 +16,14 @@
|
||||||
],
|
],
|
||||||
"steps": [
|
"steps": [
|
||||||
"recognition/text-recognition"
|
"recognition/text-recognition"
|
||||||
]
|
],
|
||||||
|
"parameters": {
|
||||||
|
"metrics": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": true,
|
||||||
|
"description": "Enable/disable metrics and green/red"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ import os
|
||||||
import click
|
import click
|
||||||
from ocrd import Processor
|
from ocrd import Processor
|
||||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||||
from ocrd_utils import concat_padded, getLogger
|
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
|
||||||
from pkg_resources import resource_string
|
from pkg_resources import resource_string
|
||||||
|
|
||||||
from qurator.dinglehopper.cli import process as cli_process
|
from qurator.dinglehopper.cli import process as cli_process
|
||||||
|
@ -27,20 +27,20 @@ class OcrdDinglehopperEvaluate(Processor):
|
||||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
|
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper']
|
||||||
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
|
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
def _make_file_id(self, input_file, input_file_grp, n):
|
|
||||||
file_id = input_file.ID.replace(input_file_grp, self.output_file_grp)
|
|
||||||
if file_id == input_file.ID:
|
|
||||||
file_id = concat_padded(self.output_file_grp, n)
|
|
||||||
return file_id
|
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
|
assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
|
||||||
|
assert_file_grp_cardinality(self.output_file_grp, 1)
|
||||||
|
|
||||||
|
metrics = self.parameter['metrics']
|
||||||
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
||||||
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
||||||
gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0]
|
gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0]
|
||||||
ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0]
|
ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0]
|
||||||
|
gt_file = self.workspace.download_file(gt_file)
|
||||||
|
ocr_file = self.workspace.download_file(ocr_file)
|
||||||
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
||||||
|
|
||||||
file_id = self._make_file_id(ocr_file, ocr_grp, n)
|
file_id = make_file_id(ocr_file, self.output_file_grp)
|
||||||
report_prefix = os.path.join(self.output_file_grp, file_id)
|
report_prefix = os.path.join(self.output_file_grp, file_id)
|
||||||
|
|
||||||
# Process the files
|
# Process the files
|
||||||
|
@ -48,7 +48,12 @@ class OcrdDinglehopperEvaluate(Processor):
|
||||||
os.mkdir(self.output_file_grp)
|
os.mkdir(self.output_file_grp)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix)
|
cli_process(
|
||||||
|
gt_file.local_filename,
|
||||||
|
ocr_file.local_filename,
|
||||||
|
report_prefix,
|
||||||
|
metrics=metrics
|
||||||
|
)
|
||||||
|
|
||||||
# Add reports to the workspace
|
# Add reports to the workspace
|
||||||
for report_suffix, mimetype in \
|
for report_suffix, mimetype in \
|
||||||
|
|
|
@ -6,12 +6,18 @@
|
||||||
|
|
||||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||||
<style type="text/css">
|
<style type="text/css">
|
||||||
|
{% if metrics %}
|
||||||
.gt .diff {
|
.gt .diff {
|
||||||
color: green;
|
color: green;
|
||||||
}
|
}
|
||||||
.ocr .diff {
|
.ocr .diff {
|
||||||
color: red;
|
color: red;
|
||||||
}
|
}
|
||||||
|
{% else %}
|
||||||
|
.gt .diff, .ocr .diff {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
{% endif %}
|
||||||
.ellipsis {
|
.ellipsis {
|
||||||
opacity: 0.5;
|
opacity: 0.5;
|
||||||
font-style: italic;
|
font-style: italic;
|
||||||
|
@ -32,9 +38,11 @@
|
||||||
{{ ocr }}
|
{{ ocr }}
|
||||||
|
|
||||||
|
|
||||||
|
{% if metrics %}
|
||||||
<h2>Metrics</h2>
|
<h2>Metrics</h2>
|
||||||
<p>CER: {{ cer|round(4) }}</p>
|
<p>CER: {{ cer|round(4) }}</p>
|
||||||
<p>WER: {{ wer|round(4) }}</p>
|
<p>WER: {{ wer|round(4) }}</p>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<h2>Character differences</h2>
|
<h2>Character differences</h2>
|
||||||
{{ char_diff_report }}
|
{{ char_diff_report }}
|
||||||
|
|
|
@ -4,4 +4,5 @@ lxml
|
||||||
uniseg
|
uniseg
|
||||||
numpy
|
numpy
|
||||||
colorama
|
colorama
|
||||||
ocrd >= 1.0.0b15
|
MarkupSafe
|
||||||
|
ocrd >= 2.13.1
|
||||||
|
|
2
setup.cfg
Normal file
2
setup.cfg
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
[flake8]
|
||||||
|
max-line-length = 90
|
Loading…
Add table
Add a link
Reference in a new issue