dinglehopper: Add CLI option to choose TextEquiv level

pull/38/head
Gerber, Mike 4 years ago
parent 75733039b8
commit 9744fa2567

@ -43,8 +43,12 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
$REPORT_PREFIX defaults to "report". The reports include the character $REPORT_PREFIX defaults to "report". The reports include the character
error rate (CER) and the word error rate (WER). error rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted at the 'region' level. You may
use "--textequiv-level line" to extract the text at the level of TextLine elements instead.
Options: Options:
--metrics / --no-metrics Enable/disable metrics and green/red --metrics / --no-metrics Enable/disable metrics and green/red
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
--progress Show progress bar --progress Show progress bar
--help Show this message and exit. --help Show this message and exit.
~~~ ~~~

@ -80,15 +80,15 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
'''.format(gtx, ocrx) '''.format(gtx, ocrx)
def process(gt, ocr, report_prefix, *, metrics=True): def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
Click on a wrapper. Click on a wrapper.
""" """
gt_text = extract(gt) gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr) ocr_text = extract(ocr, textequiv_level=textequiv_level)
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text)
@ -134,8 +134,9 @@ def process(gt, ocr, report_prefix, *, metrics=True):
@click.argument('ocr', type=click.Path(exists=True)) @click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report') @click.argument('report_prefix', type=click.Path(), default='report')
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red') @click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
@click.option('--progress', default=False, is_flag=True, help='Show progress bar') @click.option('--progress', default=False, is_flag=True, help='Show progress bar')
def main(gt, ocr, report_prefix, metrics, progress): def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -150,9 +151,12 @@ def main(gt, ocr, report_prefix, metrics, progress):
The comparison report will be written to $REPORT_PREFIX.{html,json}, where The comparison report will be written to $REPORT_PREFIX.{html,json}, where
$REPORT_PREFIX defaults to "report". The reports include the character error $REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER). rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted at the 'region' level. You may
use "--textequiv-level line" to extract the text at the level of TextLine elements instead.
""" """
Config.progress = progress Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics) process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
if __name__ == '__main__': if __name__ == '__main__':

@ -54,7 +54,7 @@ def page_namespace(tree):
raise ValueError('Not a PAGE tree') raise ValueError('Not a PAGE tree')
def page_extract(tree, textequiv_level='region'): def page_extract(tree, *, textequiv_level='region'):
"""Extract text from the given PAGE content ElementTree.""" """Extract text from the given PAGE content ElementTree."""
# Internally, this is just parsing the Reading Order (if it exists) and # Internally, this is just parsing the Reading Order (if it exists) and
@ -87,7 +87,7 @@ def page_extract(tree, textequiv_level='region'):
return ExtractedText(None, regions, '\n', None) return ExtractedText(None, regions, '\n', None)
def page_text(tree, textequiv_level='region'): def page_text(tree, *, textequiv_level='region'):
return page_extract(tree, textequiv_level=textequiv_level).text return page_extract(tree, textequiv_level=textequiv_level).text
@ -105,7 +105,7 @@ def plain_text(filename):
return plain_extract(filename).text return plain_extract(filename).text
def extract(filename): def extract(filename, *, textequiv_level='region'):
"""Extract the text from the given file. """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text. Supports PAGE, ALTO and falls back to plain text.
@ -115,7 +115,7 @@ def extract(filename):
except XMLSyntaxError: except XMLSyntaxError:
return plain_extract(filename) return plain_extract(filename)
try: try:
return page_extract(tree) return page_extract(tree, textequiv_level=textequiv_level)
except ValueError: except ValueError:
return alto_extract(tree) return alto_extract(tree)

Loading…
Cancel
Save