dinglehopper: Add CLI option to choose TextEquiv level

pull/38/head
Gerber, Mike 4 years ago
parent 75733039b8
commit 9744fa2567

@ -43,8 +43,12 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
$REPORT_PREFIX defaults to "report". The reports include the character
error rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
Options:
--metrics / --no-metrics Enable/disable metrics and green/red
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
--progress Show progress bar
--help Show this message and exit.
~~~

@ -80,15 +80,15 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
'''.format(gtx, ocrx)
def process(gt, ocr, report_prefix, *, metrics=True):
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
Click on a wrapper.
"""
gt_text = extract(gt)
ocr_text = extract(ocr)
gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr, textequiv_level=textequiv_level)
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
@ -134,8 +134,9 @@ def process(gt, ocr, report_prefix, *, metrics=True):
@click.argument('ocr', type=click.Path(exists=True))
@click.argument('report_prefix', type=click.Path(), default='report')
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
def main(gt, ocr, report_prefix, metrics, progress):
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
"""
Compare the PAGE/ALTO/text document GT against the document OCR.
@ -150,9 +151,12 @@ def main(gt, ocr, report_prefix, metrics, progress):
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
$REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER).
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
"""
Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics)
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
if __name__ == '__main__':

@ -54,7 +54,7 @@ def page_namespace(tree):
raise ValueError('Not a PAGE tree')
def page_extract(tree, textequiv_level='region'):
def page_extract(tree, *, textequiv_level='region'):
"""Extract text from the given PAGE content ElementTree."""
# Internally, this is just parsing the Reading Order (if it exists) and
@ -87,7 +87,7 @@ def page_extract(tree, textequiv_level='region'):
return ExtractedText(None, regions, '\n', None)
def page_text(tree, textequiv_level='region'):
def page_text(tree, *, textequiv_level='region'):
return page_extract(tree, textequiv_level=textequiv_level).text
@ -105,7 +105,7 @@ def plain_text(filename):
return plain_extract(filename).text
def extract(filename):
def extract(filename, *, textequiv_level='region'):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
@ -115,7 +115,7 @@ def extract(filename):
except XMLSyntaxError:
return plain_extract(filename)
try:
return page_extract(tree)
return page_extract(tree, textequiv_level=textequiv_level)
except ValueError:
return alto_extract(tree)

Loading…
Cancel
Save