mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-07 01:19:56 +02:00
✨ dinglehopper: Add CLI option to choose TextEquiv level
This commit is contained in:
parent
75733039b8
commit
9744fa2567
3 changed files with 17 additions and 9 deletions
|
@ -43,8 +43,12 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||||
$REPORT_PREFIX defaults to "report". The reports include the character
|
$REPORT_PREFIX defaults to "report". The reports include the character
|
||||||
error rate (CER) and the word error rate (WER).
|
error rate (CER) and the word error rate (WER).
|
||||||
|
|
||||||
|
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||||
|
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--metrics / --no-metrics Enable/disable metrics and green/red
|
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||||
|
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
||||||
--progress Show progress bar
|
--progress Show progress bar
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
~~~
|
~~~
|
||||||
|
|
|
@ -80,15 +80,15 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
||||||
'''.format(gtx, ocrx)
|
'''.format(gtx, ocrx)
|
||||||
|
|
||||||
|
|
||||||
def process(gt, ocr, report_prefix, *, metrics=True):
|
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level='region'):
|
||||||
"""Check OCR result against GT.
|
"""Check OCR result against GT.
|
||||||
|
|
||||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
||||||
Click on a wrapper.
|
Click on a wrapper.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
gt_text = extract(gt)
|
gt_text = extract(gt, textequiv_level=textequiv_level)
|
||||||
ocr_text = extract(ocr)
|
ocr_text = extract(ocr, textequiv_level=textequiv_level)
|
||||||
|
|
||||||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||||
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
||||||
|
@ -134,8 +134,9 @@ def process(gt, ocr, report_prefix, *, metrics=True):
|
||||||
@click.argument('ocr', type=click.Path(exists=True))
|
@click.argument('ocr', type=click.Path(exists=True))
|
||||||
@click.argument('report_prefix', type=click.Path(), default='report')
|
@click.argument('report_prefix', type=click.Path(), default='report')
|
||||||
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
|
@click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red')
|
||||||
|
@click.option('--textequiv-level', default='region', help='PAGE TextEquiv level to extract text from', metavar='LEVEL')
|
||||||
@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
|
@click.option('--progress', default=False, is_flag=True, help='Show progress bar')
|
||||||
def main(gt, ocr, report_prefix, metrics, progress):
|
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||||
"""
|
"""
|
||||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||||
|
|
||||||
|
@ -150,9 +151,12 @@ def main(gt, ocr, report_prefix, metrics, progress):
|
||||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
||||||
$REPORT_PREFIX defaults to "report". The reports include the character error
|
$REPORT_PREFIX defaults to "report". The reports include the character error
|
||||||
rate (CER) and the word error rate (WER).
|
rate (CER) and the word error rate (WER).
|
||||||
|
|
||||||
|
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||||
|
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||||
"""
|
"""
|
||||||
Config.progress = progress
|
Config.progress = progress
|
||||||
process(gt, ocr, report_prefix, metrics=metrics)
|
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -54,7 +54,7 @@ def page_namespace(tree):
|
||||||
raise ValueError('Not a PAGE tree')
|
raise ValueError('Not a PAGE tree')
|
||||||
|
|
||||||
|
|
||||||
def page_extract(tree, textequiv_level='region'):
|
def page_extract(tree, *, textequiv_level='region'):
|
||||||
"""Extract text from the given PAGE content ElementTree."""
|
"""Extract text from the given PAGE content ElementTree."""
|
||||||
|
|
||||||
# Internally, this is just parsing the Reading Order (if it exists) and
|
# Internally, this is just parsing the Reading Order (if it exists) and
|
||||||
|
@ -87,7 +87,7 @@ def page_extract(tree, textequiv_level='region'):
|
||||||
return ExtractedText(None, regions, '\n', None)
|
return ExtractedText(None, regions, '\n', None)
|
||||||
|
|
||||||
|
|
||||||
def page_text(tree, textequiv_level='region'):
|
def page_text(tree, *, textequiv_level='region'):
|
||||||
return page_extract(tree, textequiv_level=textequiv_level).text
|
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,7 +105,7 @@ def plain_text(filename):
|
||||||
return plain_extract(filename).text
|
return plain_extract(filename).text
|
||||||
|
|
||||||
|
|
||||||
def extract(filename):
|
def extract(filename, *, textequiv_level='region'):
|
||||||
"""Extract the text from the given file.
|
"""Extract the text from the given file.
|
||||||
|
|
||||||
Supports PAGE, ALTO and falls back to plain text.
|
Supports PAGE, ALTO and falls back to plain text.
|
||||||
|
@ -115,7 +115,7 @@ def extract(filename):
|
||||||
except XMLSyntaxError:
|
except XMLSyntaxError:
|
||||||
return plain_extract(filename)
|
return plain_extract(filename)
|
||||||
try:
|
try:
|
||||||
return page_extract(tree)
|
return page_extract(tree, textequiv_level=textequiv_level)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return alto_extract(tree)
|
return alto_extract(tree)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue