mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-10 20:29:57 +02:00
🚧 Support 'merged' GT+OCR line directories
This commit is contained in:
parent
6980d7a252
commit
73ee16fe51
1 changed file with 19 additions and 7 deletions
|
@@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
|
|||
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
|
||||
|
||||
|
||||
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
||||
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
|
||||
|
||||
cer = None
|
||||
n_characters = None
|
||||
|
@@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|||
n_words = None
|
||||
word_diff_report = ""
|
||||
|
||||
for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
|
||||
if gt_suffix is not None and ocr_suffix is not None:
|
||||
gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
|
||||
else:
|
||||
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
|
||||
|
||||
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
|
||||
gt_text = plain_extract(gt_fn, include_filename_in_id=True)
|
||||
ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
|
||||
gt_words = words_normalized(gt_text)
|
||||
|
@@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|||
@click.option(
|
||||
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
||||
)
|
||||
def main(gt, ocr, report_prefix, metrics):
|
||||
@click.option("--gt-suffix", help="Suffix of GT line text files")
|
||||
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
|
||||
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
|
||||
"""
|
||||
Compare the GT line text directory against the OCR line text directory.
|
||||
|
||||
This assumes that the GT line text directory contains textfiles with a common
|
||||
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
|
||||
a common suffix like ".some-ocr.txt". The text files also need to be paired,
|
||||
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
|
||||
in the OCT lines directory.
|
||||
i.e. the GT filename "line001.gt.txt" needs to match a filename
|
||||
"line001.some-ocr.txt" in the OCR lines directory.
|
||||
|
||||
The GT and OCR directories are usually round truth line texts and the results of
|
||||
GT and OCR directories may contain line text files in matching subdirectories,
|
||||
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
|
||||
|
||||
GT and OCR directories can also be the same directory, but in this case you need
|
||||
to give --gt-suffix and --ocr-suffix explicitly.
|
||||
|
||||
The GT and OCR directories are usually ground truth line texts and the results of
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
|
@@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics):
|
|||
|
||||
"""
|
||||
initLogging()
|
||||
process(gt, ocr, report_prefix, metrics=metrics)
|
||||
process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue