🚧 Support 'merged' GT+OCR line directories

feat/flex-line-dirs
Gerber, Mike 1 week ago
parent f82479eff2
commit b93db45ca8

@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
cer = None
n_characters = None
@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_words = None
word_diff_report = ""
for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
if gt_suffix is not None and ocr_suffix is not None:
gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
else:
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
gt_text = plain_extract(gt_fn, include_filename_in_id=True)
ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
gt_words = words_normalized(gt_text)
@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
def main(gt, ocr, report_prefix, metrics):
@click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
"""
Compare the GT line text directory against the OCR line text directory.
This assumes that the GT line text directory contains textfiles with a common
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
a common suffix like ".some-ocr.txt". The text files also need to be paired,
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
in the OCT lines directory.
i.e. the GT filename "line001.gt.txt" needs to match a filename
"line001.some-ocr.txt" in the OCR lines directory.
The GT and OCR directories are usually round truth line texts and the results of
GT and OCR directories may contain line text files in matching subdirectories,
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
GT and OCR directories can also be the same directory, but in this case you need
to give --gt-suffix and --ocr-suffix explicitly.
The GT and OCR directories are usually ground truth line texts and the results of
an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --no-metrics to disable the then meaningless metrics and also
change the color scheme from green/red to blue.
@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics):
"""
initLogging()
process(gt, ocr, report_prefix, metrics=metrics)
process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
if __name__ == "__main__":

Loading…
Cancel
Save