From d706ef46216912fea6ebbc54f943625f284f4874 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 30 Sep 2020 17:58:05 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20CER/WER=20and=20the?= =?UTF-8?q?=20format=20detection=20(Fixes=20GH-26)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 6 +++++- qurator/dinglehopper/cli.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0e894eb..025c3e7 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,17 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] Compare the PAGE/ALTO/text document GT against the document OCR. + dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract + their text and falls back to plain text if no ALTO or PAGE is detected. + The files GT and OCR are usually a ground truth document and the result of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. The comparison report will be written to $REPORT_PREFIX.{html,json}, where - $REPORT_PREFIX defaults to "report". + $REPORT_PREFIX defaults to "report". The reports include the character + error rate (CER) and the word error rate (WER). Options: --metrics / --no-metrics Enable/disable metrics and green/red diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 1e53a94..759d040 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -105,13 +105,17 @@ def main(gt, ocr, report_prefix, metrics): """ Compare the PAGE/ALTO/text document GT against the document OCR. + dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract + their text and falls back to plain text if no ALTO or PAGE is detected. + The files GT and OCR are usually a ground truth document and the result of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. The comparison report will be written to $REPORT_PREFIX.{html,json}, where - $REPORT_PREFIX defaults to "report". + $REPORT_PREFIX defaults to "report". The reports include the character error + rate (CER) and the word error rate (WER). """ process(gt, ocr, report_prefix, metrics=metrics)