|
|
@ -1,5 +1,6 @@
|
|
|
|
import itertools
|
|
|
|
import itertools
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
|
|
|
|
from typing import Callable, Iterator, List, Optional, Tuple
|
|
|
|
|
|
|
|
|
|
|
|
import click
|
|
|
|
import click
|
|
|
|
from jinja2 import Environment, FileSystemLoader
|
|
|
|
from jinja2 import Environment, FileSystemLoader
|
|
|
@ -12,6 +13,41 @@ from .ocr_files import plain_extract
|
|
|
|
from .word_error_rate import word_error_rate_n, words_normalized
|
|
|
|
from .word_error_rate import word_error_rate_n, words_normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def removesuffix(text, suffix):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
Remove suffix from text.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Can be replaced with str.removesuffix when we only support Python >= 3.9.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
if suffix and text.endswith(suffix):
|
|
|
|
|
|
|
|
return text[: -len(suffix)]
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_hidden(filepath):
|
|
|
|
|
|
|
|
filename = os.path.basename(os.path.abspath(filepath))
|
|
|
|
|
|
|
|
return filename.startswith(".")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_all_files(
|
|
|
|
|
|
|
|
dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
|
|
|
|
|
|
|
|
) -> Iterator[str]:
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
Find all files in dir_, returning filenames
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
If pred is given, pred(filename) must be True for the filename.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Does not return hidden files by default.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
for root, _, filenames in os.walk(dir_):
|
|
|
|
|
|
|
|
for fn in filenames:
|
|
|
|
|
|
|
|
if not return_hidden and is_hidden(fn):
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
if pred and not pred(fn):
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
yield os.path.join(root, fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def all_equal(iterable):
|
|
|
|
def all_equal(iterable):
|
|
|
|
g = itertools.groupby(iterable)
|
|
|
|
g = itertools.groupby(iterable)
|
|
|
|
return next(g, True) and not next(g, False)
|
|
|
|
return next(g, True) and not next(g, False)
|
|
|
@ -25,15 +61,63 @@ def common_suffix(its):
|
|
|
|
return reversed(common_prefix(reversed(it) for it in its))
|
|
|
|
return reversed(common_prefix(reversed(it) for it in its))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def removesuffix(text, suffix):
|
|
|
|
def find_gt_and_ocr_files(
|
|
|
|
if suffix and text.endswith(suffix):
|
|
|
|
gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
|
|
|
|
return text[: -len(suffix)]
|
|
|
|
) -> Iterator[Tuple[str, str]]:
|
|
|
|
return text
|
|
|
|
"""
|
|
|
|
|
|
|
|
Find GT files and matching OCR files.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Returns pairs of GT and OCR files.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
|
|
|
|
|
|
|
|
ocr_fn = os.path.join(
|
|
|
|
|
|
|
|
ocr_dir,
|
|
|
|
|
|
|
|
removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
if not os.path.exists(ocr_fn):
|
|
|
|
|
|
|
|
raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
yield gt_fn, ocr_fn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
Find GT files and matching OCR files, autodetect suffixes.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
|
|
|
|
|
|
|
|
files with a common suffix. Currently the files must have a suffix, e.g.
|
|
|
|
|
|
|
|
".gt.txt" (e.g. ".ocr.txt").
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Returns pairs of GT and OCR files.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Autodetect suffixes
|
|
|
|
|
|
|
|
gt_files = find_all_files(gt_dir)
|
|
|
|
|
|
|
|
gt_suffix = "".join(common_suffix(gt_files))
|
|
|
|
|
|
|
|
if len(gt_suffix) == 0:
|
|
|
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
|
|
|
f"Files in GT directory {gt_dir} do not have a common suffix"
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
ocr_files = find_all_files(ocr_dir)
|
|
|
|
|
|
|
|
ocr_suffix = "".join(common_suffix(ocr_files))
|
|
|
|
|
|
|
|
if len(ocr_suffix) == 0:
|
|
|
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
|
|
|
f"Files in OCR directory {ocr_dir} do not have a common suffix"
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|
|
|
def process(
|
|
|
|
gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
|
|
|
|
gt_dir,
|
|
|
|
ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
|
|
|
|
ocr_dir,
|
|
|
|
|
|
|
|
report_prefix,
|
|
|
|
|
|
|
|
*,
|
|
|
|
|
|
|
|
metrics=True,
|
|
|
|
|
|
|
|
gt_suffix=None,
|
|
|
|
|
|
|
|
ocr_suffix=None,
|
|
|
|
|
|
|
|
plain_encoding="autodetect",
|
|
|
|
|
|
|
|
):
|
|
|
|
|
|
|
|
|
|
|
|
cer = None
|
|
|
|
cer = None
|
|
|
|
n_characters = None
|
|
|
|
n_characters = None
|
|
|
@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|
|
|
n_words = None
|
|
|
|
n_words = None
|
|
|
|
word_diff_report = ""
|
|
|
|
word_diff_report = ""
|
|
|
|
|
|
|
|
|
|
|
|
for k, gt in enumerate(os.listdir(gt_dir)):
|
|
|
|
if gt_suffix is not None and ocr_suffix is not None:
|
|
|
|
# Find a match by replacing the suffix
|
|
|
|
gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
|
|
|
|
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
|
|
|
|
else:
|
|
|
|
|
|
|
|
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
|
|
|
|
|
|
|
|
|
|
|
|
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
|
|
|
|
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
|
|
|
|
|
|
|
|
gt_text = plain_extract(
|
|
|
|
|
|
|
|
gt_fn, include_filename_in_id=True, encoding=plain_encoding
|
|
|
|
|
|
|
|
)
|
|
|
|
ocr_text = plain_extract(
|
|
|
|
ocr_text = plain_extract(
|
|
|
|
os.path.join(ocr_dir, ocr), include_filename_in_id=True
|
|
|
|
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
|
|
|
|
)
|
|
|
|
)
|
|
|
|
gt_words = words_normalized(gt_text)
|
|
|
|
gt_words: List[str] = list(words_normalized(gt_text))
|
|
|
|
ocr_words = words_normalized(ocr_text)
|
|
|
|
ocr_words: List[str] = list(words_normalized(ocr_text))
|
|
|
|
|
|
|
|
|
|
|
|
# Compute CER
|
|
|
|
# Compute CER
|
|
|
|
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
|
|
|
|
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
|
|
|
@ -81,7 +169,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|
|
|
joiner="",
|
|
|
|
joiner="",
|
|
|
|
none="·",
|
|
|
|
none="·",
|
|
|
|
score_hint=score_hint(l_cer, l_n_characters),
|
|
|
|
score_hint=score_hint(l_cer, l_n_characters),
|
|
|
|
)
|
|
|
|
)[0]
|
|
|
|
word_diff_report += gen_diff_report(
|
|
|
|
word_diff_report += gen_diff_report(
|
|
|
|
gt_words,
|
|
|
|
gt_words,
|
|
|
|
ocr_words,
|
|
|
|
ocr_words,
|
|
|
@ -89,7 +177,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|
|
|
joiner=" ",
|
|
|
|
joiner=" ",
|
|
|
|
none="⋯",
|
|
|
|
none="⋯",
|
|
|
|
score_hint=score_hint(l_wer, l_n_words),
|
|
|
|
score_hint=score_hint(l_wer, l_n_words),
|
|
|
|
)
|
|
|
|
)[0]
|
|
|
|
|
|
|
|
|
|
|
|
env = Environment(
|
|
|
|
env = Environment(
|
|
|
|
loader=FileSystemLoader(
|
|
|
|
loader=FileSystemLoader(
|
|
|
@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|
|
|
@click.option(
|
|
|
|
@click.option(
|
|
|
|
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
|
|
|
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
def main(gt, ocr, report_prefix, metrics):
|
|
|
|
@click.option("--gt-suffix", help="Suffix of GT line text files")
|
|
|
|
|
|
|
|
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
|
|
|
|
|
|
|
|
@click.option(
|
|
|
|
|
|
|
|
"--plain-encoding",
|
|
|
|
|
|
|
|
default="autodetect",
|
|
|
|
|
|
|
|
help='Encoding (e.g. "utf-8") of plain text files',
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
Compare the GT line text directory against the OCR line text directory.
|
|
|
|
Compare the GT line text directory against the OCR line text directory.
|
|
|
|
|
|
|
|
|
|
|
|
This assumes that the GT line text directory contains textfiles with a common
|
|
|
|
This assumes that the GT line text directory contains textfiles with a common
|
|
|
|
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
|
|
|
|
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
|
|
|
|
a common suffix like ".some-ocr.txt". The text files also need to be paired,
|
|
|
|
a common suffix like ".some-ocr.txt". The text files also need to be paired,
|
|
|
|
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
|
|
|
|
i.e. the GT filename "line001.gt.txt" needs to match a filename
|
|
|
|
in the OCT lines directory.
|
|
|
|
"line001.some-ocr.txt" in the OCR lines directory.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
GT and OCR directories may contain line text files in matching subdirectories,
|
|
|
|
|
|
|
|
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
|
|
|
|
|
|
|
|
|
|
|
|
The GT and OCR directories are usually round truth line texts and the results of
|
|
|
|
GT and OCR directories can also be the same directory, but in this case you need
|
|
|
|
|
|
|
|
to give --gt-suffix and --ocr-suffix explicitly.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
The GT and OCR directories are usually ground truth line texts and the results of
|
|
|
|
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
|
|
|
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
|
|
|
that case, use --no-metrics to disable the then meaningless metrics and also
|
|
|
|
that case, use --no-metrics to disable the then meaningless metrics and also
|
|
|
|
change the color scheme from green/red to blue.
|
|
|
|
change the color scheme from green/red to blue.
|
|
|
@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
|
|
|
|
$REPORT_PREFIX defaults to "report". The reports include the character error
|
|
|
|
$REPORT_PREFIX defaults to "report". The reports include the character error
|
|
|
|
rate (CER) and the word error rate (WER).
|
|
|
|
rate (CER) and the word error rate (WER).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
It is recommended to specify the encoding of the text files, for example with
|
|
|
|
|
|
|
|
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
initLogging()
|
|
|
|
initLogging()
|
|
|
|
process(gt, ocr, report_prefix, metrics=metrics)
|
|
|
|
process(
|
|
|
|
|
|
|
|
gt,
|
|
|
|
|
|
|
|
ocr,
|
|
|
|
|
|
|
|
report_prefix,
|
|
|
|
|
|
|
|
metrics=metrics,
|
|
|
|
|
|
|
|
gt_suffix=gt_suffix,
|
|
|
|
|
|
|
|
ocr_suffix=ocr_suffix,
|
|
|
|
|
|
|
|
plain_encoding=plain_encoding,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if __name__ == "__main__":
|
|
|
|