@ -1,5 +1,6 @@
import itertools
import os
from typing import Callable , Iterator , List , Optional , Tuple
import click
from jinja2 import Environment , FileSystemLoader
@ -12,6 +13,41 @@ from .ocr_files import plain_extract
from . word_error_rate import word_error_rate_n , words_normalized
def removesuffix ( text , suffix ) :
"""
Remove suffix from text .
Can be replaced with str . removesuffix when we only support Python > = 3.9 .
"""
if suffix and text . endswith ( suffix ) :
return text [ : - len ( suffix ) ]
return text
def is_hidden ( filepath ) :
filename = os . path . basename ( os . path . abspath ( filepath ) )
return filename . startswith ( " . " )
def find_all_files (
dir_ : str , pred : Optional [ Callable [ [ str ] , bool ] ] = None , return_hidden : bool = False
) - > Iterator [ str ] :
"""
Find all files in dir_ , returning filenames
If pred is given , pred ( filename ) must be True for the filename .
Does not return hidden files by default .
"""
for root , _ , filenames in os . walk ( dir_ ) :
for fn in filenames :
if not return_hidden and is_hidden ( fn ) :
continue
if pred and not pred ( fn ) :
continue
yield os . path . join ( root , fn )
def all_equal ( iterable ) :
g = itertools . groupby ( iterable )
return next ( g , True ) and not next ( g , False )
@ -25,15 +61,63 @@ def common_suffix(its):
return reversed ( common_prefix ( reversed ( it ) for it in its ) )
def removesuffix ( text , suffix ) :
if suffix and text . endswith ( suffix ) :
return text [ : - len ( suffix ) ]
return text
def find_gt_and_ocr_files (
gt_dir : str , gt_suffix : str , ocr_dir : str , ocr_suffix : str
) - > Iterator [ Tuple [ str , str ] ] :
"""
Find GT files and matching OCR files .
Returns pairs of GT and OCR files .
"""
for gt_fn in find_all_files ( gt_dir , lambda fn : fn . endswith ( gt_suffix ) ) :
ocr_fn = os . path . join (
ocr_dir ,
removesuffix ( os . path . relpath ( gt_fn , start = gt_dir ) , gt_suffix ) + ocr_suffix ,
)
if not os . path . exists ( ocr_fn ) :
raise RuntimeError ( f " { ocr_fn } (matching { gt_fn } ) does not exist " )
yield gt_fn , ocr_fn
def find_gt_and_ocr_files_autodetect ( gt_dir , ocr_dir ) :
"""
Find GT files and matching OCR files , autodetect suffixes .
This only works if gt_dir ( or respectivley ocr_dir ) only contains GT ( OCR )
files with a common suffix . Currently the files must have a suffix , e . g .
" .gt.txt " ( e . g . " .ocr.txt " ) .
Returns pairs of GT and OCR files .
"""
# Autodetect suffixes
gt_files = find_all_files ( gt_dir )
gt_suffix = " " . join ( common_suffix ( gt_files ) )
if len ( gt_suffix ) == 0 :
raise RuntimeError (
f " Files in GT directory { gt_dir } do not have a common suffix "
)
ocr_files = find_all_files ( ocr_dir )
ocr_suffix = " " . join ( common_suffix ( ocr_files ) )
if len ( ocr_suffix ) == 0 :
raise RuntimeError (
f " Files in OCR directory { ocr_dir } do not have a common suffix "
)
yield from find_gt_and_ocr_files ( gt_dir , gt_suffix , ocr_dir , ocr_suffix )
def process ( gt_dir , ocr_dir , report_prefix , * , metrics = True ) :
gt_suffix = " " . join ( common_suffix ( os . listdir ( gt_dir ) ) )
ocr_suffix = " " . join ( common_suffix ( os . listdir ( ocr_dir ) ) )
def process (
gt_dir ,
ocr_dir ,
report_prefix ,
* ,
metrics = True ,
gt_suffix = None ,
ocr_suffix = None ,
plain_encoding = " autodetect " ,
) :
cer = None
n_characters = None
@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_words = None
word_diff_report = " "
for k , gt in enumerate ( os . listdir ( gt_dir ) ) :
# Find a match by replacing the suffix
ocr = removesuffix ( gt , gt_suffix ) + ocr_suffix
if gt_suffix is not None and ocr_suffix is not None :
gt_ocr_files = find_gt_and_ocr_files ( gt_dir , gt_suffix , ocr_dir , ocr_suffix )
else :
gt_ocr_files = find_gt_and_ocr_files_autodetect ( gt_dir , ocr_dir )
gt_text = plain_extract ( os . path . join ( gt_dir , gt ) , include_filename_in_id = True )
for k , ( gt_fn , ocr_fn ) in enumerate ( gt_ocr_files ) :
gt_text = plain_extract (
gt_fn , include_filename_in_id = True , encoding = plain_encoding
)
ocr_text = plain_extract (
os . path . join ( ocr_dir , ocr ) , include_filename_in_id = True
o cr_fn , include_filename_in_id = True , encoding = plain_encoding
)
gt_words = words_normalized ( gt_text )
ocr_words = words_normalized ( ocr_text )
gt_words : List [ str ] = list ( words_normalized ( gt_text ) )
ocr_words : List [ str ] = list ( words_normalized ( ocr_text ) )
# Compute CER
l_cer , l_n_characters = character_error_rate_n ( gt_text , ocr_text )
@ -81,7 +169,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
joiner = " " ,
none = " · " ,
score_hint = score_hint ( l_cer , l_n_characters ) ,
)
) [ 0 ]
word_diff_report + = gen_diff_report (
gt_words ,
ocr_words ,
@ -89,7 +177,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
joiner = " " ,
none = " ⋯ " ,
score_hint = score_hint ( l_wer , l_n_words ) ,
)
) [ 0 ]
env = Environment (
loader = FileSystemLoader (
@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option (
" --metrics/--no-metrics " , default = True , help = " Enable/disable metrics and green/red "
)
def main ( gt , ocr , report_prefix , metrics ) :
@click.option ( " --gt-suffix " , help = " Suffix of GT line text files " )
@click.option ( " --ocr-suffix " , help = " Suffix of OCR line text files " )
@click.option (
" --plain-encoding " ,
default = " autodetect " ,
help = ' Encoding (e.g. " utf-8 " ) of plain text files ' ,
)
def main ( gt , ocr , report_prefix , metrics , gt_suffix , ocr_suffix , plain_encoding ) :
"""
Compare the GT line text directory against the OCR line text directory .
This assumes that the GT line text directory contains textfiles with a common
suffix like " .gt.txt " , and the OCR line text directory contains textfiles with
a common suffix like " .some-ocr.txt " . The text files also need to be paired ,
i . e . the GT file " line001.gt.txt " needs to match a file " line001.some-ocr.txt "
in the OCT lines directory .
i . e . the GT filename " line001.gt.txt " needs to match a filename
" line001.some-ocr.txt " in the OCR lines directory .
GT and OCR directories may contain line text files in matching subdirectories ,
e . g . " GT/goethe_faust/line1.gt.txt " and " OCR/goethe_faust/line1.pred.txt " .
GT and OCR directories can also be the same directory , but in this case you need
to give - - gt - suffix and - - ocr - suffix explicitly .
The GT and OCR directories are usually round truth line texts and the results of
The GT and OCR directories are usually g round truth line texts and the results of
an OCR software , but you may use dinglehopper to compare two OCR results . In
that case , use - - no - metrics to disable the then meaningless metrics and also
change the color scheme from green / red to blue .
@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
$ REPORT_PREFIX defaults to " report " . The reports include the character error
rate ( CER ) and the word error rate ( WER ) .
It is recommended to specify the encoding of the text files , for example with
- - plain - encoding utf - 8. If this option is not given , we try to auto - detect it .
"""
initLogging ( )
process ( gt , ocr , report_prefix , metrics = metrics )
process (
gt ,
ocr ,
report_prefix ,
metrics = metrics ,
gt_suffix = gt_suffix ,
ocr_suffix = ocr_suffix ,
plain_encoding = plain_encoding ,
)
if __name__ == " __main__ " :