🚧 Add option for text encoding to line dir cli

feat/flex-line-dirs
Gerber, Mike 4 weeks ago
parent 0920247a45
commit eca76a7db2

@ -114,6 +114,7 @@ def process(
metrics: bool = True, metrics: bool = True,
differences: bool = False, differences: bool = False,
textequiv_level: str = "region", textequiv_level: str = "region",
plain_encoding: str = "autodetect",
) -> None: ) -> None:
"""Check OCR result against GT. """Check OCR result against GT.
@ -121,8 +122,12 @@ def process(
this undecorated version and use Click on a wrapper. this undecorated version and use Click on a wrapper.
""" """
gt_text = extract(gt, textequiv_level=textequiv_level) gt_text = extract(
ocr_text = extract(ocr, textequiv_level=textequiv_level) gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
)
ocr_text = extract(
ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text)) gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text)) ocr_words: List[str] = list(words_normalized(ocr_text))
@ -195,6 +200,7 @@ def process_dir(
metrics: bool = True, metrics: bool = True,
differences: bool = False, differences: bool = False,
textequiv_level: str = "region", textequiv_level: str = "region",
plain_encoding: str = "autodetect",
) -> None: ) -> None:
for gt_file in os.listdir(gt): for gt_file in os.listdir(gt):
gt_file_path = os.path.join(gt, gt_file) gt_file_path = os.path.join(gt, gt_file)
@ -209,6 +215,7 @@ def process_dir(
metrics=metrics, metrics=metrics,
differences=differences, differences=differences,
textequiv_level=textequiv_level, textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
) )
else: else:
print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@ -233,6 +240,11 @@ def process_dir(
help="PAGE TextEquiv level to extract text from", help="PAGE TextEquiv level to extract text from",
metavar="LEVEL", metavar="LEVEL",
) )
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
@click.version_option() @click.version_option()
def main( def main(
@ -243,6 +255,7 @@ def main(
metrics, metrics,
differences, differences,
textequiv_level, textequiv_level,
plain_encoding,
progress, progress,
): ):
""" """
@ -280,6 +293,7 @@ def main(
metrics=metrics, metrics=metrics,
differences=differences, differences=differences,
textequiv_level=textequiv_level, textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
) )
else: else:
process( process(
@ -290,6 +304,7 @@ def main(
metrics=metrics, metrics=metrics,
differences=differences, differences=differences,
textequiv_level=textequiv_level, textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
) )

Loading…
Cancel
Save