🚧 dinglehopper: Compare line text directories (WIP)
continuous-integration/drone/push Build encountered an error Details

pull/66/head
Gerber, Mike 3 years ago
parent a018006f98
commit dbb660615a

@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_characters = None n_characters = None
char_diff_report = "" char_diff_report = ""
for gt in os.listdir(gt_dir): for k, gt in enumerate(os.listdir(gt_dir)):
# Find a match by replacing the suffix # Find a match by replacing the suffix
ocr = removesuffix(gt, gt_suffix) + ocr_suffix ocr = removesuffix(gt, gt_suffix) + ocr_suffix
gt_text = plain_extract(os.path.join(gt_dir, gt)) gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
ocr_text = plain_extract(os.path.join(ocr_dir, ocr)) ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
# Compute CER # Compute CER
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
wer = 9999; n_words = 0 wer = 9999; n_words = 0
char_diff_report += gen_diff_report( char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·" gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
) )
# TODO # TODO

@ -1,8 +1,9 @@
from __future__ import division, print_function from __future__ import division, print_function
import os
import sys
from typing import Iterator from typing import Iterator
from warnings import warn from warnings import warn
import sys
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename): def plain_extract(filename, include_filename_in_id=False):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
with open(filename, "r") as f: with open(filename, "r") as f:
return ExtractedText( return ExtractedText(
None, None,
[ [
ExtractedText("line %d" % no, None, None, normalize_sbb(line)) ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None, None, normalize_sbb(line))
for no, line in enumerate(f.readlines()) for no, line in enumerate(f.readlines())
], ],
"\n", "\n",

Loading…
Cancel
Save