🚧 dinglehopper: Compare line text directories (WIP)
continuous-integration/drone/push: Build encountered an error (Details)

pull/66/head
Gerber, Mike 2 years ago
parent a018006f98
commit dbb660615a

@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_characters = None
char_diff_report = ""
for gt in os.listdir(gt_dir):
for k, gt in enumerate(os.listdir(gt_dir)):
# Find a match by replacing the suffix
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
gt_text = plain_extract(os.path.join(gt_dir, gt))
ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
# Compute CER
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
wer = 9999; n_words = 0
char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
)
# TODO

@ -1,8 +1,9 @@
from __future__ import division, print_function
import os
import sys
from typing import Iterator
from warnings import warn
import sys
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text
def plain_extract(filename):
def plain_extract(filename, include_filename_in_id=False):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
with open(filename, "r") as f:
return ExtractedText(
None,
[
ExtractedText("line %d" % no, None, None, normalize_sbb(line))
ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None, None, normalize_sbb(line))
for no, line in enumerate(f.readlines())
],
"\n",

Loading…
Cancel
Save