mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 03:40:12 +02:00
🚧 dinglehopper: Compare line text directories (WIP)
Some checks reported errors
continuous-integration/drone/push Build encountered an error
Some checks reported errors
continuous-integration/drone/push Build encountered an error
This commit is contained in:
parent
a018006f98
commit
dbb660615a
2 changed files with 11 additions and 7 deletions
|
@@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|||
n_characters = None
|
||||
char_diff_report = ""
|
||||
|
||||
for gt in os.listdir(gt_dir):
|
||||
for k, gt in enumerate(os.listdir(gt_dir)):
|
||||
# Find a match by replacing the suffix
|
||||
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
|
||||
|
||||
gt_text = plain_extract(os.path.join(gt_dir, gt))
|
||||
ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
|
||||
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
|
||||
ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
|
||||
|
||||
# Compute CER
|
||||
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||
|
@@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|||
wer = 9999; n_words = 0
|
||||
|
||||
char_diff_report += gen_diff_report(
|
||||
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
|
||||
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
|
||||
)
|
||||
|
||||
# TODO
|
||||
|
|
|
@@ -1,8 +1,9 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
from typing import Iterator
|
||||
from warnings import warn
|
||||
import sys
|
||||
|
||||
from lxml import etree as ET
|
||||
from lxml.etree import XMLSyntaxError
|
||||
|
@@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
|
|||
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||
|
||||
|
||||
def plain_extract(filename):
|
||||
def plain_extract(filename, include_filename_in_id=False):
|
||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||
with open(filename, "r") as f:
|
||||
return ExtractedText(
|
||||
None,
|
||||
[
|
||||
ExtractedText("line %d" % no, None, None, normalize_sbb(line))
|
||||
ExtractedText(
|
||||
id_template.format(filename=os.path.basename(filename), no=no),
|
||||
None, None, normalize_sbb(line))
|
||||
for no, line in enumerate(f.readlines())
|
||||
],
|
||||
"\n",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue