mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-07 01:19:56 +02:00
🚧 dinglehopper: Compare line text directories (WIP)
Some checks reported errors
continuous-integration/drone/push Build encountered an error
Some checks reported errors
continuous-integration/drone/push Build encountered an error
This commit is contained in:
parent
a018006f98
commit
dbb660615a
2 changed files with 11 additions and 7 deletions
|
@@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
||||||
n_characters = None
|
n_characters = None
|
||||||
char_diff_report = ""
|
char_diff_report = ""
|
||||||
|
|
||||||
for gt in os.listdir(gt_dir):
|
for k, gt in enumerate(os.listdir(gt_dir)):
|
||||||
# Find a match by replacing the suffix
|
# Find a match by replacing the suffix
|
||||||
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
|
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
|
||||||
|
|
||||||
gt_text = plain_extract(os.path.join(gt_dir, gt))
|
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
|
||||||
ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
|
ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
|
||||||
|
|
||||||
# Compute CER
|
# Compute CER
|
||||||
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
|
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||||
|
@@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
||||||
wer = 9999; n_words = 0
|
wer = 9999; n_words = 0
|
||||||
|
|
||||||
char_diff_report += gen_diff_report(
|
char_diff_report += gen_diff_report(
|
||||||
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
|
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO
|
# TODO
|
||||||
|
|
|
@@ -1,8 +1,9 @@
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
from typing import Iterator
|
from typing import Iterator
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
import sys
|
|
||||||
|
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
|
@@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
|
||||||
return page_extract(tree, textequiv_level=textequiv_level).text
|
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||||
|
|
||||||
|
|
||||||
def plain_extract(filename):
|
def plain_extract(filename, include_filename_in_id=False):
|
||||||
|
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
[
|
[
|
||||||
ExtractedText("line %d" % no, None, None, normalize_sbb(line))
|
ExtractedText(
|
||||||
|
id_template.format(filename=os.path.basename(filename), no=no),
|
||||||
|
None, None, normalize_sbb(line))
|
||||||
for no, line in enumerate(f.readlines())
|
for no, line in enumerate(f.readlines())
|
||||||
],
|
],
|
||||||
"\n",
|
"\n",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue