mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-30 22:19:57 +02:00
Merge branch 'feat/compare-line-texts'
Some checks reported errors
continuous-integration/drone/push Build encountered an error
Some checks reported errors
continuous-integration/drone/push Build encountered an error
This commit is contained in:
commit
195354c6d4
6 changed files with 194 additions and 20 deletions
|
@ -61,6 +61,15 @@ This generates `report.html` and `report.json`.
|
|||
|
||||

|
||||
|
||||
### dinglehopper-line-dirs
|
||||
You may also want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
|
||||
with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate
|
||||
command line interface:
|
||||
|
||||
~~~
|
||||
dinglehopper-line-dirs gt/ ocr/
|
||||
~~~
|
||||
|
||||
### dinglehopper-extract
|
||||
The tool `dinglehopper-extract` extracts the text of the given input file on
|
||||
stdout, for example:
|
||||
|
|
|
@ -84,6 +84,19 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
)
|
||||
|
||||
|
||||
def json_float(value):
    """Convert a float value to a JSON float token.

    This is here so that float('inf') yields "Infinity", not "inf". The same
    applies to float('-inf') ("-Infinity") and float('nan') ("NaN"), whose
    str() representations are not understood by JSON parsers that accept the
    JavaScript-style special values.

    Any other value is returned as str(value).
    """
    if value == float("inf"):
        return "Infinity"
    elif value == float("-inf"):
        return "-Infinity"
    elif value != value:
        # NaN is the only value that is not equal to itself. This check is
        # used instead of math.isnan() so that non-float values (e.g. None)
        # still fall through to str() instead of raising a TypeError.
        return "NaN"
    else:
        return str(value)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||
"""Check OCR result against GT.
|
||||
|
||||
|
@ -107,18 +120,6 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
|
||||
)
|
||||
|
||||
def json_float(value):
|
||||
"""Convert a float value to an JSON float.
|
||||
|
||||
This is here so that float('inf') yields "Infinity", not "inf".
|
||||
"""
|
||||
if value == float("inf"):
|
||||
return "Infinity"
|
||||
elif value == float("-inf"):
|
||||
return "-Infinity"
|
||||
else:
|
||||
return str(value)
|
||||
|
||||
env = Environment(
|
||||
loader=FileSystemLoader(
|
||||
os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
|
||||
|
|
142
qurator/dinglehopper/cli_line_dirs.py
Normal file
142
qurator/dinglehopper/cli_line_dirs.py
Normal file
|
@ -0,0 +1,142 @@
|
|||
import os
|
||||
import sys
|
||||
import itertools
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from markupsafe import escape
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from .character_error_rate import character_error_rate_n
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
from .align import seq_align
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import plain_extract
|
||||
from .config import Config
|
||||
from .cli import gen_diff_report, json_float
|
||||
|
||||
|
||||
def all_equal(iterable):
    """Return True if all items of the iterable are equal to each other.

    An empty iterable is considered to consist of equal items.
    """
    runs = itertools.groupby(iterable)
    # Skip the first run of equal items (if there is one) ...
    next(runs, None)
    # ... any further run means that at least one item differs.
    return next(runs, None) is None
|
||||
|
||||
|
||||
def common_prefix(its):
    """Return the longest common prefix of the given iterables as a list."""
    shared = []
    # Walk through the iterables in lockstep, position by position, and stop
    # at the first position where they disagree (or where the shortest ends).
    for column in zip(*its):
        if not all_equal(column):
            break
        shared.append(column[0])
    return shared
|
||||
|
||||
|
||||
def common_suffix(its):
    """Return the longest common suffix of the given sequences as a list.

    The suffix is the common prefix of the reversed sequences, reversed back
    into reading order. Every item of `its` must support reversed(), e.g. a
    string or a list.

    A list is returned (like common_prefix() does) rather than a one-shot
    iterator, so the result can be iterated more than once, measured with
    len() or tested for emptiness.
    """
    reversed_suffix = common_prefix(reversed(it) for it in its)
    return reversed_suffix[::-1]
|
||||
|
||||
|
||||
def removesuffix(text, suffix):
    """Return text without the given suffix.

    The text is returned unchanged if the suffix is empty or if the text does
    not end with it.
    """
    has_suffix = bool(suffix) and text.endswith(suffix)
    return text[: len(text) - len(suffix)] if has_suffix else text
|
||||
|
||||
|
||||
def _merge_error_rates(rate, n, l_rate, l_n):
    """Combine an accumulated error rate with the error rate of one more line.

    The result is the mean of both rates, weighted by their number of
    reference units (characters or words) `n` and `l_n`.
    """
    total = n + l_n
    if total == 0:
        # No reference units at all: the weighted mean is undefined and the
        # plain formula would raise a ZeroDivisionError. Keep the worse rate,
        # presumably 0 (no errors) or infinity (errors against an empty GT).
        return max(rate, l_rate)

    # Leave out terms with zero weight, so that an infinite rate of an empty
    # GT line does not turn the whole sum into float("inf") * 0 == nan.
    # NOTE(review): the errors of such a line cannot be represented in a
    # weighted mean and are therefore not reflected in the combined rate.
    weighted_sum = sum(r * weight for r, weight in ((rate, n), (l_rate, l_n)) if weight)
    return weighted_sum / total


def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    """Check the OCR line texts in ocr_dir against the GT line texts in gt_dir.

    Every file in gt_dir is paired with the file in ocr_dir that has the same
    name once the common suffix of the GT file names is replaced by the common
    suffix of the OCR file names. CER and WER are accumulated over all pairs
    (weighted by the number of GT characters resp. words), and the diff
    reports of all pairs are concatenated.

    The results are written to report_prefix + ".html" and
    report_prefix + ".json", rendered from the "report.html.j2" and
    "report.json.j2" templates. `metrics` is passed on to the templates.

    An error from opening a file (e.g. a GT file without an OCR counterpart)
    is not handled here and propagates to the caller.
    """
    # List the GT directory only once and sort it, so that the order of the
    # lines in the report does not depend on the arbitrary order of
    # os.listdir().
    gt_files = sorted(os.listdir(gt_dir))
    gt_suffix = "".join(common_suffix(gt_files))
    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))

    cer = None
    n_characters = None
    char_diff_report = ""
    wer = None
    n_words = None
    word_diff_report = ""

    for k, gt in enumerate(gt_files):
        # Find a match by replacing the suffix
        ocr = removesuffix(gt, gt_suffix) + ocr_suffix

        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
        ocr_text = plain_extract(
            os.path.join(ocr_dir, ocr), include_filename_in_id=True
        )

        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
        if cer is None:
            cer, n_characters = l_cer, l_n_characters
        else:
            # Rolling update
            cer = _merge_error_rates(cer, n_characters, l_cer, l_n_characters)
            n_characters = n_characters + l_n_characters

        # Compute WER
        l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
        if wer is None:
            wer, n_words = l_wer, l_n_words
        else:
            # Rolling update
            wer = _merge_error_rates(wer, n_words, l_wer, l_n_words)
            n_words = n_words + l_n_words

        # Generate diff reports, the CSS prefix keeps the ids of the
        # individual lines apart
        char_diff_report += gen_diff_report(
            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
        )
        gt_words = words_normalized(gt_text)
        ocr_words = words_normalized(ocr_text)
        word_diff_report += gen_diff_report(
            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
        )

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
        out_fn = report_prefix + report_suffix

        template = env.get_template(template_fn)
        template.stream(
            gt=gt_dir,  # Note: directory
            ocr=ocr_dir,  # Note: directory
            cer=cer,
            n_characters=n_characters,
            wer=wer,
            n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
        ).dump(out_fn)
|
||||
|
||||
|
||||
@click.command()
# GT and OCR have to be directories, so reject regular files right at the
# command line instead of failing later when the directory gets listed.
@click.argument("gt", type=click.Path(exists=True, file_okay=False))
@click.argument("ocr", type=click.Path(exists=True, file_okay=False))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains textfiles with a common
    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
    in the OCR lines directory.

    The GT and OCR directories are usually ground truth line texts and the results of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

    """
    # Note: the docstring above is what click shows as the --help text.
    initLogging()
    process(gt, ocr, report_prefix, metrics=metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,8 +1,9 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
from typing import Iterator
|
||||
from warnings import warn
|
||||
import sys
|
||||
|
||||
from lxml import etree as ET
|
||||
from lxml.etree import XMLSyntaxError
|
||||
|
@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
|
|||
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||
|
||||
|
||||
def plain_extract(filename):
|
||||
def plain_extract(filename, include_filename_in_id=False):
|
||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||
with open(filename, "r") as f:
|
||||
return ExtractedText(
|
||||
None,
|
||||
[
|
||||
ExtractedText("line %d" % no, None, None, normalize_sbb(line))
|
||||
ExtractedText(
|
||||
id_template.format(filename=os.path.basename(filename), no=no),
|
||||
None, None, normalize_sbb(line))
|
||||
for no, line in enumerate(f.readlines())
|
||||
],
|
||||
"\n",
|
||||
|
|
|
@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein
|
|||
from . import ExtractedText
|
||||
|
||||
|
||||
@multimethod
|
||||
def words(s: str):
|
||||
"""Extract words from a string"""
|
||||
# Did we patch uniseg.wordbreak.word_break already?
|
||||
word_break_patched = False
|
||||
|
||||
# Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
|
||||
# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
|
||||
|
||||
def patch_word_break():
|
||||
"""
|
||||
Patch uniseg.wordbreak.word_break to deal with our private use characters.
|
||||
|
||||
See also
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
|
||||
"""
|
||||
old_word_break = uniseg.wordbreak.word_break
|
||||
|
||||
def new_word_break(c, index=0):
|
||||
|
@ -25,6 +30,18 @@ def words(s: str):
|
|||
return old_word_break(c, index)
|
||||
|
||||
uniseg.wordbreak.word_break = new_word_break
|
||||
global word_break_patched
|
||||
word_break_patched = True
|
||||
|
||||
|
||||
@multimethod
|
||||
def words(s: str):
|
||||
"""Extract words from a string"""
|
||||
|
||||
global word_break_patched
|
||||
if not word_break_patched:
|
||||
patch_word_break()
|
||||
|
||||
|
||||
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
|
||||
def unwanted(c):
|
||||
|
|
1
setup.py
1
setup.py
|
@ -26,6 +26,7 @@ setup(
|
|||
entry_points={
|
||||
"console_scripts": [
|
||||
"dinglehopper=qurator.dinglehopper.cli:main",
|
||||
"dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
|
||||
"dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
|
||||
"ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
|
||||
]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue