Konstantin Baierer 1 week ago committed by GitHub
commit 139b3a1cc6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git

@ -6,17 +6,33 @@ LABEL \
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-ref=$VCS_REF \
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
org.label-schema.build-date=$BUILD_DATE org.label-schema.build-date=$BUILD_DATE \
org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
org.opencontainers.image.title="dinglehopper" \
org.opencontainers.image.description="The OCR evaluation tool" \
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
org.opencontainers.image.revision=$VCS_REF \
org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.base.name=ocrd/core
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
WORKDIR /build/dinglehopper WORKDIR /build/dinglehopper
COPY pyproject.toml . COPY . .
COPY src/dinglehopper/ocrd-tool.json . COPY ocrd-tool.json .
COPY src ./src # prepackage ocrd-tool.json as ocrd-all-tool.json
COPY requirements.txt . RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
COPY README.md . RUN make install && rm -rf /build/dinglehopper
COPY Makefile .
RUN make install
RUN rm -rf /build/dinglehopper
WORKDIR /data WORKDIR /data
VOLUME ["/data"] VOLUME /data

@ -1,8 +1,9 @@
PYTHON = python3 PYTHON = python3
PIP = pip3 PIP = pip3
PYTHONIOENCODING=utf8 PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
DOCKER_TAG = ocrd/dinglehopper DOCKER_TAG = ocrd/dinglehopper
help: help:
@ -16,6 +17,12 @@ help:
install: install:
$(PIP) install . $(PIP) install .
install-dev:
$(PIP) install -e .
test:
pytest $(PYTEST_ARGS)
docker: docker:
docker build \ docker build \
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \

@ -1,14 +1,16 @@
click click
jinja2 jinja2
lxml lxml
uniseg >= 0.8.0 # FIXME
uniseg == 0.8.1
numpy numpy
colorama colorama
MarkupSafe MarkupSafe
ocrd >= 2.65.0 ocrd >= 3.3.0
attrs attrs
multimethod >= 1.3 multimethod >= 1.3
tqdm tqdm
rapidfuzz >= 2.7.0 # FIXME
rapidfuzz >= 2.7.0, < 3
chardet chardet
importlib_resources importlib_resources

@ -20,14 +20,7 @@ def character_error_rate_n(
:return: character error rate and length of the reference :return: character error rate and length of the reference
""" """
d = distance(reference, compared) return distance(reference, compared), len(reference)
n = len(reference)
if d == 0:
return 0, n
if n == 0:
return float("inf"), n
return d / n, n
# XXX Should we really count newlines here? # XXX Should we really count newlines here?

@ -9,18 +9,18 @@ from .extracted_text import ExtractedText
@multimethod @multimethod
def distance(seq1: List[str], seq2: List[str]) -> int: def distance(seq1: List[str], seq2: List[str]) -> float:
"""Compute the Levenshtein edit distance between two lists of grapheme clusters. """Compute the Levenshtein edit distance between two lists of grapheme clusters.
This assumes that the grapheme clusters are already normalized. This assumes that the grapheme clusters are already normalized.
Use distance(str, str) instead if you need to compare two Unicode strings. Use distance(str, str) instead if you need to compare two Unicode strings.
""" """
return Levenshtein.distance(seq1, seq2) return Levenshtein.normalized_distance(seq1, seq2)
@distance.register @distance.register
def _(s1: str, s2: str) -> int: def _(s1: str, s2: str) -> float:
"""Compute the Levenshtein edit distance between two Unicode strings """Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode Note that this is different from levenshtein() as this function knows about Unicode
@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int:
""" """
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
return Levenshtein.distance(seq1, seq2) return Levenshtein.normalized_distance(seq1, seq2)
@distance.register @distance.register
def _(s1: ExtractedText, s2: ExtractedText) -> int: def _(s1: ExtractedText, s2: ExtractedText) -> float:
return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters) return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters)
def editops(word1, word2): def editops(word1, word2):

@ -1,17 +1,13 @@
{ {
"version": "0.9.7", "version": "0.9.7",
"git_url": "https://github.com/qurator-spk/dinglehopper", "git_url": "https://github.com/qurator-spk/dinglehopper",
"dockerhub": "ocrd/dinglehopper",
"tools": { "tools": {
"ocrd-dinglehopper": { "ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper", "executable": "ocrd-dinglehopper",
"input_file_grp_cardinality": 2,
"output_file_grp_cardinality": 1,
"description": "Evaluate OCR text against ground truth with dinglehopper", "description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"categories": [ "categories": [
"Quality assurance" "Quality assurance"
], ],

@ -1,53 +1,43 @@
import json from functools import cached_property
import os import os
from typing import Optional
import click import click
import importlib_resources from ocrd_models import OcrdFileType
from ocrd import Processor from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id from ocrd_utils import make_file_id
from .cli import process as cli_process from .cli import process as cli_process
OCRD_TOOL = json.loads(
importlib_resources.files(__name__)
.joinpath("ocrd-tool.json")
.read_text(encoding="utf-8", errors="strict")
)
@click.command() @click.command()
@ocrd_cli_options @ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs): def ocrd_dinglehopper(*args, **kwargs):
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
class OcrdDinglehopperEvaluate(Processor): class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs):
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
kwargs["version"] = OCRD_TOOL["version"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self): @cached_property
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") def executable(self):
assert_file_grp_cardinality(self.output_file_grp, 1) return 'ocrd-dinglehopper'
log = getLogger("processor.OcrdDinglehopperEvaluate") def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
assert self.parameter
metrics = self.parameter["metrics"] metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"] textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",")
input_file_tuples = self.zip_input_files(on_error="abort") try:
for n, (gt_file, ocr_file) in enumerate(input_file_tuples): gt_file, ocr_file = input_files
if not gt_file or not ocr_file: assert gt_file, 'missing GT file'
# file/page was not found in this group assert ocr_file, 'missing OCR file'
continue assert gt_file.local_filename
gt_file = self.workspace.download_file(gt_file) assert ocr_file.local_filename
ocr_file = self.workspace.download_file(ocr_file) except (ValueError, AssertionError) as err:
page_id = gt_file.pageId self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
return
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file) page_id = gt_file.pageId
file_id = make_file_id(ocr_file, self.output_file_grp) file_id = make_file_id(ocr_file, self.output_file_grp)
report_prefix = os.path.join(self.output_file_grp, file_id) report_prefix = os.path.join(self.output_file_grp, file_id)

@ -14,9 +14,9 @@ def test_character_error_rate():
assert character_error_rate("Foo", "") == 3 / 3 assert character_error_rate("Foo", "") == 3 / 3
assert character_error_rate("", "") == 0 assert character_error_rate("", "") == 0
assert math.isinf(character_error_rate("", "Foo")) assert character_error_rate("", "Foo") == 3 / 3
assert character_error_rate("Foo", "Food") == 1 / 3 assert character_error_rate("Foo", "Food") == 1 / 4
assert character_error_rate("Fnord", "Food") == 2 / 5 assert character_error_rate("Fnord", "Food") == 2 / 5
assert character_error_rate("Müll", "Mull") == 1 / 4 assert character_error_rate("Müll", "Mull") == 1 / 4
assert character_error_rate("Abstand", "Sand") == 4 / 7 assert character_error_rate("Abstand", "Sand") == 4 / 7

@ -6,8 +6,8 @@ from .. import distance
def test_distance(): def test_distance():
assert distance("Fnord", "Food") == 2 assert distance("Fnord", "Food") == 2 / 5
assert distance("Müll", "Mull") == 1 assert distance("Müll", "Mull") == 1 / 4
word1 = unicodedata.normalize("NFC", "Schlyñ") word1 = unicodedata.normalize("NFC", "Schlyñ")
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
@ -21,4 +21,4 @@ def test_distance():
assert ( assert (
len(word2) == 7 len(word2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1 assert distance(word1, word2) == 1 / 6

@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
) )
) )
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified

@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
with working_directory(tmp_path): with working_directory(tmp_path):
with open("gt.txt", "w") as gtf: with open("gt.txt", "w") as gtf:
gtf.write("") # Empty to yield CER == inf gtf.write("")
with open("ocr.txt", "w") as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write("Not important") ocrf.write("Not important")
process("gt.txt", "ocr.txt", "report") process("gt.txt", "ocr.txt", "report")
with open("report.json", "r") as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j["cer"] == pytest.approx(float("inf")) assert j["cer"] == pytest.approx(1.0)

@ -17,7 +17,7 @@ def test_distance_between_page_files():
# → 2 differences # → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert distance(gt, ocr) == 2 assert distance(gt, ocr) == 2 / 827
@pytest.mark.integration @pytest.mark.integration
@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
) )
) )
assert distance(gt, ocr) == 8 # Manually verified assert distance(gt, ocr) == 8 / 594 # Manually verified

@ -12,9 +12,9 @@ from .util import working_directory
@pytest.mark.parametrize( @pytest.mark.parametrize(
"gt_file_content,ocr_file_content,cer_expected", "gt_file_content,ocr_file_content,cer_expected",
[ [
("", "Lorem ipsum", math.inf), ("", "Lorem ipsum", 1.0),
("Lorem ipsum", "", 1.0), ("Lorem ipsum", "", 1.0),
("\ufeff", "Lorem ipsum", math.inf), ("\ufeff", "Lorem ipsum", 1.0),
("Lorem ipsum", "\ufeff", 1.0), ("Lorem ipsum", "\ufeff", 1.0),
("", "", 0.0), ("", "", 0.0),
("\ufeff", "", 0.0), ("\ufeff", "", 0.0),

@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
) )
assert ( assert (
word_error_rate(gt, ocr) == 7 / gt_word_count word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
) # Manually verified, 6 words are wrong, 1 got split (=2 errors) ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

@ -76,7 +76,7 @@ def test_word_error_rate():
) )
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
assert word_error_rate("", "") == 0 assert word_error_rate("", "") == 0
assert ( assert (

@ -96,15 +96,10 @@ def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
reference_seq = list(reference) reference_seq = list(reference)
compared_seq = list(compared) compared_seq = list(compared)
d = Levenshtein.distance(reference_seq, compared_seq) d = Levenshtein.normalized_distance(reference_seq, compared_seq)
n = len(reference_seq) n = len(reference_seq)
if d == 0: return d, n
return 0, n
if n == 0:
return float("inf"), n
return d / n, n
def word_error_rate(reference: T, compared: T) -> float: def word_error_rate(reference: T, compared: T) -> float:
wer: float wer: float

Loading…
Cancel
Save