diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a8312db --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +src/dinglehopper/tests +dist +build +*.egg-info +.git diff --git a/Dockerfile b/Dockerfile index 04e7330..75dfcdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,17 +6,33 @@ LABEL \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="dinglehopper" \ + org.opencontainers.image.description="The OCR evaluation tool" \ + org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ + org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper -COPY pyproject.toml . -COPY src/dinglehopper/ocrd-tool.json . -COPY src ./src -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/dinglehopper +COPY . . +COPY ocrd-tool.json . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +RUN make install && rm -rf /build/dinglehopper WORKDIR /data -VOLUME ["/data"] +VOLUME /data diff --git a/Makefile b/Makefile index babaf5f..2a4b13c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS = -vv -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/dinglehopper help: @@ -16,6 +17,12 @@ help: install: $(PIP) install . +install-dev: + $(PIP) install -e . + +test: + pytest $(PYTEST_ARGS) + docker: docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ diff --git a/requirements.txt b/requirements.txt index 6741fa2..a57ef86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,16 @@ click jinja2 lxml -uniseg >= 0.8.0 +# FIXME +uniseg == 0.8.1 numpy colorama MarkupSafe -ocrd >= 2.65.0 +ocrd >= 3.3.0 attrs multimethod >= 1.3 tqdm -rapidfuzz >= 2.7.0 +# FIXME +rapidfuzz >= 2.7.0, < 3 chardet importlib_resources diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py index 88a88f8..04e4bfe 100644 --- a/src/dinglehopper/character_error_rate.py +++ b/src/dinglehopper/character_error_rate.py @@ -20,14 +20,7 @@ def character_error_rate_n( :return: character error rate and length of the reference """ - d = distance(reference, compared) - n = len(reference) - - if d == 0: - return 0, n - if n == 0: - return float("inf"), n - return d / n, n + return distance(reference, compared), len(reference) # XXX Should we really count newlines here? diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py index ec564ae..988849c 100644 --- a/src/dinglehopper/edit_distance.py +++ b/src/dinglehopper/edit_distance.py @@ -9,18 +9,18 @@ from .extracted_text import ExtractedText @multimethod -def distance(seq1: List[str], seq2: List[str]) -> int: +def distance(seq1: List[str], seq2: List[str]) -> float: """Compute the Levenshtein edit distance between two lists of grapheme clusters. This assumes that the grapheme clusters are already normalized. Use distance(str, str) instead if you need to compare two Unicode strings. """ - return Levenshtein.distance(seq1, seq2) + return Levenshtein.normalized_distance(seq1, seq2) @distance.register -def _(s1: str, s2: str) -> int: +def _(s1: str, s2: str) -> float: """Compute the Levenshtein edit distance between two Unicode strings Note that this is different from levenshtein() as this function knows about Unicode @@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int: """ seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) - return Levenshtein.distance(seq1, seq2) + return Levenshtein.normalized_distance(seq1, seq2) @distance.register -def _(s1: ExtractedText, s2: ExtractedText) -> int: - return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters) +def _(s1: ExtractedText, s2: ExtractedText) -> float: + return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters) def editops(word1, word2): diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f4572c7..00d5d2b 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,17 +1,13 @@ { "version": "0.9.7", "git_url": "https://github.com/qurator-spk/dinglehopper", + "dockerhub": "ocrd/dinglehopper", "tools": { "ocrd-dinglehopper": { "executable": "ocrd-dinglehopper", + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "description": "Evaluate OCR text against ground truth with dinglehopper", - "input_file_grp": [ - "OCR-D-GT-PAGE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-OCR-EVAL" - ], "categories": [ "Quality assurance" ], diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 4da4960..9696ff9 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,83 +1,73 @@ -import json +from functools import cached_property import os +from typing import Optional import click -import importlib_resources +from ocrd_models import OcrdFileType from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id +from ocrd_utils import make_file_id from .cli import process as cli_process -OCRD_TOOL = json.loads( - importlib_resources.files(__name__) - .joinpath("ocrd-tool.json") - .read_text(encoding="utf-8", errors="strict") -) - - @click.command() @ocrd_cli_options def ocrd_dinglehopper(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) - class OcrdDinglehopperEvaluate(Processor): - def __init__(self, *args, **kwargs): - kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] - kwargs["version"] = OCRD_TOOL["version"] - super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) - def process(self): - assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") - assert_file_grp_cardinality(self.output_file_grp, 1) + @cached_property + def executable(self): + return 'ocrd-dinglehopper' - log = getLogger("processor.OcrdDinglehopperEvaluate") + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - gt_grp, ocr_grp = self.input_file_grp.split(",") - - input_file_tuples = self.zip_input_files(on_error="abort") - for n, (gt_file, ocr_file) in enumerate(input_file_tuples): - if not gt_file or not ocr_file: - # file/page was not found in this group - continue - gt_file = self.workspace.download_file(gt_file) - ocr_file = self.workspace.download_file(ocr_file) - page_id = gt_file.pageId - log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) - - file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass - cli_process( - gt_file.local_filename, - ocr_file.local_filename, - report_prefix, - metrics=metrics, - textequiv_level=textequiv_level, + try: + gt_file, ocr_file = input_files + assert gt_file, 'missing GT file' + assert ocr_file, 'missing OCR file' + assert gt_file.local_filename + assert ocr_file.local_filename + except (ValueError, AssertionError) as err: + self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? + return + + page_id = gt_file.pageId + + file_id = make_file_id(ocr_file, self.output_file_grp) + report_prefix = os.path.join(self.output_file_grp, file_id) + + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process( + gt_file.local_filename, + ocr_file.local_filename, + report_prefix, + metrics=metrics, + textequiv_level=textequiv_level, + ) + + # Add reports to the workspace + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: + self.workspace.add_file( + file_id=file_id + report_suffix, + file_grp=self.output_file_grp, + page_id=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix, ) - # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: - self.workspace.add_file( - file_id=file_id + report_suffix, - file_grp=self.output_file_grp, - page_id=page_id, - mimetype=mimetype, - local_filename=report_prefix + report_suffix, - ) - if __name__ == "__main__": ocrd_dinglehopper() diff --git a/src/dinglehopper/tests/test_character_error_rate.py b/src/dinglehopper/tests/test_character_error_rate.py index 970f740..63d2f72 100644 --- a/src/dinglehopper/tests/test_character_error_rate.py +++ b/src/dinglehopper/tests/test_character_error_rate.py @@ -14,9 +14,9 @@ def test_character_error_rate(): assert character_error_rate("Foo", "") == 3 / 3 assert character_error_rate("", "") == 0 - assert math.isinf(character_error_rate("", "Foo")) + assert character_error_rate("", "Foo") == 3 / 3 - assert character_error_rate("Foo", "Food") == 1 / 3 + assert character_error_rate("Foo", "Food") == 1 / 4 assert character_error_rate("Fnord", "Food") == 2 / 5 assert character_error_rate("Müll", "Mull") == 1 / 4 assert character_error_rate("Abstand", "Sand") == 4 / 7 diff --git a/src/dinglehopper/tests/test_edit_distance.py b/src/dinglehopper/tests/test_edit_distance.py index be427a8..e782ca6 100644 --- a/src/dinglehopper/tests/test_edit_distance.py +++ b/src/dinglehopper/tests/test_edit_distance.py @@ -6,8 +6,8 @@ from .. import distance def test_distance(): - assert distance("Fnord", "Food") == 2 - assert distance("Müll", "Mull") == 1 + assert distance("Fnord", "Food") == 2 / 5 + assert distance("Müll", "Mull") == 1 / 4 word1 = unicodedata.normalize("NFC", "Schlyñ") word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! @@ -21,4 +21,4 @@ def test_distance(): assert ( len(word2) == 7 ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points - assert distance(word1, word2) == 1 + assert distance(word1, word2) == 1 / 6 diff --git a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py index 7755e2d..b3a5914 100644 --- a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2(): ) ) - assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified + assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_json.py index 6cbfa0c..a993ee7 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_json.py @@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path): with working_directory(tmp_path): with open("gt.txt", "w") as gtf: - gtf.write("") # Empty to yield CER == inf + gtf.write("") with open("ocr.txt", "w") as ocrf: ocrf.write("Not important") process("gt.txt", "ocr.txt", "report") with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j["cer"] == pytest.approx(float("inf")) + assert j["cer"] == pytest.approx(1.0) diff --git a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py index e01ac76..b032e22 100644 --- a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -17,7 +17,7 @@ def test_distance_between_page_files(): # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) - assert distance(gt, ocr) == 2 + assert distance(gt, ocr) == 2 / 827 @pytest.mark.integration @@ -52,4 +52,4 @@ def test_distance_between_page_alto_2(): ) ) - assert distance(gt, ocr) == 8 # Manually verified + assert distance(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_empty_files.py b/src/dinglehopper/tests/test_integ_empty_files.py index 5c90ed1..2b29513 100644 --- a/src/dinglehopper/tests/test_integ_empty_files.py +++ b/src/dinglehopper/tests/test_integ_empty_files.py @@ -12,9 +12,9 @@ from .util import working_directory @pytest.mark.parametrize( "gt_file_content,ocr_file_content,cer_expected", [ - ("", "Lorem ipsum", math.inf), + ("", "Lorem ipsum", 1.0), ("Lorem ipsum", "", 1.0), - ("\ufeff", "Lorem ipsum", math.inf), + ("\ufeff", "Lorem ipsum", 1.0), ("Lorem ipsum", "\ufeff", 1.0), ("", "", 0.0), ("\ufeff", "", 0.0), diff --git a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 8a57ed2..f114685 100644 --- a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2(): ) assert ( - word_error_rate(gt, ocr) == 7 / gt_word_count + word_error_rate(gt, ocr) == 7 / (gt_word_count + 1) ) # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/src/dinglehopper/tests/test_word_error_rate.py b/src/dinglehopper/tests/test_word_error_rate.py index 311ffff..245fa74 100644 --- a/src/dinglehopper/tests/test_word_error_rate.py +++ b/src/dinglehopper/tests/test_word_error_rate.py @@ -76,7 +76,7 @@ def test_word_error_rate(): ) assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 - assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) + assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4 assert word_error_rate("", "") == 0 assert ( diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index 578850f..3b12623 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -96,15 +96,10 @@ def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]: reference_seq = list(reference) compared_seq = list(compared) - d = Levenshtein.distance(reference_seq, compared_seq) + d = Levenshtein.normalized_distance(reference_seq, compared_seq) n = len(reference_seq) - if d == 0: - return 0, n - if n == 0: - return float("inf"), n - return d / n, n - + return d, n def word_error_rate(reference: T, compared: T) -> float: wer: float