ocrd_calamari (mirror of https://github.com/mikegerber/ocrd_calamari.git)

Merge pull request #70 from bertsky/patch-2
add checkpoint_dir content-type, remove checkpoint variant

Commit 1eb342ef65: 6 changed files with 42 additions and 78 deletions
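The substance of the change: model selection now goes only through the checkpoint_dir parameter, a directory of Calamari *.ckpt.json checkpoints that is resolved like any other OCR-D resource, and the separate checkpoint glob parameter is dropped. A minimal sketch of the resulting loading path (the calamari_ocr import path and the local model directory name are assumptions, not part of this diff):

# Hedged sketch, not the processor code itself: load every checkpoint found in a
# model directory, as the new setup code in recognize.py does further below.
from glob import glob
from calamari_ocr.ocr import MultiPredictor  # import path assumed for Calamari 1.x

checkpoint_dir = "qurator-gt4histocr-1.0"    # an already-resolved model directory (assumed local)
checkpoints = glob("%s/*.ckpt.json" % checkpoint_dir)
predictor = MultiPredictor(checkpoints=checkpoints)  # one predictor per checkpoint; outputs are voted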
@@ -14,8 +14,7 @@ jobs:
       - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
       - checkout
       - run: pip3 install --upgrade pip
-      - run: make install PIP_INSTALL="pip3 install"
-      - run: pip3 install -r requirements-test.txt
+      - run: make install deps-test PIP_INSTALL="pip3 install"
       - run: make coverage LC_ALL=en_US.utf8
       - codecov/upload

Makefile (27 lines changed)
@@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install
 GIT_CLONE = git clone
 PYTHON = python3
 PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning'
+MODEL = qurator-gt4histocr-1.0

 # BEGIN-EVAL makefile-parser --make-help Makefile

@@ -11,7 +12,7 @@ help:
 	@echo " Targets"
 	@echo ""
 	@echo " install Install ocrd_calamari"
-	@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
+	@echo " $(MODEL) Get Calamari model (from SBB)"
 	@echo " actevedef_718448162 Download example data"
 	@echo " deps-test Install testing python deps via pip"
 	@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@@ -25,6 +26,7 @@ help:
 	@echo " PYTHON '$(PYTHON)'"
 	@echo " PIP_INSTALL '$(PIP_INSTALL)'"
 	@echo " GIT_CLONE '$(GIT_CLONE)'"
+	@echo " MODEL '$(MODEL)'"

 # END-EVAL

@@ -34,17 +36,14 @@ install:


 # Get GT4HistOCR Calamari model (from SBB)
-gt4histocr-calamari1:
-	mkdir -p gt4histocr-calamari1
-	cd gt4histocr-calamari1 && \
-	wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
-	tar xfv model.tar.xz && \
-	rm model.tar.xz
+$(MODEL):
+	ocrd resmgr download ocrd-calamari-recognize $@

-# Download example data
+# Download example data (not used currently)
 actevedef_718448162:
-	wget https://qurator-data.de/examples/actevedef_718448162.zip && \
-	unzip actevedef_718448162.zip
+	wget https://qurator-data.de/examples/actevedef_718448162.zip \
+	&& unzip actevedef_718448162.zip \
+	&& rm actevedef_718448162.zip


@@ -54,7 +53,7 @@ actevedef_718448162:

 # Install testing python deps via pip
 deps-test:
-	$(PIP) install -r requirements_test.txt
+	$(PIP_INSTALL) -r requirements-test.txt


 # Clone OCR-D/assets to ./repo/assets
@@ -73,15 +72,15 @@ assets-clean:
 	rm -rf test/assets

 # Run unit tests
-test: test/assets gt4histocr-calamari1
+test: test/assets $(MODEL)
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

 # Run unit tests and determine test coverage
-coverage: test/assets gt4histocr-calamari1
+coverage: test/assets $(MODEL)
 	coverage erase
 	make test PYTHON="coverage run"
 	coverage report
 	coverage html

-.PHONY: assets-clean test
+.PHONY: install assets-clean deps-test test coverage $(MODEL)

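The $(MODEL) target now delegates model fetching to the OCR-D resource manager instead of wget and tar. Roughly the same step as a hedged Python sketch (the command line is taken verbatim from the recipe above; wrapping it in subprocess is only for illustration):

# Download the default model the same way "make $(MODEL)" now does.
import subprocess

model = "qurator-gt4histocr-1.0"  # value of MODEL above
subprocess.run(["ocrd", "resmgr", "download", "ocrd-calamari-recognize", model], check=True)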
@@ -20,11 +20,11 @@
   "parameters": {
     "checkpoint_dir": {
       "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
-      "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
-    },
-    "checkpoint": {
-      "description": "The calamari model files (*.ckpt.json)",
-      "type": "string", "format": "file", "cacheable": true
+      "type": "string",
+      "format": "uri",
+      "content-type": "text/directory",
+      "cacheable": true,
+      "default": "qurator-gt4histocr-1.0"
     },
     "voter": {
       "description": "The voting algorithm to use",

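With checkpoint and checkpoint_dir folded into a single parameter, a caller only passes the model directory (or its resource name), mirroring the updated tests further below. A hedged sketch, assuming the import path and an already prepared workspace:

# Hypothetical call site; "workspace" is an ocrd.Workspace set up elsewhere.
from ocrd_calamari import CalamariRecognize  # import path assumed

CalamariRecognize(
    workspace,
    input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
    output_file_grp="OCR-D-OCR-CALAMARI",
    parameter={"checkpoint_dir": "qurator-gt4histocr-1.0"},  # resource name or directory path
).process()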
@@ -47,10 +47,8 @@ class CalamariRecognize(Processor):
         """
         Set up the model prior to processing.
         """
-        if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
-            resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
-            self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
-        checkpoints = glob(self.parameter['checkpoint'])
+        resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
+        checkpoints = glob('%s/*.ckpt.json' % resolved)
         self.predictor = MultiPredictor(checkpoints=checkpoints)

         self.network_input_channels = self.predictor.predictors[0].network.input_channels

@@ -244,18 +242,7 @@ class CalamariRecognize(Processor):

             # Add metadata about this operation and its runtime parameters:
-            metadata = pcgts.get_Metadata()  # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name, value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
+            self.add_metadata(pcgts)

             file_id = make_file_id(input_file, self.output_file_grp)
             pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(

@@ -4,6 +4,9 @@ import os
 import sys

 from test.assets import assets
+from ocrd_utils import initLogging

 PWD = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(PWD + '/../ocrd')
+
+initLogging()

@@ -14,8 +14,7 @@ from .base import assets

 METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
 WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
-CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
-CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
+CHECKPOINT_DIR = os.getenv('MODEL')

 # Because XML namespace versions are so much fun, we not only use one, we use TWO!
 NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
@@ -31,14 +30,6 @@ def workspace():
     resolver = Resolver()
     workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)

-    # XXX Work around data bug(?):
-    # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
-    os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        urllib.request.urlretrieve(
-            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
-            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
-
     # The binarization options I have are:
     #
     # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
@@ -47,21 +38,22 @@ def workspace():
     # c. just fumble with the original files
     #
     # So I'm going for option c.
-    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
-        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
-        subprocess.call(['convert', ff, '-threshold', '50%', ff])
+    for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
+        imgf = workspace.download_file(imgf)
+        path = os.path.join(workspace.directory, imgf.local_filename)
+        subprocess.call(['mogrify', '-threshold', '50%', path])

     # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
     # XXX Review data again
     # XXX Make this more robust against namespace version changes
-    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
+    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
         workspace.download_file(of)
+        path = os.path.join(workspace.directory, of.local_filename)
+        tree = etree.parse(path)
         for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
-            for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
-                tree = etree.parse(ff)
             for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
                 e.getparent().remove(e)
-            tree.write(ff, xml_declaration=True, encoding="utf-8")
+        tree.write(path, xml_declaration=True, encoding="utf-8")

     return workspace

@@ -69,23 +61,7 @@ def workspace():
 def test_recognize(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
-        output_file_grp="OCR-D-OCR-CALAMARI",
-        parameter={
-            "checkpoint": CHECKPOINT,
-        }
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
-    assert os.path.exists(page1)
-    with open(page1, "r", encoding="utf-8") as f:
-        assert "verſchuldeten" in f.read()
-
-def test_recognize_with_checkpoint_dir(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
@@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
     caplog.set_level(logging.WARNING)
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
-        parameter={'checkpoint': CHECKPOINT}
+        parameter={'checkpoint_dir': CHECKPOINT_DIR}
     ).process()

     interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
@@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
 def test_word_segmentation(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "word",  # Note that we're going down to word level here
         }
     ).process()
@@ -147,10 +123,10 @@ def test_word_segmentation(workspace):
 def test_glyphs(workspace):
     CalamariRecognize(
         workspace,
-        input_file_grp="OCR-D-GT-SEG-LINE",
+        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
-            "checkpoint": CHECKPOINT,
+            "checkpoint_dir": CHECKPOINT_DIR,
             "textequiv_level": "glyph",  # Note that we're going down to glyph level here
         }
     ).process()
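Since CHECKPOINT_DIR now comes from the MODEL environment variable, the tests need MODEL set; normally "make test" takes care of that. A hedged sketch of driving the suite directly from Python (the model name and pytest flags mirror the Makefile; running pytest in-process is just one way to do it):

# Run the updated tests without make; assumes the model was fetched beforehand,
# e.g. via "ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0".
import os
import pytest

os.environ["MODEL"] = "qurator-gt4histocr-1.0"   # read by CHECKPOINT_DIR = os.getenv('MODEL')
pytest.main(["-W", "ignore::DeprecationWarning", "-W", "ignore::FutureWarning",
             "--continue-on-collection-errors", "test"])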