✅ Fix tests by 1. binarizing and 2. using the GT4HistOCR model

2026-07-21 18:19:10 +02:00 · 2019-12-05 13:02:20 +01:00 · 2019-12-05 13:02:20 +01:00 · 99d04ddccb
commit 99d04ddccb
parent 2aff9d8a48
4 changed files with 27 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -106,5 +106,6 @@ venv.bak/

 /calamari
 /calamari_models
+/gt4histocr-calamari
 /repo
 /test/assets
--- a/11
+++ b/11
@ -12,6 +12,7 @@ help:
 	@echo "    install          Install ocrd_calamari"
 	@echo "    calamari         Clone calamari repo"
 	@echo "    calamari_models  Clone calamari_models repo"
+	@echo "    gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)"
 	@echo "    calamari/build   pip install calamari"
 	@echo "    deps-test        Install testing python deps via pip"
 	@echo "    repo/assets      Clone OCR-D/assets to ./repo/assets"
@ -42,6 +43,14 @@ calamari_models:
 	# Checkout latest version that works with calamari-ocr==0.3.5:
 	cd calamari_models && git checkout f76b1d3ec

+gt4histocr-calamari:
+	mkdir gt4histocr-calamari
+	cd gt4histocr-calamari && \
+	wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \
+	tar xfv model.tar.xz && \
+	rm model.tar.xz
+
+

 # pip install calamari
 calamari/build: calamari calamari_models
@ -73,7 +82,7 @@ assets-clean:
 	rm -rf test/assets

 # Run unit tests
-test: test/assets calamari_models
+test: test/assets gt4histocr-calamari
 	# declare -p HTTP_PROXY
 	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)

--- a/README.md
+++ b/README.md
@ -39,10 +39,8 @@ unzip master.zip
 Download models trained on GT4HistOCR data:

 ```
-wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz
-mkdir gt4hist-calamari
-cd gt4hist-calamari
-tar xf ../model.tar.xz
+make gt4histocr-calamari
+ls gt4histocr-calamari
 ```

 ## Example Usage
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@ -1,5 +1,6 @@
 import os
 import shutil
+import subprocess
 import urllib.request

 from test.base import TestCase, main, assets, skip
@ -31,6 +32,18 @@ class TestCalamariRecognize(TestCase):
                    "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
                    os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))

+        # The binarization options I have are:
+        #
+        # a. ocrd_kraken which tries to install clstm, whose installation is broken on my machine (protobuf)
+        # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
+        #    machine
+        # c. just fumble with the original files
+        #
+        # So I'm going for option c.
+        for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
+            ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
+            subprocess.call(['convert', ff, '-colorspace', 'Gray', ff])
+
        # XXX Should remove GT text to really test this

        CalamariRecognize(
@ -38,7 +51,7 @@ class TestCalamariRecognize(TestCase):
            input_file_grp="OCR-D-GT-SEG-LINE",
            output_file_grp="OCR-D-OCR-CALAMARI",
            parameter={
-                'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json')
+                'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
            }
        ).process()
        workspace.save_mets()