diff --git a/.gitignore b/.gitignore index d080392..42c4957 100644 --- a/.gitignore +++ b/.gitignore @@ -106,5 +106,6 @@ venv.bak/ /calamari /calamari_models +/gt4histocr-calamari /repo /test/assets diff --git a/Makefile b/Makefile index dc77d09..0508505 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ help: @echo " install Install ocrd_calamari" @echo " calamari Clone calamari repo" @echo " calamari_models Clone calamari_models repo" + @echo " gt4histocr-calamari Get GT4HistOCR Calamari model (from SBB)" @echo " calamari/build pip install calamari" @echo " deps-test Install testing python deps via pip" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @@ -42,6 +43,14 @@ calamari_models: # Checkout latest version that works with calamari-ocr==0.3.5: cd calamari_models && git checkout f76b1d3ec +gt4histocr-calamari: + mkdir gt4histocr-calamari + cd gt4histocr-calamari && \ + wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz && \ + tar xfv model.tar.xz && \ + rm model.tar.xz + + # pip install calamari calamari/build: calamari calamari_models @@ -73,7 +82,7 @@ assets-clean: rm -rf test/assets # Run unit tests -test: test/assets calamari_models +test: test/assets gt4histocr-calamari # declare -p HTTP_PROXY $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) diff --git a/README.md b/README.md index f335225..4d7dc96 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,8 @@ unzip master.zip Download models trained on GT4HistOCR data: ``` -wget https://file.spk-berlin.de:8443/calamari-models/GT4HistOCR/model.tar.xz -mkdir gt4hist-calamari -cd gt4hist-calamari -tar xf ../model.tar.xz +make gt4histocr-calamari +ls gt4histocr-calamari ``` ## Example Usage diff --git a/test/test_recognize.py b/test/test_recognize.py index 28ebcbd..5ee91bc 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -1,5 +1,6 @@ import os import shutil +import subprocess import urllib.request from test.base import TestCase, 
main, assets, skip @@ -31,6 +32,18 @@ class TestCalamariRecognize(TestCase): "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) + # The binarization options I have are: + # + # a. ocrd_kraken which tries to install clstm, whose installation is broken on my machine (protobuf) + # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my + # machine + # c. just fumble with the original files + # + # So I'm going for option c. + for f in ['INPUT_0017.tif', 'INPUT_0020.tif']: + ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) + subprocess.call(['convert', ff, '-colorspace', 'Gray', ff]) + # XXX Should remove GT text to really test this CalamariRecognize( @@ -38,7 +51,7 @@ class TestCalamariRecognize(TestCase): input_file_grp="OCR-D-GT-SEG-LINE", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - 'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json') + 'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json') } ).process() workspace.save_mets()