Add CI, coverage configuration and a first unit test for ocrd_calamari.

- .circleci/config.yml: install the package, run `make coverage`, upload to codecov
- .coveragerc: measure branch coverage of the ocrd_calamari package
- Makefile: add install, asset and test/coverage targets; drop the docker target
- test/: recognition test against the OCR-D kant_aufklaerung_1784 asset

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..1c1c5b3
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,21 @@
+version: 2.1
+orbs:
+  codecov: codecov/codecov@1.0.5
+
+jobs:
+
+  build-python36:
+    docker:
+      - image: ubuntu:18.04
+    steps:
+      - run: apt-get update && apt-get install -y make git curl python3 python3-pip wget
+      - checkout
+      - run: make install PIP_INSTALL="pip3 install"
+      - run: pip3 install -r requirements-test.txt
+      - run: make coverage
+      - codecov/upload
+
+workflows:
+  build:
+    jobs:
+      - build-python36
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..3aca4eb
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,14 @@
+[run]
+branch = True
+source =
+    ocrd_calamari
+
+[report]
+exclude_lines =
+    if self.debug:
+    pragma: no cover
+    raise NotImplementedError
+    if __name__ == .__main__.:
+ignore_errors = True
+omit =
+    ocrd_calamari/cli.py
diff --git a/.gitignore b/.gitignore
index 1c2329e..d080392 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,7 +102,9 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+
 /calamari
 /calamari_models
 /repo
-/test
+/test/assets
diff --git a/Makefile b/Makefile
index 15a546c..9fabd2e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,11 @@
-GIT_CLONE = git clone --depth 1
+# '$(PYTHON)'
+PYTHON = python
 
-# Docker tag
-DOCKER_TAG = ocrd/calamari
+# '$(PIP_INSTALL)'
+PIP_INSTALL = pip install
+
+# '$(GIT_CLONE)'
+GIT_CLONE = git clone
 
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
@@ -9,31 +13,78 @@ help:
 	@echo ""
 	@echo "  Targets"
 	@echo ""
-	@echo "    calamari         git clone calamari"
-	@echo "    calamari_models  git clone calamari_models"
-	@echo "    calamari/build   Install calamari"
-	@echo "    docker           Build docker image"
+	@echo "    install          Install ocrd_calamari"
+	@echo "    calamari         Clone calamari repo"
+	@echo "    calamari_models  Clone calamari_models repo"
+	@echo "    calamari/build   pip install calamari"
+	@echo "    deps-test        Install testing python deps via pip"
+	@echo "    repo/assets      Clone OCR-D/assets to ./repo/assets"
+	@echo "    test/assets      Setup test assets"
+	@echo "    assets-clean     Remove test/assets"
+	@echo "    test             Run unit tests"
+	@echo "    coverage         Run unit tests and determine test coverage"
 	@echo ""
 	@echo "  Variables"
 	@echo ""
-	@echo "    DOCKER_TAG  Docker tag"
+	@echo "    PYTHON       '$(PYTHON)'"
+	@echo "    PIP_INSTALL  '$(PIP_INSTALL)'"
+	@echo "    GIT_CLONE    '$(GIT_CLONE)'"
 
 # END-EVAL
 
-# git clone calamari
+# Install ocrd_calamari
+install:
+	$(PIP_INSTALL) .
+
+# Clone calamari repo
 calamari:
 	$(GIT_CLONE) https://github.com/chwick/calamari
 
-# git clone calamari_models
+# Clone calamari_models repo
 calamari_models:
-	$(GIT_CLONE) https://github.com/chwick/calamari_models
+	$(GIT_CLONE) -n https://github.com/chwick/calamari_models
+	# Checkout latest version that works with calamari-ocr==0.3.5:
+	cd calamari_models && git checkout f76b1d3ec
+
 
-# Install calamari
+# pip install calamari
 calamari/build: calamari calamari_models
-	cd calamari &&\
-		pip install -r requirements.txt ;\
-		python setup.py install
+	cd calamari && $(PIP_INSTALL) .
+
+#
+# Assets and Tests
+#
+
+# Install testing python deps via pip
+deps-test:
+	$(PIP_INSTALL) -r requirements-test.txt
+
+
+# Clone OCR-D/assets to ./repo/assets
+repo/assets:
+	mkdir -p $(dir $@)
+	$(GIT_CLONE) https://github.com/OCR-D/assets "$@"
+
+
+# Setup test assets
+test/assets: repo/assets
+	mkdir -p $@
+	cp -r -t $@ repo/assets/data/*
+
+# Remove test/assets
+assets-clean:
+	rm -rf test/assets
+
+# Run unit tests
+test: test/assets calamari_models
+	# declare -p HTTP_PROXY
+	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
+
+# Run unit tests and determine test coverage
+coverage: test/assets calamari_models
+	coverage erase
+	$(MAKE) test PYTHON="coverage run"
+	coverage report
+	coverage html
 
-# Build docker image
-docker:
-	docker build -t '$(DOCKER_TAG)' .
+.PHONY: help install deps-test assets-clean test coverage
diff --git a/README.md b/README.md
index 5ae93e8..dca1ffc 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,10 @@
 # ocrd_calamari
 
-Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
+> Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
+
+[![image](https://circleci.com/gh/OCR-D/ocrd_calamari.svg?style=svg)](https://circleci.com/gh/OCR-D/ocrd_calamari)
+[![image](https://img.shields.io/pypi/v/ocrd_calamari.svg)](https://pypi.org/project/ocrd_calamari/)
+[![image](https://codecov.io/gh/OCR-D/ocrd_calamari/branch/master/graph/badge.svg)](https://codecov.io/gh/OCR-D/ocrd_calamari)
 
 ## Introduction
 
diff --git a/ocrd_calamari/__init__.py b/ocrd_calamari/__init__.py
index 683a3e2..f56b516 100644
--- a/ocrd_calamari/__init__.py
+++ b/ocrd_calamari/__init__.py
@@ -1 +1,5 @@
+__all__ = [
+    'CalamariRecognize'
+]
+
 from .recognize import CalamariRecognize
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..49ec960
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest
+coverage
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/base.py b/test/base.py
new file mode 100644
index 0000000..cea8ed3
--- /dev/null
+++ b/test/base.py
@@ -0,0 +1,10 @@
+# pylint: disable=unused-import
+
+import os
+import sys
+from unittest import TestCase, skip, main # pylint: disable=unused-import
+
+from test.assets import assets
+
+PWD = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(PWD + '/../ocrd')
diff --git a/test/test_recognize.py b/test/test_recognize.py
new file mode 100644
index 0000000..28ebcbd
--- /dev/null
+++ b/test/test_recognize.py
@@ -0,0 +1,52 @@
+import os
+import shutil
+import urllib.request
+
+from test.base import TestCase, main, assets, skip
+
+from ocrd.resolver import Resolver
+
+from ocrd_calamari import CalamariRecognize
+
+METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
+
+WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
+
+class TestCalamariRecognize(TestCase):
+
+    def setUp(self):
+        if os.path.exists(WORKSPACE_DIR):
+            shutil.rmtree(WORKSPACE_DIR)
+        os.makedirs(WORKSPACE_DIR)
+
+    def runTest(self):
+        resolver = Resolver()
+        workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
+
+        # XXX Work around data bug(?):
+        #     PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
+        os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'), exist_ok=True)
+        for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
+            urllib.request.urlretrieve(
+                "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
+                os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
+
+        # XXX Should remove GT text to really test this
+
+        CalamariRecognize(
+            workspace,
+            input_file_grp="OCR-D-GT-SEG-LINE",
+            output_file_grp="OCR-D-OCR-CALAMARI",
+            parameter={
+                'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json')
+            }
+        ).process()
+        workspace.save_mets()
+
+        page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
+        self.assertTrue(os.path.exists(page1))
+        with open(page1, 'r', encoding='utf-8') as f:
+            self.assertIn('verſchuldeten', f.read())
+
+if __name__ == '__main__':
+    main()