From b54ccf90f70b254bb72fbed83f4dbe72bb56d209 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Aug 2019 19:52:08 +0200 Subject: [PATCH 01/12] smoke test, circle ci Conflicts: Makefile ocrd_calamari/__init__.py --- .circleci/config.yml | 21 ++++++++++ .gitignore | 4 +- Makefile | 82 +++++++++++++++++++++++++++++++-------- ocrd_calamari/__init__.py | 4 ++ requirements-test.txt | 2 + test/__init__.py | 0 test/base.py | 10 +++++ test/test_recognize.py | 62 +++++++++++++++++++++++++++++ 8 files changed, 167 insertions(+), 18 deletions(-) create mode 100644 .circleci/config.yml create mode 100644 requirements-test.txt create mode 100644 test/__init__.py create mode 100644 test/base.py create mode 100644 test/test_recognize.py diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..deaaeb8 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,21 @@ +version: 2.1 +orbs: + codecov: codecov/codecov@1.0.5 + +jobs: + + build-python36: + docker: + - image: ubuntu:18.04 + steps: + - run: apt-get update ; apt-get install -y make git curl + - checkout + - run: make install + - run: pip install -r requirements-test.txt + - run: make coverage + - codecov/upload + +workflows: + build: + jobs: + - build-python36 diff --git a/.gitignore b/.gitignore index 1c2329e..d080392 100644 --- a/.gitignore +++ b/.gitignore @@ -102,7 +102,9 @@ venv.bak/ # mypy .mypy_cache/ + + /calamari /calamari_models /repo -/test +/test/assets diff --git a/Makefile b/Makefile index 15a546c..a31830f 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,11 @@ -GIT_CLONE = git clone --depth 1 +# '$(PYTHON)' +PYTHON = python + +# '$(PIP_INSTALL)' +PIP_INSTALL = pip install -# Docker tag -DOCKER_TAG = ocrd/calamari +# '$(GIT_CLONE)' +GIT_CLONE = git clone --depth 1 # BEGIN-EVAL makefile-parser --make-help Makefile @@ -9,31 +13,75 @@ help: @echo "" @echo " Targets" @echo "" - @echo " calamari git clone calamari" - @echo " calamari_models git clone calamari_models" - @echo " 
calamari/build Install calamari" - @echo " docker Build docker image" + @echo " install Install ocrd_calamari" + @echo " calamari Clone calamari repo" + @echo " calamari_models Clone calamari_models repo" + @echo " calamari/build pip install calamari" + @echo " deps-test Install testing python deps via pip" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" + @echo " test/assets Setup test assets" + @echo " assets-clean Remove symlinks in test/assets" + @echo " test Run unit tests" + @echo " coverage Run unit tests and determine test coverage" @echo "" @echo " Variables" @echo "" - @echo " DOCKER_TAG Docker tag" + @echo " PYTHON '$(PYTHON)'" + @echo " PIP_INSTALL '$(PIP_INSTALL)'" + @echo " GIT_CLONE '$(GIT_CLONE)'" # END-EVAL -# git clone calamari +# Install ocrd_calamari +install: + $(PIP_INSTALL) . + +# Clone calamari repo calamari: $(GIT_CLONE) https://github.com/chwick/calamari -# git clone calamari_models +# Clone calamari_models repo calamari_models: $(GIT_CLONE) https://github.com/chwick/calamari_models -# Install calamari +# pip install calamari calamari/build: calamari calamari_models - cd calamari &&\ - pip install -r requirements.txt ;\ - python setup.py install + cd calamari && $(PIP_INSTALL) . 
+ +# +# Assets and Tests +# + +# Install testing python deps via pip +deps-test: + $(PIP_INSTALL) -r requirements-test.txt + + +# Clone OCR-D/assets to ./repo/assets +repo/assets: + mkdir -p $(dir $@) + git clone https://github.com/OCR-D/assets "$@" + + +# Setup test assets +test/assets: repo/assets + mkdir -p $@ + cp -r -t $@ repo/assets/data/* + +# Remove symlinks in test/assets +assets-clean: + rm -rf test/assets + +# Run unit tests +test: test/assets calamari_models + # declare -p HTTP_PROXY + $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) + +# Run unit tests and determine test coverage +coverage: + coverage erase + make test PYTHON="coverage run" + coverage report + coverage html -# Build docker image -docker: - docker build -t '$(DOCKER_TAG)' . +.PHONY: assets-clean test diff --git a/ocrd_calamari/__init__.py b/ocrd_calamari/__init__.py index 683a3e2..f56b516 100644 --- a/ocrd_calamari/__init__.py +++ b/ocrd_calamari/__init__.py @@ -1 +1,5 @@ +__all__ = [ + 'CalamariRecognize' +] + from .recognize import CalamariRecognize diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..7aad3d0 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,2 @@ +pytest +ocrd_tesserocr >= 0.4.0 diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/base.py b/test/base.py new file mode 100644 index 0000000..cea8ed3 --- /dev/null +++ b/test/base.py @@ -0,0 +1,10 @@ +# pylint: disable=unused-import + +import os +import sys +from unittest import TestCase, skip, main # pylint: disable=unused-import + +from test.assets import assets + +PWD = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(PWD + '/../ocrd') diff --git a/test/test_recognize.py b/test/test_recognize.py new file mode 100644 index 0000000..ed85485 --- /dev/null +++ b/test/test_recognize.py @@ -0,0 +1,62 @@ +import os +from os.path import join, exists +import shutil + +from test.base import
TestCase, main, assets, skip + +from ocrd.resolver import Resolver + +from ocrd_tesserocr import TesserocrSegmentRegion +from ocrd_tesserocr import TesserocrSegmentLine + +from ocrd_calamari import CalamariRecognize + +#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') +# as long as #96 remains, we cannot use workspaces which have local relative files: +METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') + +WORKSPACE_DIR = '/tmp/test-ocrd-calamari' + +class TestCalamariRecognize(TestCase): + + def setUp(self): + if exists(WORKSPACE_DIR): + shutil.rmtree(WORKSPACE_DIR) + os.makedirs(WORKSPACE_DIR) + + #skip("Takes too long") + def runTest(self): + resolver = Resolver() + workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR) + + TesserocrSegmentRegion( + workspace, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK" + ).process() + workspace.save_mets() + + TesserocrSegmentLine( + workspace, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-LINE" + ).process() + workspace.save_mets() + + CalamariRecognize( + workspace, + input_file_grp="OCR-D-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + 'checkpoint': 'calamari_models/fraktur_historical/*.ckpt.json' + } + ).process() + workspace.save_mets() + + page1 = join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') + self.assertTrue(exists(page1)) + with open(page1, 'r') as f: + self.assertIn('verſchuldeten', f.read()) + +if __name__ == '__main__': + main() From 7b099352c0bbfadbaf82668fc3227c591f076369 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Aug 2019 19:54:35 +0200 Subject: [PATCH 02/12] ci: install pip and require models for coverage --- .circleci/config.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index deaaeb8..49abb41 100644 --- a/.circleci/config.yml +++ 
b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: - image: ubuntu:18.04 steps: - - run: apt-get update ; apt-get install -y make git curl + - run: apt-get update ; apt-get install -y make git curl python3-pip - checkout - run: make install - run: pip install -r requirements-test.txt diff --git a/Makefile b/Makefile index a31830f..da1eed5 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ test: test/assets calamari_models $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: +coverage: test/assets calamari_models coverage erase make test PYTHON="coverage run" coverage report From 404fc707ae557e33720dae311f9e623103f92140 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Aug 2019 19:59:03 +0200 Subject: [PATCH 03/12] ci: pip{,3} --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 49abb41..d5215b0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,8 +10,8 @@ jobs: steps: - run: apt-get update ; apt-get install -y make git curl python3-pip - checkout - - run: make install - - run: pip install -r requirements-test.txt + - run: make install PIP_INSTALL="pip3 install" + - run: pip3 install -r requirements-test.txt - run: make coverage - codecov/upload From 710f2096ef3b525031962ad434c5fe605e87d18e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Aug 2019 11:57:23 +0200 Subject: [PATCH 04/12] ci: install tesseract (again) --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d5215b0..62690a4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: - image: ubuntu:18.04 steps: - - run: apt-get update ; apt-get install -y make git curl python3-pip + - run: apt-get update ; apt-get install -y make git curl python3 python3-pip libtesseract-dev 
libleptonica-dev tesseract-ocr-eng tesseract-ocr wget - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt From 52ea047a714d474b3c26c385e434d156fa5435f1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Aug 2019 13:53:38 +0200 Subject: [PATCH 05/12] ci: try ubuntu 19.04 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 62690a4..52fcfef 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: build-python36: docker: - - image: ubuntu:18.04 + - image: ubuntu:19.04 steps: - run: apt-get update ; apt-get install -y make git curl python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr wget - checkout From 5b15dc5fd6b508a5a083de61ca9108b78ef4e7e6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Aug 2019 14:03:59 +0200 Subject: [PATCH 06/12] ci: require coverage --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index 7aad3d0..dc069b5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,2 +1,3 @@ pytest ocrd_tesserocr >= 0.4.0 +coverage From d8212ced9388ea74a9476aec3b9ae703c5c1d95a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:23:41 +0100 Subject: [PATCH 07/12] =?UTF-8?q?=E2=9C=85=20Use=20GT=20segmentation=20to?= =?UTF-8?q?=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 2 +- Makefile | 7 +++++-- requirements-test.txt | 1 - test/test_recognize.py | 42 ++++++++++++++++-------------------------- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 52fcfef..7f45b88 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: - image: ubuntu:19.04 steps: - - run: apt-get update 
; apt-get install -y make git curl python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr wget + - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt diff --git a/Makefile b/Makefile index da1eed5..707e8c1 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ PYTHON = python PIP_INSTALL = pip install # '$(GIT_CLONE)' -GIT_CLONE = git clone --depth 1 +GIT_CLONE = git clone # BEGIN-EVAL makefile-parser --make-help Makefile @@ -42,7 +42,10 @@ calamari: # Clone calamari_models repo calamari_models: - $(GIT_CLONE) https://github.com/chwick/calamari_models + $(GIT_CLONE) -n https://github.com/chwick/calamari_models + # Checkout latest version that works with calamari-ocr==0.3.5: + git checkout f76b1d3ec + # pip install calamari calamari/build: calamari calamari_models diff --git a/requirements-test.txt b/requirements-test.txt index dc069b5..49ec960 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,2 @@ pytest -ocrd_tesserocr >= 0.4.0 coverage diff --git a/test/test_recognize.py b/test/test_recognize.py index ed85485..545f25a 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -1,60 +1,50 @@ import os -from os.path import join, exists import shutil +import urllib.request from test.base import TestCase, main, assets, skip from ocrd.resolver import Resolver -from ocrd_tesserocr import TesserocrSegmentRegion -from ocrd_tesserocr import TesserocrSegmentLine - from ocrd_calamari import CalamariRecognize -#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') -# as long as #96 remains, we cannot use workspaces which have local relative files: -METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') +METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') WORKSPACE_DIR = 
'/tmp/test-ocrd-calamari' class TestCalamariRecognize(TestCase): def setUp(self): - if exists(WORKSPACE_DIR): + if os.path.exists(WORKSPACE_DIR): shutil.rmtree(WORKSPACE_DIR) os.makedirs(WORKSPACE_DIR) - #skip("Takes too long") def runTest(self): resolver = Resolver() - workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR) + workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR) - TesserocrSegmentRegion( - workspace, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() - workspace.save_mets() + # XXX Work around data bug(?): + # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download + os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG')) + for f in ['INPUT_0017.tif', 'INPUT_0020.tif']: + urllib.request.urlretrieve( + "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, + os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) - TesserocrSegmentLine( - workspace, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() - workspace.save_mets() + # XXX Should remove GT text to really test this CalamariRecognize( workspace, - input_file_grp="OCR-D-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-LINE", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - 'checkpoint': 'calamari_models/fraktur_historical/*.ckpt.json' + 'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json') } ).process() workspace.save_mets() - page1 = join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') - self.assertTrue(exists(page1)) + page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') + self.assertTrue(os.path.exists(page1)) with open(page1, 'r') as f: self.assertIn('verſchuldeten', f.read()) From 5a3ffeeb630db01f11f34d889a405bcb5d56ffe8 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:29:18 +0100 Subject: [PATCH 08/12] 
=?UTF-8?q?=F0=9F=90=9B=20Fix=20checking=20out=20cal?= =?UTF-8?q?amari=5Fmodels?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 707e8c1..9fabd2e 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ calamari: calamari_models: $(GIT_CLONE) -n https://github.com/chwick/calamari_models # Checkout latest version that works with calamari-ocr==0.3.5: - git checkout f76b1d3ec + cd calamari_models && git checkout f76b1d3ec # pip install calamari From 2fb37b14a2269a991b16f38da4480cd2cecb4cf2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:38:53 +0100 Subject: [PATCH 09/12] =?UTF-8?q?=E2=9C=85=20Use=20Ubuntu=2018.04=20LTS=20?= =?UTF-8?q?again=20(19.04=20is=20EOL=202020-01)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7f45b88..1c1c5b3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: build-python36: docker: - - image: ubuntu:19.04 + - image: ubuntu:18.04 steps: - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget - checkout From 010391b09b1e72082bc7cd9b871d2ac046d267ef Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:42:09 +0100 Subject: [PATCH 10/12] =?UTF-8?q?=E2=9C=85=20Only=20do=20the=20coverage=20?= =?UTF-8?q?on=20our=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .coveragerc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..3aca4eb --- /dev/null +++ b/.coveragerc @@ -0,0 +1,14 @@ +[run] +branch = True +source = + ocrd_calamari + +[report] +exclude_lines = + if self.debug: 
+ pragma: no cover + raise NotImplementedError + if __name__ == .__main__.: +ignore_errors = True +omit = + ocrd_calamari/cli.py From e1f57b263842c1141d39aa5b6ee151c1d6c1ad9b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:52:34 +0100 Subject: [PATCH 11/12] =?UTF-8?q?=F0=9F=90=9B=20Open=20our=20test=20result?= =?UTF-8?q?=20with=20UTF-8=20encoding=20(for=20Python=203.6=3F)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index 545f25a..28ebcbd 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -45,7 +45,7 @@ class TestCalamariRecognize(TestCase): page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') self.assertTrue(os.path.exists(page1)) - with open(page1, 'r') as f: + with open(page1, 'r', encoding='utf-8') as f: self.assertIn('verſchuldeten', f.read()) if __name__ == '__main__': From df35e799033255da1d91e2c6e58110c9038d8012 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 17:01:01 +0100 Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=93=8F=20README:=20Add=20CI/PyPI/Co?= =?UTF-8?q?decov=20badges?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ae93e8..dca1ffc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # ocrd_calamari -Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari). +> Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari). 
+ +[![image](https://circleci.com/gh/OCR-D/ocrd_calamari.svg?style=svg)](https://circleci.com/gh/OCR-D/ocrd_calamari) +[![image](https://img.shields.io/pypi/v/ocrd_calamari.svg)](https://pypi.org/project/ocrd_calamari/) +[![image](https://codecov.io/gh/OCR-D/ocrd_calamari/branch/master/graph/badge.svg)](https://codecov.io/gh/OCR-D/ocrd_calamari) ## Introduction