From d8212ced9388ea74a9476aec3b9ae703c5c1d95a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 3 Dec 2019 16:23:41 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=85=20Use=20GT=20segmentation=20to=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .circleci/config.yml | 2 +- Makefile | 7 +++++-- requirements-test.txt | 1 - test/test_recognize.py | 42 ++++++++++++++++-------------------------- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 52fcfef..7f45b88 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: - image: ubuntu:19.04 steps: - - run: apt-get update ; apt-get install -y make git curl python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr wget + - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget - checkout - run: make install PIP_INSTALL="pip3 install" - run: pip3 install -r requirements-test.txt diff --git a/Makefile b/Makefile index da1eed5..707e8c1 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ PYTHON = python PIP_INSTALL = pip install # '$(GIT_CLONE)' -GIT_CLONE = git clone --depth 1 +GIT_CLONE = git clone # BEGIN-EVAL makefile-parser --make-help Makefile @@ -42,7 +42,10 @@ calamari: # Clone calamari_models repo calamari_models: - $(GIT_CLONE) https://github.com/chwick/calamari_models + $(GIT_CLONE) -n https://github.com/chwick/calamari_models + # Check out the latest version that works with calamari-ocr==0.3.5; each recipe line runs in its own shell, so cd into the clone first: + cd calamari_models && git checkout f76b1d3ec + # pip install calamari calamari/build: calamari calamari_models diff --git a/requirements-test.txt b/requirements-test.txt index dc069b5..49ec960 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,2 @@ pytest -ocrd_tesserocr >= 0.4.0 coverage diff --git a/test/test_recognize.py b/test/test_recognize.py index ed85485..545f25a 100644 --- a/test/test_recognize.py +++ 
b/test/test_recognize.py @@ -1,60 +1,50 @@ import os -from os.path import join, exists import shutil +import urllib.request from test.base import TestCase, main, assets, skip from ocrd.resolver import Resolver -from ocrd_tesserocr import TesserocrSegmentRegion -from ocrd_tesserocr import TesserocrSegmentLine - from ocrd_calamari import CalamariRecognize -#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') -# as long as #96 remains, we cannot use workspaces which have local relative files: -METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') +METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' class TestCalamariRecognize(TestCase): def setUp(self): - if exists(WORKSPACE_DIR): + if os.path.exists(WORKSPACE_DIR): shutil.rmtree(WORKSPACE_DIR) os.makedirs(WORKSPACE_DIR) - #skip("Takes too long") def runTest(self): resolver = Resolver() - workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR) + workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR) - TesserocrSegmentRegion( - workspace, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() - workspace.save_mets() + # XXX Work around data bug(?): + # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download + os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG')) + for f in ['INPUT_0017.tif', 'INPUT_0020.tif']: + urllib.request.urlretrieve( + "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, + os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) - TesserocrSegmentLine( - workspace, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() - workspace.save_mets() + # XXX Should remove GT text to really test this CalamariRecognize( workspace, - input_file_grp="OCR-D-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-LINE", 
output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - 'checkpoint': 'calamari_models/fraktur_historical/*.ckpt.json' + 'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json') } ).process() workspace.save_mets() - page1 = join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') - self.assertTrue(exists(page1)) + page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') + self.assertTrue(os.path.exists(page1)) with open(page1, 'r') as f: self.assertIn('verſchuldeten', f.read())