Use GT segmentation to test

fix/readme-no-checkpoint
Gerber, Mike 4 years ago
parent 5b15dc5fd6
commit d8212ced93

@ -8,7 +8,7 @@ jobs:
docker: docker:
- image: ubuntu:19.04 - image: ubuntu:19.04
steps: steps:
- run: apt-get update ; apt-get install -y make git curl python3 python3-pip libtesseract-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr wget - run: apt-get update ; apt-get install -y make git curl python3 python3-pip wget
- checkout - checkout
- run: make install PIP_INSTALL="pip3 install" - run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt - run: pip3 install -r requirements-test.txt

@ -5,7 +5,7 @@ PYTHON = python
PIP_INSTALL = pip install PIP_INSTALL = pip install
# '$(GIT_CLONE)' # '$(GIT_CLONE)'
GIT_CLONE = git clone --depth 1 GIT_CLONE = git clone
# BEGIN-EVAL makefile-parser --make-help Makefile # BEGIN-EVAL makefile-parser --make-help Makefile
@ -42,7 +42,10 @@ calamari:
# Clone calamari_models repo # Clone calamari_models repo
calamari_models: calamari_models:
$(GIT_CLONE) https://github.com/chwick/calamari_models $(GIT_CLONE) -n https://github.com/chwick/calamari_models
# Check out the latest version that works with calamari-ocr==0.3.5:
git checkout f76b1d3ec
# pip install calamari # pip install calamari
calamari/build: calamari calamari_models calamari/build: calamari calamari_models

@ -1,3 +1,2 @@
pytest pytest
ocrd_tesserocr >= 0.4.0
coverage coverage

@ -1,60 +1,50 @@
import os import os
from os.path import join, exists
import shutil import shutil
import urllib.request
from test.base import TestCase, main, assets, skip from test.base import TestCase, main, assets, skip
from ocrd.resolver import Resolver from ocrd.resolver import Resolver
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_calamari import CalamariRecognize from ocrd_calamari import CalamariRecognize
#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
# as long as #96 remains, we cannot use workspaces which have local relative files:
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari' WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
class TestCalamariRecognize(TestCase): class TestCalamariRecognize(TestCase):
def setUp(self): def setUp(self):
if exists(WORKSPACE_DIR): if os.path.exists(WORKSPACE_DIR):
shutil.rmtree(WORKSPACE_DIR) shutil.rmtree(WORKSPACE_DIR)
os.makedirs(WORKSPACE_DIR) os.makedirs(WORKSPACE_DIR)
#skip("Takes too long")
def runTest(self): def runTest(self):
resolver = Resolver() resolver = Resolver()
workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR) workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
TesserocrSegmentRegion( # XXX Work around data bug(?):
workspace, # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is not something core can download
input_file_grp="OCR-D-IMG", os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
output_file_grp="OCR-D-SEG-BLOCK" for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
).process() urllib.request.urlretrieve(
workspace.save_mets() "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
TesserocrSegmentLine( # XXX The GT text should be removed first to really test this
workspace,
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE"
).process()
workspace.save_mets()
CalamariRecognize( CalamariRecognize(
workspace, workspace,
input_file_grp="OCR-D-SEG-LINE", input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI", output_file_grp="OCR-D-OCR-CALAMARI",
parameter={ parameter={
'checkpoint': 'calamari_models/fraktur_historical/*.ckpt.json' 'checkpoint': os.path.join(os.getcwd(), 'calamari_models/fraktur_19th_century/*.ckpt.json')
} }
).process() ).process()
workspace.save_mets() workspace.save_mets()
page1 = join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml') page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
self.assertTrue(exists(page1)) self.assertTrue(os.path.exists(page1))
with open(page1, 'r') as f: with open(page1, 'r') as f:
self.assertIn('verſchuldeten', f.read()) self.assertIn('verſchuldeten', f.read())

Loading…
Cancel
Save