ocrd_calamari/test/test_recognize.py

import os
import shutil
import subprocess
import urllib.request

from test.base import TestCase, main, assets, skip

from ocrd.resolver import Resolver

from ocrd_calamari import CalamariRecognize

# URL of the METS file of the Kant sample data, looked up through the test assets helper.
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')

# Directory handed to the resolver as dst_dir; setUp() deletes and recreates it for every run.
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'

class TestCalamariRecognize(TestCase):
    """Smoke test: run CalamariRecognize on the Kant sample workspace."""

    def setUp(self):
        """Start from an empty WORKSPACE_DIR, removing leftovers of earlier runs."""
        if os.path.exists(WORKSPACE_DIR):
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def runTest(self):
        """Recognize the sample pages and check the first output page.

        Steps: create the workspace from METS_KANT, fetch the two page images
        the PAGE-XML refers to, convert them to grayscale in place, run
        CalamariRecognize on the "OCR-D-GT-SEG-LINE" file group and assert that
        the first output file exists and contains an expected word.

        Raises:
            subprocess.CalledProcessError: if the ``convert`` command exits
                with a non-zero status.
        """
        resolver = Resolver()
        workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)

        image_names = ['INPUT_0017.tif', 'INPUT_0020.tif']
        image_dir = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG')

        # XXX Work around data bug(?):
        #     PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
        # exist_ok: do not fail if the directory is already there.
        os.makedirs(image_dir, exist_ok=True)
        for name in image_names:
            urllib.request.urlretrieve(
                    "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + name,
                    os.path.join(image_dir, name))

        # The binarization options I have are:
        #
        # a. ocrd_kraken which tries to install cltsm, whose installation is broken on my machine (protobuf)
        # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
        #    machine
        # c. just fumble with the original files
        #
        # So I'm going for option c.
        for name in image_names:
            image_path = os.path.join(image_dir, name)
            # check_call (not call): a failing conversion must abort the test
            # instead of silently feeding unconverted images to the recognizer.
            subprocess.check_call(['convert', image_path, '-colorspace', 'Gray', image_path])

        # XXX Should remove GT text to really test this

        CalamariRecognize(
            workspace,
            input_file_grp="OCR-D-GT-SEG-LINE",
            output_file_grp="OCR-D-OCR-CALAMARI",
            parameter={
                # Model checkpoints are looked up relative to the current working directory.
                'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
            }
        ).process()
        workspace.save_mets()

        page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
        self.assertTrue(os.path.exists(page1))
        # Read explicitly as UTF-8: the expected word contains a long s.
        with open(page1, 'r', encoding='utf-8') as f:
            self.assertIn('verſchuldeten', f.read())

# Run the tests when this module is executed directly as a script.
if __name__ == '__main__':
    main()
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
+								import os
 								import shutil
-												✅ Fix tests by 1. binarizing and 2. use the GT4HistOCR model

											
										
										
											5 years ago
+								import subprocess
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								import urllib.request
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
 								from test.base import TestCase, main, assets, skip
 								from ocrd.resolver import Resolver
 								from ocrd_calamari import CalamariRecognize
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								METS_KANT = assets.url_of('kant_aufklaerung_1784-page-block-line-word_glyph/data/mets.xml')
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
 								WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
 								class TestCalamariRecognize(TestCase):
 								    def setUp(self):
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								        if os.path.exists(WORKSPACE_DIR):
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
+								            shutil.rmtree(WORKSPACE_DIR)
 								        os.makedirs(WORKSPACE_DIR)
 								    def runTest(self):
 								        resolver = Resolver()
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								        workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								        # XXX Work around data bug(?):
 								        #     PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
 								        os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
 								        for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
 								            urllib.request.urlretrieve(
 								                    "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
 								                    os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
-												✅ Fix tests by 1. binarizing and 2. use the GT4HistOCR model

											
										
										
											5 years ago
+								        # The binarization options I have are:
 								        #
 								        # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
 								        # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
 								        #    machine
 								        # c. just fumble with the original files
 								        #
 								        # So I'm going for option c.
 								        for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
 								            ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
 								            subprocess.call(['convert', ff, '-colorspace', 'Gray', ff])
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								        # XXX Should remove GT text to really test this
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
 								        CalamariRecognize(
 								            workspace,
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								            input_file_grp="OCR-D-GT-SEG-LINE",
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
+								            output_file_grp="OCR-D-OCR-CALAMARI",
 								            parameter={
-												✅ Fix tests by 1. binarizing and 2. use the GT4HistOCR model

											
										
										
											5 years ago
+								                'checkpoint': os.path.join(os.getcwd(), 'gt4histocr-calamari/*.ckpt.json')
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
+								            }
 								        ).process()
 								        workspace.save_mets()
-												✅ Use GT segmentation to test

											
										
										
											5 years ago
+								        page1 = os.path.join(workspace.directory, 'OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml')
 								        self.assertTrue(os.path.exists(page1))
-												🐛 Open our test result with UTF-8 encoding (for Python 3.6?)

											
										
										
											5 years ago
+								        with open(page1, 'r', encoding='utf-8') as f:
-												smoke test, circle ci

Conflicts:
	Makefile
	ocrd_calamari/__init__.py

											
										
										
											5 years ago
+								            self.assertIn('verſchuldeten', f.read())
 								if __name__ == '__main__':
 								    main()