diff --git a/.circleci/config.yml b/.circleci/config.yml index b90ef37..87e3b2a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,8 +14,7 @@ jobs: - run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8" - checkout - run: pip3 install --upgrade pip - - run: make install PIP_INSTALL="pip3 install" - - run: pip3 install -r requirements-test.txt + - run: make install deps-test PIP_INSTALL="pip3 install" - run: make coverage LC_ALL=en_US.utf8 - codecov/upload diff --git a/Makefile b/Makefile index 00a8f69..56350f3 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install GIT_CLONE = git clone PYTHON = python3 PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning' +MODEL = qurator-gt4histocr-1.0 # BEGIN-EVAL makefile-parser --make-help Makefile @@ -11,7 +12,7 @@ help: @echo " Targets" @echo "" @echo " install Install ocrd_calamari" - @echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)" + @echo " $(MODEL) Get Calamari model (from SBB)" @echo " actevedef_718448162 Download example data" @echo " deps-test Install testing python deps via pip" @echo " repo/assets Clone OCR-D/assets to ./repo/assets" @@ -25,6 +26,7 @@ help: @echo " PYTHON '$(PYTHON)'" @echo " PIP_INSTALL '$(PIP_INSTALL)'" @echo " GIT_CLONE '$(GIT_CLONE)'" + @echo " MODEL '$(MODEL)'" # END-EVAL @@ -34,17 +36,14 @@ install: # Get GT4HistOCR Calamari model (from SBB) -gt4histocr-calamari1: - mkdir -p gt4histocr-calamari1 - cd gt4histocr-calamari1 && \ - wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \ - tar xfv model.tar.xz && \ - rm model.tar.xz - -# Download example data +$(MODEL): + ocrd resmgr download ocrd-calamari-recognize $@ + +# Download example data (not used currently) actevedef_718448162: - wget https://qurator-data.de/examples/actevedef_718448162.zip && \ - unzip actevedef_718448162.zip + wget https://qurator-data.de/examples/actevedef_718448162.zip \ + && unzip actevedef_718448162.zip \ + && rm actevedef_718448162.zip @@ -54,7 +53,7 @@ actevedef_718448162: # Install testing python deps via pip deps-test: - $(PIP) install -r requirements_test.txt + $(PIP_INSTALL) -r requirements-test.txt # Clone OCR-D/assets to ./repo/assets @@ -73,15 +72,15 @@ assets-clean: rm -rf test/assets # Run unit tests -test: test/assets gt4histocr-calamari1 +test: test/assets $(MODEL) # declare -p HTTP_PROXY $(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS) # Run unit tests and determine test coverage -coverage: test/assets gt4histocr-calamari1 +coverage: test/assets $(MODEL) coverage erase make test PYTHON="coverage run" coverage report coverage html -.PHONY: assets-clean test +.PHONY: install assets-clean deps-test test coverage $(MODEL) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index f5e8c91..7f6d2d7 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -20,11 +20,11 @@ "parameters": { "checkpoint_dir": { "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory", - "type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0" - }, - "checkpoint": { - "description": "The calamari model files (*.ckpt.json)", - "type": "string", "format": "file", "cacheable": true + "type": "string", + "format": "uri", + "content-type": "text/directory", + "cacheable": true, + "default": "qurator-gt4histocr-1.0" }, "voter": { "description": "The voting algorithm to use", diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index bf267d6..2e7bbc9 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -47,10 +47,8 @@ class CalamariRecognize(Processor): """ Set up the model prior to processing. """ - if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None): - resolved = self.resolve_resource(self.parameter['checkpoint_dir']) - self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved - checkpoints = glob(self.parameter['checkpoint']) + resolved = self.resolve_resource(self.parameter['checkpoint_dir']) + checkpoints = glob('%s/*.ckpt.json' % resolved) self.predictor = MultiPredictor(checkpoints=checkpoints) self.network_input_channels = self.predictor.predictors[0].network.input_channels @@ -244,18 +242,7 @@ class CalamariRecognize(Processor): # Add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, value=self.parameter[name]) - for name in self.parameter.keys()])])) - - + self.add_metadata(pcgts) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) self.workspace.add_file( diff --git a/test/base.py b/test/base.py index 79b0d1a..36c192c 100644 --- a/test/base.py +++ b/test/base.py @@ -4,6 +4,9 @@ import os import sys from test.assets import assets +from ocrd_utils import initLogging PWD = os.path.dirname(os.path.realpath(__file__)) sys.path.append(PWD + '/../ocrd') + +initLogging() diff --git a/test/test_recognize.py b/test/test_recognize.py index 976469f..081f190 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -14,8 +14,7 @@ from .base import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = '/tmp/test-ocrd-calamari' -CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') -CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json') +CHECKPOINT_DIR = os.getenv('MODEL') # Because XML namespace versions are so much fun, we not only use one, we use TWO! NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } @@ -31,14 +30,6 @@ def workspace(): resolver = Resolver() workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR) - # XXX Work around data bug(?): - # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download - os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG')) - for f in ['INPUT_0017.tif', 'INPUT_0020.tif']: - urllib.request.urlretrieve( - "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f, - os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)) - # The binarization options I have are: # # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf) @@ -47,21 +38,22 @@ def workspace(): # c. just fumble with the original files # # So I'm going for option c. - for f in ['INPUT_0017.tif', 'INPUT_0020.tif']: - ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f) - subprocess.call(['convert', ff, '-threshold', '50%', ff]) + for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"): + imgf = workspace.download_file(imgf) + path = os.path.join(workspace.directory, imgf.local_filename) + subprocess.call(['mogrify', '-threshold', '50%', path]) # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text # XXX Review data again # XXX Make this more robust against namespace version changes - for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"): + for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"): workspace.download_file(of) - for to_remove in ["//pc:Word", "//pc:TextEquiv"]: - for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")): - tree = etree.parse(ff) + path = os.path.join(workspace.directory, of.local_filename) + tree = etree.parse(path) + for to_remove in ["//pc:Word", "//pc:TextEquiv"]: for e in tree.xpath(to_remove, namespaces=NSMAP_GT): e.getparent().remove(e) - tree.write(ff, xml_declaration=True, encoding="utf-8") + tree.write(path, xml_declaration=True, encoding="utf-8") return workspace @@ -69,23 +61,7 @@ def workspace(): def test_recognize(workspace): CalamariRecognize( workspace, - input_file_grp="OCR-D-GT-SEG-LINE", - output_file_grp="OCR-D-OCR-CALAMARI", - parameter={ - "checkpoint": CHECKPOINT, - } - ).process() - workspace.save_mets() - - page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") - assert os.path.exists(page1) - with open(page1, "r", encoding="utf-8") as f: - assert "verſchuldeten" in f.read() - -def test_recognize_with_checkpoint_dir(workspace): - CalamariRecognize( - workspace, - input_file_grp="OCR-D-GT-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ "checkpoint_dir": CHECKPOINT_DIR, @@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works caplog.set_level(logging.WARNING) CalamariRecognize( workspace, - input_file_grp="OCR-D-GT-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", output_file_grp="OCR-D-OCR-CALAMARI-BROKEN", - parameter={'checkpoint': CHECKPOINT} + parameter={'checkpoint_dir': CHECKPOINT_DIR} ).process() interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]] @@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works def test_word_segmentation(workspace): CalamariRecognize( workspace, - input_file_grp="OCR-D-GT-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - "checkpoint": CHECKPOINT, + "checkpoint_dir": CHECKPOINT_DIR, "textequiv_level": "word", # Note that we're going down to word level here } ).process() @@ -147,10 +123,10 @@ def test_word_segmentation(workspace): def test_glyphs(workspace): CalamariRecognize( workspace, - input_file_grp="OCR-D-GT-SEG-LINE", + input_file_grp="OCR-D-GT-SEG-WORD-GLYPH", output_file_grp="OCR-D-OCR-CALAMARI", parameter={ - "checkpoint": CHECKPOINT, + "checkpoint_dir": CHECKPOINT_DIR, "textequiv_level": "glyph", # Note that we're going down to glyph level here } ).process()