Merge pull request #70 from bertsky/patch-2

add checkpoint_dir content-type, remove checkpoint variant
fix/readme-no-checkpoint
Mike Gerber 3 years ago committed by GitHub
commit 1eb342ef65
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -14,8 +14,7 @@ jobs:
- run: locale-gen "en_US.UTF-8"; update-locale LC_ALL="en_US.UTF-8"
- checkout
- run: pip3 install --upgrade pip
- run: make install PIP_INSTALL="pip3 install"
- run: pip3 install -r requirements-test.txt
- run: make install deps-test PIP_INSTALL="pip3 install"
- run: make coverage LC_ALL=en_US.utf8
- codecov/upload

@ -3,6 +3,7 @@ PIP_INSTALL = pip3 install
GIT_CLONE = git clone
PYTHON = python3
PYTEST_ARGS = -W 'ignore::DeprecationWarning' -W 'ignore::FutureWarning'
MODEL = qurator-gt4histocr-1.0
# BEGIN-EVAL makefile-parser --make-help Makefile
@ -11,7 +12,7 @@ help:
@echo " Targets"
@echo ""
@echo " install Install ocrd_calamari"
@echo " gt4histocr-calamari1 Get GT4HistOCR Calamari model (from SBB)"
@echo " $(MODEL) Get Calamari model (from SBB)"
@echo " actevedef_718448162 Download example data"
@echo " deps-test Install testing python deps via pip"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@ -25,6 +26,7 @@ help:
@echo " PYTHON '$(PYTHON)'"
@echo " PIP_INSTALL '$(PIP_INSTALL)'"
@echo " GIT_CLONE '$(GIT_CLONE)'"
@echo " MODEL '$(MODEL)'"
# END-EVAL
@ -34,17 +36,14 @@ install:
# Get GT4HistOCR Calamari model (from SBB)
gt4histocr-calamari1:
mkdir -p gt4histocr-calamari1
cd gt4histocr-calamari1 && \
wget https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz && \
tar xfv model.tar.xz && \
rm model.tar.xz
# Download example data
$(MODEL):
ocrd resmgr download ocrd-calamari-recognize $@
# Download example data (not used currently)
actevedef_718448162:
wget https://qurator-data.de/examples/actevedef_718448162.zip && \
unzip actevedef_718448162.zip
wget https://qurator-data.de/examples/actevedef_718448162.zip \
&& unzip actevedef_718448162.zip \
&& rm actevedef_718448162.zip
@ -54,7 +53,7 @@ actevedef_718448162:
# Install testing python deps via pip
deps-test:
$(PIP) install -r requirements_test.txt
$(PIP_INSTALL) -r requirements-test.txt
# Clone OCR-D/assets to ./repo/assets
@ -73,15 +72,15 @@ assets-clean:
rm -rf test/assets
# Run unit tests
test: test/assets gt4histocr-calamari1
test: test/assets $(MODEL)
# declare -p HTTP_PROXY
$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
# Run unit tests and determine test coverage
coverage: test/assets gt4histocr-calamari1
coverage: test/assets $(MODEL)
coverage erase
make test PYTHON="coverage run"
coverage report
coverage html
.PHONY: assets-clean test
.PHONY: install assets-clean deps-test test coverage $(MODEL)

@ -20,11 +20,11 @@
"parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true, "default": "qurator-gt4histocr-1.0"
},
"checkpoint": {
"description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true
"type": "string",
"format": "uri",
"content-type": "text/directory",
"cacheable": true,
"default": "qurator-gt4histocr-1.0"
},
"voter": {
"description": "The voting algorithm to use",

@ -47,10 +47,8 @@ class CalamariRecognize(Processor):
"""
Set up the model prior to processing.
"""
if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
checkpoints = glob(self.parameter['checkpoint'])
resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
checkpoints = glob('%s/*.ckpt.json' % resolved)
self.predictor = MultiPredictor(checkpoints=checkpoints)
self.network_input_channels = self.predictor.predictors[0].network.input_channels
@ -244,18 +242,7 @@ class CalamariRecognize(Processor):
# Add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name, value=self.parameter[name])
for name in self.parameter.keys()])]))
self.add_metadata(pcgts)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(

@ -4,6 +4,9 @@ import os
import sys
from test.assets import assets
from ocrd_utils import initLogging
PWD = os.path.dirname(os.path.realpath(__file__))
sys.path.append(PWD + '/../ocrd')
initLogging()

@ -14,8 +14,7 @@ from .base import assets
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
CHECKPOINT_DIR = os.getenv('MODEL')
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
@ -31,14 +30,6 @@ def workspace():
resolver = Resolver()
workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
# XXX Work around data bug(?):
# PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
urllib.request.urlretrieve(
"https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
# The binarization options I have are:
#
# a. ocrd_kraken which tries to install clstm, whose installation is broken on my machine (protobuf)
@ -47,21 +38,22 @@ def workspace():
# c. just fumble with the original files
#
# So I'm going for option c.
for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
subprocess.call(['convert', ff, '-threshold', '50%', ff])
for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
imgf = workspace.download_file(imgf)
path = os.path.join(workspace.directory, imgf.local_filename)
subprocess.call(['mogrify', '-threshold', '50%', path])
# Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
# XXX Review data again
# XXX Make this more robust against namespace version changes
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
workspace.download_file(of)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
tree = etree.parse(ff)
path = os.path.join(workspace.directory, of.local_filename)
tree = etree.parse(path)
for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
e.getparent().remove(e)
tree.write(ff, xml_declaration=True, encoding="utf-8")
tree.write(path, xml_declaration=True, encoding="utf-8")
return workspace
@ -69,23 +61,7 @@ def workspace():
def test_recognize(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
}
).process()
workspace.save_mets()
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
assert os.path.exists(page1)
with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read()
def test_recognize_with_checkpoint_dir(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint_dir": CHECKPOINT_DIR,
@ -103,9 +79,9 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
caplog.set_level(logging.WARNING)
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
parameter={'checkpoint': CHECKPOINT}
parameter={'checkpoint_dir': CHECKPOINT_DIR}
).process()
interesting_log_messages = [t[2] for t in caplog.record_tuples if "Using raw image" in t[2]]
@ -115,10 +91,10 @@ def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(works
def test_word_segmentation(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"checkpoint_dir": CHECKPOINT_DIR,
"textequiv_level": "word", # Note that we're going down to word level here
}
).process()
@ -147,10 +123,10 @@ def test_word_segmentation(workspace):
def test_glyphs(workspace):
CalamariRecognize(
workspace,
input_file_grp="OCR-D-GT-SEG-LINE",
input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
output_file_grp="OCR-D-OCR-CALAMARI",
parameter={
"checkpoint": CHECKPOINT,
"checkpoint_dir": CHECKPOINT_DIR,
"textequiv_level": "glyph", # Note that we're going down to glyph level here
}
).process()

Loading…
Cancel
Save