implement "checkpoint_dir" parameter as a simpler alternative to "checkpoint"

fix/readme-no-checkpoint
Konstantin Baierer 4 years ago
parent fe973e58db
commit 83adfcfd5a

@ -69,6 +69,13 @@ Finally recognize the text using ocrd_calamari and the downloaded model:
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
``` ```
or
```
ocrd-calamari-recognize -P checkpoint_dir ../gt4histocr-calamari1 -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
```
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
for additional parameters and default values. for additional parameters and default values.

@ -18,6 +18,10 @@
"OCR-D-OCR-CALAMARI" "OCR-D-OCR-CALAMARI"
], ],
"parameters": { "parameters": {
"checkpoint_dir": {
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
"type": "string", "format": "file", "cacheable": true
},
"checkpoint": { "checkpoint": {
"description": "The calamari model files (*.ckpt.json)", "description": "The calamari model files (*.ckpt.json)",
"type": "string", "format": "file", "cacheable": true "type": "string", "format": "file", "cacheable": true

@ -40,6 +40,8 @@ class CalamariRecognize(Processor):
def _init_calamari(self): def _init_calamari(self):
os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
# "checkpoint_dir" is an optional parameter with no declared default, so the
# key may be missing when the caller only passes "checkpoint"
# (NOTE(review): assumes the framework does not pre-populate absent keys —
# confirm). Use .get() so that case does not raise KeyError. When a directory
# is given, it overrides "checkpoint" with a glob matching every model file
# in that directory.
if self.parameter.get('checkpoint_dir'):
    self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir']
checkpoints = glob(self.parameter['checkpoint']) checkpoints = glob(self.parameter['checkpoint'])
self.predictor = MultiPredictor(checkpoints=checkpoints) self.predictor = MultiPredictor(checkpoints=checkpoints)

@ -14,7 +14,8 @@ from .base import assets
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
WORKSPACE_DIR = '/tmp/test-ocrd-calamari' WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json') CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
# Because XML namespace versions are so much fun, we not only use one, we use TWO! # Because XML namespace versions are so much fun, we not only use one, we use TWO!
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
@ -81,6 +82,22 @@ def test_recognize(workspace):
with open(page1, "r", encoding="utf-8") as f: with open(page1, "r", encoding="utf-8") as f:
assert "verſchuldeten" in f.read() assert "verſchuldeten" in f.read()
def test_recognize_with_checkpoint_dir(workspace):
    """Check that recognition works when the model is given via ``checkpoint_dir``.

    Runs ``CalamariRecognize`` on the ``OCR-D-GT-SEG-LINE`` file group with
    only the ``checkpoint_dir`` parameter set, then asserts that the first
    output page exists and contains the expected word.
    """
    # Derive the model directory from the CHECKPOINT glob pattern
    # ("<dir>/*.ckpt.json") so the test points at the same models as the
    # "checkpoint"-based tests.
    checkpoint_dir = os.path.dirname(CHECKPOINT)

    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            # The key must be spelled exactly like the parameter declared in
            # ocrd-tool.json; a misspelled key would not select any model.
            "checkpoint_dir": checkpoint_dir,
        }
    ).process()
    workspace.save_mets()

    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    with open(page1, "r", encoding="utf-8") as f:
        assert "verſchuldeten" in f.read()
def test_word_segmentation(workspace): def test_word_segmentation(workspace):
CalamariRecognize( CalamariRecognize(

Loading…
Cancel
Save