diff --git a/README.md b/README.md index d662189..079f2a8 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,13 @@ Finally recognize the text using ocrd_calamari and the downloaded model: ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI ``` +or + +``` +ocrd-calamari-recognize -P checkpoint_dir ../gt4histocr-calamari1 -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +``` + + You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions for additional parameters and default values. diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 4494679..691eeba 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -18,6 +18,10 @@ "OCR-D-OCR-CALAMARI" ], "parameters": { + "checkpoint_dir": { + "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory", + "type": "string", "format": "file", "cacheable": true + }, "checkpoint": { "description": "The calamari model files (*.ckpt.json)", "type": "string", "format": "file", "cacheable": true diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 381cac3..102c927 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -40,6 +40,8 @@ class CalamariRecognize(Processor): def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL + if self.parameter.get('checkpoint_dir'): + self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir'] checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) diff --git a/test/test_recognize.py b/test/test_recognize.py index 0d23c1f..7926404 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -14,7 +14,8 @@ from .base import assets METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml') WORKSPACE_DIR = 
'/tmp/test-ocrd-calamari' -CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json') +CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1') +CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json') # Because XML namespace versions are so much fun, we not only use one, we use TWO! NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" } @@ -81,6 +82,22 @@ def test_recognize(workspace): with open(page1, "r", encoding="utf-8") as f: assert "verſchuldeten" in f.read() +def test_recognize_with_checkpoint_dir(workspace): + CalamariRecognize( + workspace, + input_file_grp="OCR-D-GT-SEG-LINE", + output_file_grp="OCR-D-OCR-CALAMARI", + parameter={ + "checkpoint_dir": CHECKPOINT_DIR, + } + ).process() + workspace.save_mets() + + page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml") + assert os.path.exists(page1) + with open(page1, "r", encoding="utf-8") as f: + assert "verſchuldeten" in f.read() + def test_word_segmentation(workspace): CalamariRecognize(