mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-09 03:39:55 +02:00
commit
e7fb432e35
5 changed files with 32 additions and 1 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -110,3 +110,4 @@ venv.bak/
|
||||||
/actevedef_718448162*
|
/actevedef_718448162*
|
||||||
/repo
|
/repo
|
||||||
/test/assets
|
/test/assets
|
||||||
|
gt4histocr-calamari*
|
||||||
|
|
|
@ -69,6 +69,13 @@ Finally recognize the text using ocrd_calamari and the downloaded model:
|
||||||
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
ocrd-calamari-recognize -p '{ "checkpoint": "../gt4histocr-calamari1/*.ckpt.json" }' -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
||||||
```
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
ocrd-calamari-recognize -P checkpoint_dir ../gt4histocr-calamari1 -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
|
You may want to have a look at the [ocrd-tool.json](ocrd_calamari/ocrd-tool.json) descriptions
|
||||||
for additional parameters and default values.
|
for additional parameters and default values.
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,10 @@
|
||||||
"OCR-D-OCR-CALAMARI"
|
"OCR-D-OCR-CALAMARI"
|
||||||
],
|
],
|
||||||
"parameters": {
|
"parameters": {
|
||||||
|
"checkpoint_dir": {
|
||||||
|
"description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
|
||||||
|
"type": "string", "format": "file", "cacheable": true
|
||||||
|
},
|
||||||
"checkpoint": {
|
"checkpoint": {
|
||||||
"description": "The calamari model files (*.ckpt.json)",
|
"description": "The calamari model files (*.ckpt.json)",
|
||||||
"type": "string", "format": "file", "cacheable": true
|
"type": "string", "format": "file", "cacheable": true
|
||||||
|
|
|
@ -42,6 +42,8 @@ class CalamariRecognize(Processor):
|
||||||
|
|
||||||
def _init_calamari(self):
|
def _init_calamari(self):
|
||||||
|
|
||||||
|
if self.parameter.get('checkpoint_dir', None):
|
||||||
|
self.parameter['checkpoint'] = '%s/*.ckpt.json' % self.parameter['checkpoint_dir']
|
||||||
checkpoints = glob(self.parameter['checkpoint'])
|
checkpoints = glob(self.parameter['checkpoint'])
|
||||||
self.predictor = MultiPredictor(checkpoints=checkpoints)
|
self.predictor = MultiPredictor(checkpoints=checkpoints)
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,8 @@ from .base import assets
|
||||||
|
|
||||||
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
|
METS_KANT = assets.url_of('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml')
|
||||||
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
WORKSPACE_DIR = '/tmp/test-ocrd-calamari'
|
||||||
CHECKPOINT = os.path.join(os.getcwd(), 'gt4histocr-calamari1/*.ckpt.json')
|
CHECKPOINT_DIR = os.path.join(os.getcwd(), 'gt4histocr-calamari1')
|
||||||
|
CHECKPOINT = os.path.join(CHECKPOINT_DIR, '*.ckpt.json')
|
||||||
|
|
||||||
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
|
# Because XML namespace versions are so much fun, we not only use one, we use TWO!
|
||||||
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
NSMAP = { "pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" }
|
||||||
|
@ -81,6 +82,22 @@ def test_recognize(workspace):
|
||||||
with open(page1, "r", encoding="utf-8") as f:
|
with open(page1, "r", encoding="utf-8") as f:
|
||||||
assert "verſchuldeten" in f.read()
|
assert "verſchuldeten" in f.read()
|
||||||
|
|
||||||
|
def test_recognize_with_checkpoint_dir(workspace):
|
||||||
|
CalamariRecognize(
|
||||||
|
workspace,
|
||||||
|
input_file_grp="OCR-D-GT-SEG-LINE",
|
||||||
|
output_file_grp="OCR-D-OCR-CALAMARI",
|
||||||
|
parameter={
|
||||||
|
"checkpoint_dir": CHECKPOINT_DIR,
|
||||||
|
}
|
||||||
|
).process()
|
||||||
|
workspace.save_mets()
|
||||||
|
|
||||||
|
page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
|
||||||
|
assert os.path.exists(page1)
|
||||||
|
with open(page1, "r", encoding="utf-8") as f:
|
||||||
|
assert "verſchuldeten" in f.read()
|
||||||
|
|
||||||
|
|
||||||
def test_word_segmentation(workspace):
|
def test_word_segmentation(workspace):
|
||||||
CalamariRecognize(
|
CalamariRecognize(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue