From dbe43e2316fa834e44116b61bd4cde7e5528318d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 10:41:55 +0200 Subject: [PATCH 01/16] =?UTF-8?q?=F0=9F=9A=A7=20Process=20lines?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/cli.py | 10 ++++++ ocrd_calamari/config.py | 4 +++ ocrd_calamari/ocr.py | 70 +++++++++++++++++++++++++----------- ocrd_calamari/ocrd-tool.json | 14 +++++--- requirements.txt | 4 +++ setup.py | 30 ++++++++++++++++ 6 files changed, 108 insertions(+), 24 deletions(-) create mode 100644 ocrd_calamari/cli.py create mode 100644 ocrd_calamari/config.py create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/ocrd_calamari/cli.py b/ocrd_calamari/cli.py new file mode 100644 index 0000000..7a28dad --- /dev/null +++ b/ocrd_calamari/cli.py @@ -0,0 +1,10 @@ +import click + +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_calamari.ocr import CalamariOcr + + +@click.command() +@ocrd_cli_options +def ocrd_calamari_ocr(*args, **kwargs): + return ocrd_cli_wrap_processor(CalamariOcr, *args, **kwargs) diff --git a/ocrd_calamari/config.py b/ocrd_calamari/config.py new file mode 100644 index 0000000..01e0b23 --- /dev/null +++ b/ocrd_calamari/config.py @@ -0,0 +1,4 @@ +import json +from pkg_resources import resource_string + +OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 2dd6038..6a793e0 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -1,18 +1,46 @@ from __future__ import absolute_import -from calamari_ocr.scripts.predict import run -log = getLogger('processor.KrakenOcr') +from glob import glob -class KrakenOcr(Processor): +import numpy as np +from calamari_ocr.ocr import MultiPredictor +from calamari_ocr.ocr.voting import voter_from_proto +from calamari_ocr.proto import VoterParams +from ocrd import Processor +from ocrd.logging import getLogger +from ocrd.model import ocrd_page +from ocrd.utils import polygon_from_points + +from ocrd_calamari.config import OCRD_TOOL + +log = getLogger('processor.CalamariOcr') + +# TODO: Should this be "recognize", not "ocr" akin ocrd_tesserocr? + + +class CalamariOcr(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr'] - super(KrakenOcr, self).__init__(*args, **kwargs) + super(CalamariOcr, self).__init__(*args, **kwargs) + + + def _init_calamari(self): + checkpoints = glob('/home/mike/devel/experiments/train-calamari-gt4histocr/models/*.ckpt.json') # XXX + self.predictor = MultiPredictor(checkpoints=checkpoints) + + voter_params = VoterParams() + voter_params.type = VoterParams.Type.Value('confidence_voter_default_ctc'.upper()) + self.voter = voter_from_proto(voter_params) + def process(self): """ - Performs the binarization. + Performs the recognition. """ + + self._init_calamari() + for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) pcgts = ocrd_page.from_file(self.workspace.download_file(input_file)) @@ -20,20 +48,22 @@ class KrakenOcr(Processor): log.info("pcgts %s", pcgts) for region in pcgts.get_Page().get_TextRegion(): textlines = region.get_TextLine() - log.info("About to binarize %i lines of region '%s'", len(textlines), region.id) + log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) for (line_no, line) in enumerate(textlines): - log.debug("Binarizing line '%s' in region '%s'", line_no, region.id) - image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points)) - print(dir(kraken.binarization)) - bin_image = kraken.binarization.nlbin(image) - bin_image_bytes = io.BytesIO() - bin_image.save(bin_image_bytes, format='PNG') - ID = concat_padded(self.output_file_grp, n) - self.add_output_file( - ID=ID, - file_grp=self.output_file_grp, - basename="%s.bin.png" % ID, - mimetype='image/png', - content=bin_image_bytes.getvalue() - ) + log.debug("Recognizing line '%s' in region '%s'", line_no, region.id) + image = self.workspace.resolve_image_as_pil(image_url, + polygon_from_points(line.get_Coords().points)) + image_np = np.array(image, dtype=np.uint8) # XXX better way? + + raw_results = list(self.predictor.predict_raw([image_np], progress_bar=False))[0] + + for i, p in enumerate(raw_results): + p.prediction.id = "fold_{}".format(i) + + prediction = self.voter.vote_prediction_result(raw_results) + prediction.id = "voted" + print('***', prediction.sentence) + print(prediction.avg_char_probability) + for raw_result in raw_results: + print(raw_result.sentence) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 6c0b0ad..a2a8c4f 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -10,13 +10,19 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize lines with kraken", + "description": "Recognize lines with Calamari", + "input_file_grp": [ + "OCR-D-SEG-LINE" + ], + "output_file_grp": [ + "OCR-D-OCR-CALAMARI" + ], "parameters": { - "checkpoint": {"type": "string", "format": "file", "cacheable": true}, + "XXX checkpoint": {"type": "string", "format": "file", "cacheable": true}, "processes": {"type": "number", "default": 1}, "batch_size": {"type": "number", "default": 1}, - "voter": {"type": "string", "default": "confidence_voter_default_ctc"}, - "extended_prediction_data_format": {"type": "string", "default": "json"}, + "XXX voter": {"type": "string", "default": "confidence_voter_default_ctc"}, + "XXXX extended_prediction_data_format": {"type": "string", "default": "json"}, "XXX output_dir": "TODO", "XXX extended_prediction_data": "TODO" } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1c3fd3e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +calamari-ocr +tensorflow-gpu +click +ocrd \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8eb4533 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +""" +Installs one executable: + + - ocrd_calamari_ocr +""" +import codecs + +from setuptools import setup, find_packages + +setup( + name='ocrd_calamari', + version='0.0.1', + description='Calamari bindings', + long_description=codecs.open('README.md', encoding='utf-8').read(), + author='Konstantin Baierer, Mike Gerber', + author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', + url='https://github.com/OCR-D/ocrd_calamari', # XXX + license='Apache License 2.0', + packages=find_packages(exclude=('tests', 'docs')), + install_requires=open('requirements.txt').read().split('\n'), + package_data={ + '': ['*.json', '*.yml', '*.yaml'], + }, + entry_points={ + 'console_scripts': [ + 'ocrd-calamari-ocr=ocrd_calamari.cli:ocrd_calamari_ocr', + ] + }, +) From b9e38487bd0aa3aa3e36f5b017946c15978a9b1c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 10:49:35 +0200 Subject: [PATCH 02/16] =?UTF-8?q?=F0=9F=9A=A7=20Extract=20a=20method=20to?= =?UTF-8?q?=20resolve=20an=20image=20as=20a=20Numpy=20array?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocr.py | 9 ++++----- requirements.txt | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 6a793e0..24cea7a 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -24,7 +24,6 @@ class CalamariOcr(Processor): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr'] super(CalamariOcr, self).__init__(*args, **kwargs) - def _init_calamari(self): checkpoints = glob('/home/mike/devel/experiments/train-calamari-gt4histocr/models/*.ckpt.json') # XXX self.predictor = MultiPredictor(checkpoints=checkpoints) @@ -33,6 +32,8 @@ class CalamariOcr(Processor): voter_params.type = VoterParams.Type.Value('confidence_voter_default_ctc'.upper()) self.voter = voter_from_proto(voter_params) + def resolve_image_as_np(self, image_url, coords): + return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8) def process(self): """ @@ -51,12 +52,10 @@ class CalamariOcr(Processor): log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) for (line_no, line) in enumerate(textlines): log.debug("Recognizing line '%s' in region '%s'", line_no, region.id) - image = self.workspace.resolve_image_as_pil(image_url, - polygon_from_points(line.get_Coords().points)) - image_np = np.array(image, dtype=np.uint8) # XXX better way? - raw_results = list(self.predictor.predict_raw([image_np], progress_bar=False))[0] + image = self.resolve_image_as_np(image_url, polygon_from_points(line.get_Coords().points)) + raw_results = list(self.predictor.predict_raw([image], progress_bar=False))[0] for i, p in enumerate(raw_results): p.prediction.id = "fold_{}".format(i) diff --git a/requirements.txt b/requirements.txt index 1c3fd3e..552c477 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +numpy calamari-ocr tensorflow-gpu click From f62332223b0e81ecf2e73b787fc9f54dd087b06d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 11:12:23 +0200 Subject: [PATCH 03/16] =?UTF-8?q?=F0=9F=9A=A7=20Save=20results=20in=20the?= =?UTF-8?q?=20workspace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocr.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 24cea7a..2930bb2 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -1,15 +1,18 @@ from __future__ import absolute_import +import os from glob import glob import numpy as np from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams -from ocrd import Processor +from ocrd import Processor, MIMETYPE_PAGE from ocrd.logging import getLogger from ocrd.model import ocrd_page -from ocrd.utils import polygon_from_points +from ocrd.model.ocrd_page import to_xml +from ocrd.model.ocrd_page_generateds import TextEquivType +from ocrd.utils import polygon_from_points, concat_padded from ocrd_calamari.config import OCRD_TOOL @@ -35,6 +38,12 @@ class CalamariOcr(Processor): def resolve_image_as_np(self, image_url, coords): return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8) + def _make_file_id(self, input_file, n): + file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + if file_id == input_file.ID: + file_id = concat_padded(self.output_file_grp, n) + return file_id + def process(self): """ Performs the recognition. @@ -62,7 +71,16 @@ class CalamariOcr(Processor): prediction = self.voter.vote_prediction_result(raw_results) prediction.id = "voted" - print('***', prediction.sentence) - print(prediction.avg_char_probability) - for raw_result in raw_results: - print(raw_result.sentence) + line_text = prediction.sentence + line_conf = prediction.avg_char_probability + + line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf)) + + file_id = self._make_file_id(input_file, n) + self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + pageId=input_file.pageId, + mimetype=MIMETYPE_PAGE, + local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), + content=to_xml(pcgts)) From 4df04ac6d763b4e692be702ac819d5f6e0ab03fd Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 12:31:46 +0200 Subject: [PATCH 04/16] =?UTF-8?q?=F0=9F=9A=A7=20Update=20to=20ocrd=20>=3D?= =?UTF-8?q?=201.0.0b11?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocr.py | 14 +++++++------- requirements.txt | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 2930bb2..f0dc7d2 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -7,12 +7,12 @@ import numpy as np from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams -from ocrd import Processor, MIMETYPE_PAGE -from ocrd.logging import getLogger -from ocrd.model import ocrd_page -from ocrd.model.ocrd_page import to_xml -from ocrd.model.ocrd_page_generateds import TextEquivType -from ocrd.utils import polygon_from_points, concat_padded +from ocrd import Processor +from ocrd_modelfactory import page_from_file +from ocrd_models import ocrd_page +from ocrd_models.ocrd_page import to_xml +from ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE from ocrd_calamari.config import OCRD_TOOL @@ -53,7 +53,7 @@ class CalamariOcr(Processor): for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) - pcgts = ocrd_page.from_file(self.workspace.download_file(input_file)) + pcgts = page_from_file(self.workspace.download_file(input_file)) image_url = pcgts.get_Page().imageFilename log.info("pcgts %s", pcgts) for region in pcgts.get_Page().get_TextRegion(): diff --git a/requirements.txt b/requirements.txt index 552c477..9d528c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ numpy calamari-ocr tensorflow-gpu click -ocrd \ No newline at end of file +ocrd >= 1.0.0b11 \ No newline at end of file From 3b02da8da2d54cffb0864e893d78b78cf2327813 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 12:50:11 +0200 Subject: [PATCH 05/16] =?UTF-8?q?=F0=9F=9A=A7=20Set=20TensorFlow=20minimum?= =?UTF-8?q?=20log=20level?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/config.py | 1 + ocrd_calamari/ocr.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ocrd_calamari/config.py b/ocrd_calamari/config.py index 01e0b23..6141ca2 100644 --- a/ocrd_calamari/config.py +++ b/ocrd_calamari/config.py @@ -2,3 +2,4 @@ import json from pkg_resources import resource_string OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) +TF_CPP_MIN_LOG_LEVEL = '3' # '3' == ERROR diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index f0dc7d2..181fcd7 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -14,7 +14,7 @@ from ocrd_models.ocrd_page import to_xml from ocrd_models.ocrd_page_generateds import TextEquivType from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE -from ocrd_calamari.config import OCRD_TOOL +from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL log = getLogger('processor.CalamariOcr') @@ -28,6 +28,8 @@ class CalamariOcr(Processor): super(CalamariOcr, self).__init__(*args, **kwargs) def _init_calamari(self): + os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL + checkpoints = glob('/home/mike/devel/experiments/train-calamari-gt4histocr/models/*.ckpt.json') # XXX self.predictor = MultiPredictor(checkpoints=checkpoints) From 67aa3db3571cfbdf81dac094b599a54277c99eed Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 13:38:14 +0200 Subject: [PATCH 06/16] =?UTF-8?q?=F0=9F=9A=A7=20Remove=20unused=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 181fcd7..4a782e0 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -9,7 +9,6 @@ from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.proto import VoterParams from ocrd import Processor from ocrd_modelfactory import page_from_file -from ocrd_models import ocrd_page from ocrd_models.ocrd_page import to_xml from ocrd_models.ocrd_page_generateds import TextEquivType from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE From 64794363d8145dc8137f103f5303a7cb7c1c6073 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 13:38:35 +0200 Subject: [PATCH 07/16] =?UTF-8?q?=F0=9F=9A=A7=20Use=20voter=20from=20JSON?= =?UTF-8?q?=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocr.py | 2 +- ocrd_calamari/ocrd-tool.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/ocr.py index 4a782e0..1aba218 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/ocr.py @@ -33,7 +33,7 @@ class CalamariOcr(Processor): self.predictor = MultiPredictor(checkpoints=checkpoints) voter_params = VoterParams() - voter_params.type = VoterParams.Type.Value('confidence_voter_default_ctc'.upper()) + voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper()) self.voter = voter_from_proto(voter_params) def resolve_image_as_np(self, image_url, coords): diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index a2a8c4f..ca03964 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -21,7 +21,7 @@ "XXX checkpoint": {"type": "string", "format": "file", "cacheable": true}, "processes": {"type": "number", "default": 1}, "batch_size": {"type": "number", "default": 1}, - "XXX voter": {"type": "string", "default": "confidence_voter_default_ctc"}, + "voter": {"type": "string", "default": "confidence_voter_default_ctc"}, "XXXX extended_prediction_data_format": {"type": "string", "default": "json"}, "XXX output_dir": "TODO", "XXX extended_prediction_data": "TODO" From 611371fa2c622748a4863001c75717ef722d5b6d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 13:43:18 +0200 Subject: [PATCH 08/16] =?UTF-8?q?=F0=9F=9A=A7=20Use=20correct=20URL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index ca03964..7216e87 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -1,5 +1,5 @@ { - "git_url": "https://github.com/OCR-D/ocrd_calamari", + "git_url": "https://github.com/kba/ocrd_calamari", "version": "0.0.1", "tools": { "ocrd-calamari-ocr": { diff --git a/setup.py b/setup.py index 8eb4533..f82586d 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup( long_description=codecs.open('README.md', encoding='utf-8').read(), author='Konstantin Baierer, Mike Gerber', author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de', - url='https://github.com/OCR-D/ocrd_calamari', # XXX + url='https://github.com/kba/ocrd_calamari', license='Apache License 2.0', packages=find_packages(exclude=('tests', 'docs')), install_requires=open('requirements.txt').read().split('\n'), From 319ce3a467bce5e11e97723925c00749fae3fc60 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 13:48:58 +0200 Subject: [PATCH 09/16] =?UTF-8?q?=F0=9F=9A=A7=20s/Ocr/Recognize?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/cli.py | 6 +++--- ocrd_calamari/ocrd-tool.json | 4 ++-- ocrd_calamari/{ocr.py => recognize.py} | 10 ++++------ setup.py | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) rename ocrd_calamari/{ocr.py => recognize.py} (94%) diff --git a/ocrd_calamari/cli.py b/ocrd_calamari/cli.py index 7a28dad..2a1e210 100644 --- a/ocrd_calamari/cli.py +++ b/ocrd_calamari/cli.py @@ -1,10 +1,10 @@ import click from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_calamari.ocr import CalamariOcr +from ocrd_calamari.recognize import CalamariRecognize @click.command() @ocrd_cli_options -def ocrd_calamari_ocr(*args, **kwargs): - return ocrd_cli_wrap_processor(CalamariOcr, *args, **kwargs) +def ocrd_calamari_recognize(*args, **kwargs): + return ocrd_cli_wrap_processor(CalamariRecognize, *args, **kwargs) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 7216e87..e039d8e 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -2,8 +2,8 @@ "git_url": "https://github.com/kba/ocrd_calamari", "version": "0.0.1", "tools": { - "ocrd-calamari-ocr": { - "executable": "ocrd-calamari-ocr", + "ocrd-calamari-recognize": { + "executable": "ocrd-calamari-recognize", "categories": [ "Text recognition and optimization" ], diff --git a/ocrd_calamari/ocr.py b/ocrd_calamari/recognize.py similarity index 94% rename from ocrd_calamari/ocr.py rename to ocrd_calamari/recognize.py index 1aba218..6121885 100644 --- a/ocrd_calamari/ocr.py +++ b/ocrd_calamari/recognize.py @@ -15,16 +15,14 @@ from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_P from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL -log = getLogger('processor.CalamariOcr') +log = getLogger('processor.CalamariRecognize') -# TODO: Should this be "recognize", not "ocr" akin ocrd_tesserocr? - -class CalamariOcr(Processor): +class CalamariRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr'] - super(CalamariOcr, self).__init__(*args, **kwargs) + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize'] + super(CalamariRecognize, self).__init__(*args, **kwargs) def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL diff --git a/setup.py b/setup.py index f82586d..ed24216 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( }, entry_points={ 'console_scripts': [ - 'ocrd-calamari-ocr=ocrd_calamari.cli:ocrd_calamari_ocr', + 'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize', ] }, ) From 2561b67891e12cdfacdec45d98021a1df33190e1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 14:12:10 +0200 Subject: [PATCH 10/16] =?UTF-8?q?=F0=9F=9A=A7=20Remove=20output=5Fdir=20pa?= =?UTF-8?q?rameter=20as=20we=20are=20not=20outputting=20to=20text=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index e039d8e..408abf1 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -23,7 +23,6 @@ "batch_size": {"type": "number", "default": 1}, "voter": {"type": "string", "default": "confidence_voter_default_ctc"}, "XXXX extended_prediction_data_format": {"type": "string", "default": "json"}, - "XXX output_dir": "TODO", "XXX extended_prediction_data": "TODO" } } From 0498f9551e2bbb9cd9da7319cfd92f8dae96c7d0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 16:28:08 +0200 Subject: [PATCH 11/16] =?UTF-8?q?=F0=9F=9A=A7=20Update=20higher=20TextEqui?= =?UTF-8?q?v=20levels?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/recognize.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index 6121885..f6e79f3 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -75,6 +75,8 @@ class CalamariRecognize(Processor): line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf)) + _page_update_higher_textequiv_levels('line', pcgts) + file_id = self._make_file_id(input_file, n) self.workspace.add_file( ID=file_id, @@ -83,3 +85,38 @@ class CalamariRecognize(Processor): mimetype=MIMETYPE_PAGE, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts)) + + +# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib +def _page_update_higher_textequiv_levels(level, pcgts): + """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency. + + Starting with the hierarchy level chosen for processing, + join all first TextEquiv (by the rules governing the respective level) + into TextEquiv of the next higher level, replacing them. + """ + regions = pcgts.get_Page().get_TextRegion() + if level != 'region': + for region in regions: + lines = region.get_TextLine() + if level != 'line': + for line in lines: + words = line.get_Word() + if level != 'word': + for word in words: + glyphs = word.get_Glyph() + word_unicode = u''.join(glyph.get_TextEquiv()[0].Unicode + if glyph.get_TextEquiv() + else u'' for glyph in glyphs) + word.set_TextEquiv( + [TextEquivType(Unicode=word_unicode)]) # remove old + line_unicode = u' '.join(word.get_TextEquiv()[0].Unicode + if word.get_TextEquiv() + else u'' for word in words) + line.set_TextEquiv( + [TextEquivType(Unicode=line_unicode)]) # remove old + region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode + if line.get_TextEquiv() + else u'' for line in lines) + region.set_TextEquiv( + [TextEquivType(Unicode=region_unicode)]) # remove old From c9b52b5c7c1a4466482eaec912a2223639f4192b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 16:37:30 +0200 Subject: [PATCH 12/16] =?UTF-8?q?=F0=9F=9A=A7=20Update=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 31e06d9..c5c85cd 100644 --- a/README.md +++ b/README.md @@ -1 +1,11 @@ -# ocrd_calamari \ No newline at end of file +# ocrd_calamari + +Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari). + +Introduction +------------- + +This offers a OCR-D compliant workspace processor for some of the functionality of Calamari OCR. + +This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized +image) as its input. From 8cfb075a6d75f472b790726c4fb336493f48ab62 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 16:48:13 +0200 Subject: [PATCH 13/16] =?UTF-8?q?=F0=9F=9A=A7=20Remove=20unused=20paramete?= =?UTF-8?q?rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_calamari/ocrd-tool.json | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 408abf1..449c372 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -19,11 +19,7 @@ ], "parameters": { "XXX checkpoint": {"type": "string", "format": "file", "cacheable": true}, - "processes": {"type": "number", "default": 1}, - "batch_size": {"type": "number", "default": 1}, - "voter": {"type": "string", "default": "confidence_voter_default_ctc"}, - "XXXX extended_prediction_data_format": {"type": "string", "default": "json"}, - "XXX extended_prediction_data": "TODO" + "voter": {"type": "string", "default": "confidence_voter_default_ctc"} } } } From ebf0d5364001366d3f2654c7b628cd09dc907048 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 17:26:02 +0200 Subject: [PATCH 14/16] =?UTF-8?q?=F0=9F=9A=A7=20Do=20not=20hardcode=20used?= =?UTF-8?q?=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 14 ++++++++++++++ ocrd_calamari/ocrd-tool.json | 2 +- ocrd_calamari/recognize.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c5c85cd..f76e932 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,17 @@ This offers a OCR-D compliant workspace processor for some of the functionality This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized image) as its input. + +Example Usage +------------- + +~~~ +ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI +~~~ + +With `test-parameters.json`: +~~~ +{ + "checkpoint": "/path/to/some/trained/models/*.ckpt.json" +} +~~~ diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json index 449c372..5fc5073 100644 --- a/ocrd_calamari/ocrd-tool.json +++ b/ocrd_calamari/ocrd-tool.json @@ -18,7 +18,7 @@ "OCR-D-OCR-CALAMARI" ], "parameters": { - "XXX checkpoint": {"type": "string", "format": "file", "cacheable": true}, + "checkpoint": {"type": "string", "format": "file", "cacheable": true}, "voter": {"type": "string", "default": "confidence_voter_default_ctc"} } } diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py index f6e79f3..ad5f8ad 100644 --- a/ocrd_calamari/recognize.py +++ b/ocrd_calamari/recognize.py @@ -27,7 +27,7 @@ class CalamariRecognize(Processor): def _init_calamari(self): os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL - checkpoints = glob('/home/mike/devel/experiments/train-calamari-gt4histocr/models/*.ckpt.json') # XXX + checkpoints = glob(self.parameter['checkpoint']) self.predictor = MultiPredictor(checkpoints=checkpoints) voter_params = VoterParams() From 32469108c5df8b735d94958072a1d6b80306c479 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 17:27:15 +0200 Subject: [PATCH 15/16] =?UTF-8?q?=F0=9F=93=9D=20Document=20TODOs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f76e932..0967119 100644 --- a/README.md +++ b/README.md @@ -23,3 +23,10 @@ With `test-parameters.json`: "checkpoint": "/path/to/some/trained/models/*.ckpt.json" } ~~~ + +TODO +---- + +* Support Calamari's "extended prediction data" output +* Currently, the processor only supports a prediction using confidence voting of multiple models. While this is + superior, it makes sense to support single model prediction, too. From 3278ebcac8b19b0ed23342236cb685ebd10ef7c6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 8 Aug 2019 17:59:29 +0200 Subject: [PATCH 16/16] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20wrong=20and=20red?= =?UTF-8?q?undant=20info=20from=20setup.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup.py already contains info about the installed scripts under `entry_points` → `console_scripts`. Remove the second and wrong info about installed executables. --- setup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.py b/setup.py index ed24216..4007fff 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,4 @@ # -*- coding: utf-8 -*- -""" -Installs one executable: - - - ocrd_calamari_ocr -""" import codecs from setuptools import setup, find_packages