✨ Implement the thing (#1)

✨ Implement the thing
2026-02-05 17:11:56 +01:00 · 2019-08-20 13:54:49 +02:00 · 2019-08-20 13:54:49 +02:00 · 58f2adcd1c
commit 58f2adcd1c
parent 2ebf3c0e00 3278ebcac8
8 changed files with 210 additions and 50 deletions
--- a/README.md
+++ b/README.md
@ -1 +1,32 @@
-# ocrd_calamari
+# ocrd_calamari
+
+Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
+
+Introduction
+-------------
+
+This offers an OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
+
+This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized 
+image) as its input.
+
+Example Usage
+-------------
+
+~~~
+ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
+~~~
+
+With `test-parameters.json`:
+~~~
+{
+    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
+}
+~~~
+
+TODO
+----
+
+* Support Calamari's "extended prediction data" output
+* Currently, the processor only supports a prediction using confidence voting of multiple models. While this is
+  superior, it makes sense to support single model prediction, too.
--- a/ocrd_calamari/cli.py
+++ b/ocrd_calamari/cli.py
@ -0,0 +1,10 @@
+import click
+
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+from ocrd_calamari.recognize import CalamariRecognize
+
+
+# Command line entry point for the Calamari recognition processor.
+# All positional and keyword arguments that click and ocrd_cli_options collect
+# are forwarded unchanged to ocrd_cli_wrap_processor together with the
+# CalamariRecognize class; its return value is returned as-is.
+# (A `#` comment is used on purpose: click would show a docstring as help text.)
+@click.command()
+@ocrd_cli_options
+def ocrd_calamari_recognize(*args, **kwargs):
+    return ocrd_cli_wrap_processor(CalamariRecognize, *args, **kwargs)
--- a/ocrd_calamari/config.py
+++ b/ocrd_calamari/config.py
@ -0,0 +1,5 @@
+import json
+from pkg_resources import resource_string
+
+# Tool description, parsed once at import time from the 'ocrd-tool.json'
+# resource that pkg_resources resolves relative to this module.
+OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+# Log level constant, kept as a string.
+# NOTE(review): presumably meant for the TF_CPP_MIN_LOG_LEVEL environment
+# variable of TensorFlow - confirm the exact meaning of '3' against its docs.
+TF_CPP_MIN_LOG_LEVEL = '3'  # '3' == ERROR
--- a/ocrd_calamari/ocr.py
+++ b/ocrd_calamari/ocr.py
@ -1,39 +0,0 @@
-from __future__ import absolute_import
-from calamari_ocr.scripts.predict import run
-
-log = getLogger('processor.KrakenOcr')
-
-class KrakenOcr(Processor):
-
-    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr']
-        super(KrakenOcr, self).__init__(*args, **kwargs)
-
-    def process(self):
-        """
-        Performs the binarization.
-        """
-        for (n, input_file) in enumerate(self.input_files):
-            log.info("INPUT FILE %i / %s", n, input_file)
-            pcgts = ocrd_page.from_file(self.workspace.download_file(input_file))
-            image_url = pcgts.get_Page().imageFilename
-            log.info("pcgts %s", pcgts)
-            for region in pcgts.get_Page().get_TextRegion():
-                textlines = region.get_TextLine()
-                log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
-                for (line_no, line) in enumerate(textlines):
-                    log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
-                    image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
-                    print(dir(kraken.binarization))
-                    bin_image = kraken.binarization.nlbin(image)
-                    bin_image_bytes = io.BytesIO()
-                    bin_image.save(bin_image_bytes, format='PNG')
-                    ID = concat_padded(self.output_file_grp, n)
-                    self.add_output_file(
-                        ID=ID,
-                        file_grp=self.output_file_grp,
-                        basename="%s.bin.png" % ID,
-                        mimetype='image/png',
-                        content=bin_image_bytes.getvalue()
-                    )
-
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@ -1,24 +1,25 @@
 {
-  "git_url": "https://github.com/OCR-D/ocrd_calamari",
+  "git_url": "https://github.com/kba/ocrd_calamari",
  "version": "0.0.1",
  "tools": {
-    "ocrd-calamari-ocr": {
-      "executable": "ocrd-calamari-ocr",
+    "ocrd-calamari-recognize": {
+      "executable": "ocrd-calamari-recognize",
      "categories": [
        "Text recognition and optimization"
      ],
      "steps": [
        "recognition/text-recognition"
      ],
-      "description": "Recognize lines with kraken",
+      "description": "Recognize lines with Calamari",
+      "input_file_grp": [
+        "OCR-D-SEG-LINE"
+      ],
+      "output_file_grp": [
+        "OCR-D-OCR-CALAMARI"
+      ],
      "parameters": {
        "checkpoint": {"type": "string", "format": "file", "cacheable": true},
-        "processes": {"type": "number", "default": 1},
-        "batch_size": {"type": "number", "default": 1},
-        "voter": {"type": "string", "default": "confidence_voter_default_ctc"},
-        "extended_prediction_data_format": {"type": "string", "default": "json"},
-        "XXX output_dir": "TODO",
-        "XXX extended_prediction_data": "TODO"
+        "voter": {"type": "string", "default": "confidence_voter_default_ctc"}
      }
    }
  }
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -0,0 +1,122 @@
+from __future__ import absolute_import
+
+import os
+from glob import glob
+
+import numpy as np
+from calamari_ocr.ocr import MultiPredictor
+from calamari_ocr.ocr.voting import voter_from_proto
+from calamari_ocr.proto import VoterParams
+from ocrd import Processor
+from ocrd_modelfactory import page_from_file
+from ocrd_models.ocrd_page import to_xml
+from ocrd_models.ocrd_page_generateds import TextEquivType
+from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE
+
+from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
+
+log = getLogger('processor.CalamariRecognize')
+
+
+class CalamariRecognize(Processor):
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
+        super(CalamariRecognize, self).__init__(*args, **kwargs)
+
+    def _init_calamari(self):
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
+
+        checkpoints = glob(self.parameter['checkpoint'])
+        self.predictor = MultiPredictor(checkpoints=checkpoints)
+
+        voter_params = VoterParams()
+        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
+        self.voter = voter_from_proto(voter_params)
+
+    def resolve_image_as_np(self, image_url, coords):
+        return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8)
+
+    def _make_file_id(self, input_file, n):
+        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
+        if file_id == input_file.ID:
+            file_id = concat_padded(self.output_file_grp, n)
+        return file_id
+
+    def process(self):
+        """
+        Performs the recognition.
+        """
+
+        self._init_calamari()
+
+        for (n, input_file) in enumerate(self.input_files):
+            log.info("INPUT FILE %i / %s", n, input_file)
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            image_url = pcgts.get_Page().imageFilename
+            log.info("pcgts %s", pcgts)
+            for region in pcgts.get_Page().get_TextRegion():
+                textlines = region.get_TextLine()
+                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
+                for (line_no, line) in enumerate(textlines):
+                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)
+
+                    image = self.resolve_image_as_np(image_url, polygon_from_points(line.get_Coords().points))
+
+                    raw_results = list(self.predictor.predict_raw([image], progress_bar=False))[0]
+                    for i, p in enumerate(raw_results):
+                        p.prediction.id = "fold_{}".format(i)
+
+                    prediction = self.voter.vote_prediction_result(raw_results)
+                    prediction.id = "voted"
+
+                    line_text = prediction.sentence
+                    line_conf = prediction.avg_char_probability
+
+                    line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
+
+            _page_update_higher_textequiv_levels('line', pcgts)
+
+            file_id = self._make_file_id(input_file, n)
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=input_file.pageId,
+                mimetype=MIMETYPE_PAGE,
+                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
+                content=to_xml(pcgts))
+
+
+# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib
+def _page_update_higher_textequiv_levels(level, pcgts):
+    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.
+
+    Starting with the hierarchy level chosen for processing,
+    join all first TextEquiv (by the rules governing the respective level)
+    into TextEquiv of the next higher level, replacing them.
+    """
+    regions = pcgts.get_Page().get_TextRegion()
+    if level != 'region':
+        for region in regions:
+            lines = region.get_TextLine()
+            if level != 'line':
+                for line in lines:
+                    words = line.get_Word()
+                    if level != 'word':
+                        for word in words:
+                            glyphs = word.get_Glyph()
+                            word_unicode = u''.join(glyph.get_TextEquiv()[0].Unicode
+                                                    if glyph.get_TextEquiv()
+                                                    else u'' for glyph in glyphs)
+                            word.set_TextEquiv(
+                                [TextEquivType(Unicode=word_unicode)])  # remove old
+                    line_unicode = u' '.join(word.get_TextEquiv()[0].Unicode
+                                             if word.get_TextEquiv()
+                                             else u'' for word in words)
+                    line.set_TextEquiv(
+                        [TextEquivType(Unicode=line_unicode)])  # remove old
+            region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode
+                                        if line.get_TextEquiv()
+                                        else u'' for line in lines)
+            region.set_TextEquiv(
+                [TextEquivType(Unicode=region_unicode)])  # remove old
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+numpy
+calamari-ocr
+tensorflow-gpu
+click
+ocrd >= 1.0.0b11
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+import codecs
+
+from setuptools import setup, find_packages
+
+setup(
+    name='ocrd_calamari',
+    version='0.0.1',
+    description='Calamari bindings',
+    long_description=codecs.open('README.md', encoding='utf-8').read(),
+    author='Konstantin Baierer, Mike Gerber',
+    author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
+    url='https://github.com/kba/ocrd_calamari',
+    license='Apache License 2.0',
+    packages=find_packages(exclude=('tests', 'docs')),
+    install_requires=open('requirements.txt').read().split('\n'),
+    package_data={
+        '': ['*.json', '*.yml', '*.yaml'],
+    },
+    entry_points={
+        'console_scripts': [
+            'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize',
+        ]
+    },
+)