✨ Implement the thing (#1)

✨ Implement the thing
2026-07-02 08:49:10 +02:00 · 2019-08-20 13:54:49 +02:00 · 2019-08-20 13:54:49 +02:00 · 58f2adcd1c
commit 58f2adcd1c
parent 2ebf3c0e00 3278ebcac8
8 changed files with 210 additions and 50 deletions
--- a/README.md
+++ b/README.md
@ -1 +1,32 @@
 # ocrd_calamari
 Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
 Introduction
 -------------
 This offers an OCR-D-compliant workspace processor for some of the functionality of Calamari OCR.
 This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized 
 image) as its input.
 Example Usage
 -------------
 ~~~
 ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
 ~~~
 With `test-parameters.json`:
 ~~~
 {
    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
 }
 ~~~
 TODO
 ----
 * Support Calamari's "extended prediction data" output
 * Currently, the processor only supports a prediction using confidence voting of multiple models. While this is
  superior, it makes sense to support single model prediction, too.
--- a/ocrd_calamari/cli.py
+++ b/ocrd_calamari/cli.py
@ -0,0 +1,10 @@
 import click
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_calamari.recognize import CalamariRecognize
@click.command()
@ocrd_cli_options
def ocrd_calamari_recognize(*args, **kwargs):
    # Console entry point (registered as 'ocrd-calamari-recognize' in setup.py).
    # A '#' comment is used instead of a docstring on purpose: click would
    # surface a docstring as the command's --help text.
    processor_class = CalamariRecognize
    # Forward everything the decorators collected, unchanged, to the OCR-D wrapper.
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
--- a/ocrd_calamari/config.py
+++ b/ocrd_calamari/config.py
@ -0,0 +1,5 @@
 import json
 from pkg_resources import resource_string
 # Tool description bundled with this package: the 'ocrd-tool.json' resource is
 # read as bytes, decoded as UTF-8 and parsed once, at import time.
 OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
 # Value that recognize.py exports as the TF_CPP_MIN_LOG_LEVEL environment
 # variable before building the Calamari predictor.
 TF_CPP_MIN_LOG_LEVEL = '3'  # '3' == ERROR
--- a/ocrd_calamari/ocr.py
+++ b/ocrd_calamari/ocr.py
@ -1,39 +0,0 @@
 from __future__ import absolute_import
 from calamari_ocr.scripts.predict import run
 log = getLogger('processor.KrakenOcr')
 class KrakenOcr(Processor):
     """Processor that binarizes every text line of the input PAGE files.
     Each line is cropped from the page image, binarized with
     ``kraken.binarization.nlbin`` and stored as a PNG in the output file group.
     """
     def __init__(self, *args, **kwargs):
         """Select the 'ocrd-calamari-ocr' tool description, then defer to ``Processor``."""
         kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr']
         super(KrakenOcr, self).__init__(*args, **kwargs)
     def process(self):
         """
         Performs the binarization.
         One PNG output file is added per text line of every input file.
         """
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = ocrd_page.from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             log.info("pcgts %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
                 for (line_no, line) in enumerate(textlines):
                     log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     # (A leftover debug print of dir(kraken.binarization) was removed here.)
                     bin_image = kraken.binarization.nlbin(image)
                     bin_image_bytes = io.BytesIO()
                     bin_image.save(bin_image_bytes, format='PNG')
                     # Include the region id and line index in the ID: with only the
                     # file index, all lines of one input file shared a single
                     # ID/basename and their outputs collided.
                     ID = concat_padded(self.output_file_grp, n, region.id, line_no)
                     self.add_output_file(
                         ID=ID,
                         file_grp=self.output_file_grp,
                         basename="%s.bin.png" % ID,
                         mimetype='image/png',
                         content=bin_image_bytes.getvalue()
                     )
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@ -1,24 +1,25 @@
 {
-  "git_url": "https://github.com/OCR-D/ocrd_calamari",
+  "git_url": "https://github.com/kba/ocrd_calamari",
  "version": "0.0.1",
  "tools": {
-    "ocrd-calamari-ocr": {
+    "ocrd-calamari-recognize": {
-      "executable": "ocrd-calamari-ocr",
+      "executable": "ocrd-calamari-recognize",
      "categories": [
        "Text recognition and optimization"
      ],
      "steps": [
        "recognition/text-recognition"
      ],
-      "description": "Recognize lines with kraken",
+      "description": "Recognize lines with Calamari",
      "input_file_grp": [
        "OCR-D-SEG-LINE"
      ],
      "output_file_grp": [
        "OCR-D-OCR-CALAMARI"
      ],
      "parameters": {
        "checkpoint": {"type": "string", "format": "file", "cacheable": true},
-        "processes": {"type": "number", "default": 1},
+        "voter": {"type": "string", "default": "confidence_voter_default_ctc"}
        "batch_size": {"type": "number", "default": 1},
        "voter": {"type": "string", "default": "confidence_voter_default_ctc"},
        "extended_prediction_data_format": {"type": "string", "default": "json"},
        "XXX output_dir": "TODO",
        "XXX extended_prediction_data": "TODO"
      }
    }
  }
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -0,0 +1,122 @@
 from __future__ import absolute_import
 import os
 from glob import glob
 import numpy as np
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import to_xml
 from ocrd_models.ocrd_page_generateds import TextEquivType
 from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE
 from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
 log = getLogger('processor.CalamariRecognize')
 class CalamariRecognize(Processor):
     """OCR-D processor that recognizes text lines with Calamari.
     Every text line of every input PAGE file is cropped from the page image,
     predicted by all checkpoints matched by the ``checkpoint`` parameter, and
     the per-checkpoint results are combined by the configured voter.
     """
     def __init__(self, *args, **kwargs):
         """Select the 'ocrd-calamari-recognize' tool description, then defer to ``Processor``."""
         kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
         super(CalamariRecognize, self).__init__(*args, **kwargs)
     def _init_calamari(self):
         """Create ``self.predictor`` and ``self.voter`` from the processor parameters."""
         # NOTE(review): calamari_ocr is imported at module level before this runs;
         # if that already loads TensorFlow, this setting may come too late to
         # silence its native logging -- confirm.
         os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL
         # The 'checkpoint' parameter is a glob pattern; every match becomes one voter input.
         checkpoints = glob(self.parameter['checkpoint'])
         self.predictor = MultiPredictor(checkpoints=checkpoints)
         voter_params = VoterParams()
         # The parameter holds the lower-case voter name; the enum lookup uses the upper-case name.
         voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
         self.voter = voter_from_proto(voter_params)
     def resolve_image_as_np(self, image_url, coords):
         """Return the image area described by ``coords`` as a ``uint8`` NumPy array."""
         return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8)
     def _make_file_id(self, input_file, n):
         """Derive the output file ID for ``input_file``.
         The input group name inside the input ID is replaced by the output group
         name; if that changes nothing, the ID is built from the output group
         name and the file index ``n`` instead.
         """
         file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
         if file_id == input_file.ID:
             file_id = concat_padded(self.output_file_grp, n)
         return file_id
     def process(self):
         """
         Performs the recognition.
         For each input file the recognized text and its average character
         probability are stored on every text line, the region-level text is
         rebuilt from the lines, and the resulting PAGE document is added to the
         output file group.
         """
         self._init_calamari()
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             log.info("pcgts %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                 for (line_no, line) in enumerate(textlines):
                     log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)
                     image = self.resolve_image_as_np(image_url, polygon_from_points(line.get_Coords().points))
                     # A single image is submitted, so only the first result is of interest.
                     raw_results = list(self.predictor.predict_raw([image], progress_bar=False))[0]
                     for i, p in enumerate(raw_results):
                         p.prediction.id = "fold_{}".format(i)
                     prediction = self.voter.vote_prediction_result(raw_results)
                     prediction.id = "voted"
                     line_text = prediction.sentence
                     line_conf = prediction.avg_char_probability
                     # Replace, rather than append to, any TextEquiv already on the line:
                     # _page_update_higher_textequiv_levels() reads the FIRST TextEquiv,
                     # so an appended result would be ignored in favour of stale text.
                     line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
             _page_update_higher_textequiv_levels('line', pcgts)
             file_id = self._make_file_id(input_file, n)
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
                 local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                 content=to_xml(pcgts))
 # TODO: This is a copy of ocrd_tesserocr's function and should probably be moved to an ocrd library
 def _page_update_higher_textequiv_levels(level, pcgts):
     """Propagate text from `level` upwards through the PAGE-XML hierarchy.
     For every hierarchy level above `level`, the first TextEquiv of each child
     element (empty string if it has none) is joined -- glyphs without
     separator, words with a space, lines with a newline -- and the result
     replaces the TextEquivs of the parent element.
     """
     def _first_unicode(element):
         # Text of the element's first TextEquiv, or an empty string if it has none.
         equivs = element.get_TextEquiv()
         return equivs[0].Unicode if equivs else u''
     regions = pcgts.get_Page().get_TextRegion()
     if level == 'region':
         # Nothing sits above the region level.
         return
     rebuild_lines = level != 'line'
     rebuild_words = rebuild_lines and level != 'word'
     for region in regions:
         lines = region.get_TextLine()
         if rebuild_lines:
             for line in lines:
                 words = line.get_Word()
                 if rebuild_words:
                     for word in words:
                         glyph_texts = [_first_unicode(glyph) for glyph in word.get_Glyph()]
                         # Setting the list discards the word's previous TextEquivs.
                         word.set_TextEquiv([TextEquivType(Unicode=u''.join(glyph_texts))])
                 word_texts = [_first_unicode(word) for word in words]
                 # Setting the list discards the line's previous TextEquivs.
                 line.set_TextEquiv([TextEquivType(Unicode=u' '.join(word_texts))])
         line_texts = [_first_unicode(line) for line in lines]
         # Setting the list discards the region's previous TextEquivs.
         region.set_TextEquiv([TextEquivType(Unicode=u'\n'.join(line_texts))])
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
 numpy
 calamari-ocr
 tensorflow-gpu
 click
 ocrd >= 1.0.0b11
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,25 @@
 # -*- coding: utf-8 -*-
 import codecs
 from setuptools import setup, find_packages
 # Read the packaging inputs through context managers so the file handles are
 # closed deterministically instead of being left to the garbage collector.
 with codecs.open('README.md', encoding='utf-8') as readme_file:
     long_description = readme_file.read()
 with open('requirements.txt') as requirements_file:
     # Drop blank lines (such as the one left by a trailing newline) so that no
     # empty requirement string is handed to setuptools.
     install_requires = [line.strip() for line in requirements_file if line.strip()]
 setup(
     name='ocrd_calamari',
     version='0.0.1',
     description='Calamari bindings',
     long_description=long_description,
     author='Konstantin Baierer, Mike Gerber',
     author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
     url='https://github.com/kba/ocrd_calamari',
     license='Apache License 2.0',
     packages=find_packages(exclude=('tests', 'docs')),
     install_requires=install_requires,
     # Ship the JSON/YAML resources (e.g. ocrd-tool.json) inside the package.
     package_data={
         '': ['*.json', '*.yml', '*.yaml'],
     },
     entry_points={
         'console_scripts': [
             'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize',
         ]
     },
 )