mirror of
				https://github.com/mikegerber/ocrd_calamari.git
				synced 2025-11-04 01:24:14 +01:00 
			
		
		
		
	
						commit
						58f2adcd1c
					
				
					 8 changed files with 210 additions and 50 deletions
				
			
		
							
								
								
									
										31
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										31
									
								
								README.md
									
										
									
									
									
								
							| 
						 | 
					@ -1 +1,32 @@
 | 
				
			||||||
# ocrd_calamari
 | 
					# ocrd_calamari
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Introduction
 | 
				
			||||||
 | 
					-------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This offers an OCR-D compliant workspace processor for some of the functionality of Calamari OCR.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized 
 | 
				
			||||||
 | 
					image) as its input.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Example Usage
 | 
				
			||||||
 | 
					-------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					~~~
 | 
				
			||||||
 | 
					ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
 | 
				
			||||||
 | 
					~~~
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					With `test-parameters.json`:
 | 
				
			||||||
 | 
					~~~
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    "checkpoint": "/path/to/some/trained/models/*.ckpt.json"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					~~~
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TODO
 | 
				
			||||||
 | 
					----
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* Support Calamari's "extended prediction data" output
 | 
				
			||||||
 | 
					* Currently, the processor only supports a prediction using confidence voting of multiple models. While this is
 | 
				
			||||||
 | 
					  superior, it makes sense to support single model prediction, too.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										10
									
								
								ocrd_calamari/cli.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								ocrd_calamari/cli.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,10 @@
 | 
				
			||||||
 | 
					import click
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
				
			||||||
 | 
					from ocrd_calamari.recognize import CalamariRecognize
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@click.command()
					@ocrd_cli_options
					def ocrd_calamari_recognize(*args, **kwargs):
					    # Command-line entry point: forwards everything collected by the
					    # OCR-D option decorator to the CalamariRecognize processor.
					    # (Kept as a comment, not a docstring, so the command's help text
					    # is left untouched.)
					    processor_class = CalamariRecognize
					    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
 | 
				
			||||||
							
								
								
									
										5
									
								
								ocrd_calamari/config.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								ocrd_calamari/config.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,5 @@
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					from pkg_resources import resource_string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Tool description shipped with the package as 'ocrd-tool.json',
					# decoded and parsed once when this module is imported.
					OCRD_TOOL = json.loads(
					    resource_string(__name__, 'ocrd-tool.json').decode('utf8')
					)

					# Value for the TF_CPP_MIN_LOG_LEVEL environment variable.
					TF_CPP_MIN_LOG_LEVEL = '3'  # '3' == ERROR
 | 
				
			||||||
| 
						 | 
					@ -1,39 +0,0 @@
 | 
				
			||||||
from __future__ import absolute_import
 | 
					 | 
				
			||||||
from calamari_ocr.scripts.predict import run
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
log = getLogger('processor.KrakenOcr')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class KrakenOcr(Processor):
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					 | 
				
			||||||
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr']
 | 
					 | 
				
			||||||
        super(KrakenOcr, self).__init__(*args, **kwargs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def process(self):
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        Performs the binarization.
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        for (n, input_file) in enumerate(self.input_files):
 | 
					 | 
				
			||||||
            log.info("INPUT FILE %i / %s", n, input_file)
 | 
					 | 
				
			||||||
            pcgts = ocrd_page.from_file(self.workspace.download_file(input_file))
 | 
					 | 
				
			||||||
            image_url = pcgts.get_Page().imageFilename
 | 
					 | 
				
			||||||
            log.info("pcgts %s", pcgts)
 | 
					 | 
				
			||||||
            for region in pcgts.get_Page().get_TextRegion():
 | 
					 | 
				
			||||||
                textlines = region.get_TextLine()
 | 
					 | 
				
			||||||
                log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
 | 
					 | 
				
			||||||
                for (line_no, line) in enumerate(textlines):
 | 
					 | 
				
			||||||
                    log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
 | 
					 | 
				
			||||||
                    image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
 | 
					 | 
				
			||||||
                    print(dir(kraken.binarization))
 | 
					 | 
				
			||||||
                    bin_image = kraken.binarization.nlbin(image)
 | 
					 | 
				
			||||||
                    bin_image_bytes = io.BytesIO()
 | 
					 | 
				
			||||||
                    bin_image.save(bin_image_bytes, format='PNG')
 | 
					 | 
				
			||||||
                    ID = concat_padded(self.output_file_grp, n)
 | 
					 | 
				
			||||||
                    self.add_output_file(
 | 
					 | 
				
			||||||
                        ID=ID,
 | 
					 | 
				
			||||||
                        file_grp=self.output_file_grp,
 | 
					 | 
				
			||||||
                        basename="%s.bin.png" % ID,
 | 
					 | 
				
			||||||
                        mimetype='image/png',
 | 
					 | 
				
			||||||
                        content=bin_image_bytes.getvalue()
 | 
					 | 
				
			||||||
                    )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,24 +1,25 @@
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  "git_url": "https://github.com/OCR-D/ocrd_calamari",
 | 
					  "git_url": "https://github.com/kba/ocrd_calamari",
 | 
				
			||||||
  "version": "0.0.1",
 | 
					  "version": "0.0.1",
 | 
				
			||||||
  "tools": {
 | 
					  "tools": {
 | 
				
			||||||
    "ocrd-calamari-ocr": {
 | 
					    "ocrd-calamari-recognize": {
 | 
				
			||||||
      "executable": "ocrd-calamari-ocr",
 | 
					      "executable": "ocrd-calamari-recognize",
 | 
				
			||||||
      "categories": [
 | 
					      "categories": [
 | 
				
			||||||
        "Text recognition and optimization"
 | 
					        "Text recognition and optimization"
 | 
				
			||||||
      ],
 | 
					      ],
 | 
				
			||||||
      "steps": [
 | 
					      "steps": [
 | 
				
			||||||
        "recognition/text-recognition"
 | 
					        "recognition/text-recognition"
 | 
				
			||||||
      ],
 | 
					      ],
 | 
				
			||||||
      "description": "Recognize lines with kraken",
 | 
					      "description": "Recognize lines with Calamari",
 | 
				
			||||||
 | 
					      "input_file_grp": [
 | 
				
			||||||
 | 
					        "OCR-D-SEG-LINE"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "output_file_grp": [
 | 
				
			||||||
 | 
					        "OCR-D-OCR-CALAMARI"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
      "parameters": {
 | 
					      "parameters": {
 | 
				
			||||||
        "checkpoint": {"type": "string", "format": "file", "cacheable": true},
 | 
					        "checkpoint": {"type": "string", "format": "file", "cacheable": true},
 | 
				
			||||||
        "processes": {"type": "number", "default": 1},
 | 
					        "voter": {"type": "string", "default": "confidence_voter_default_ctc"}
 | 
				
			||||||
        "batch_size": {"type": "number", "default": 1},
 | 
					 | 
				
			||||||
        "voter": {"type": "string", "default": "confidence_voter_default_ctc"},
 | 
					 | 
				
			||||||
        "extended_prediction_data_format": {"type": "string", "default": "json"},
 | 
					 | 
				
			||||||
        "XXX output_dir": "TODO",
 | 
					 | 
				
			||||||
        "XXX extended_prediction_data": "TODO"
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										122
									
								
								ocrd_calamari/recognize.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								ocrd_calamari/recognize.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,122 @@
 | 
				
			||||||
 | 
					from __future__ import absolute_import
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					from glob import glob
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from calamari_ocr.ocr import MultiPredictor
 | 
				
			||||||
 | 
					from calamari_ocr.ocr.voting import voter_from_proto
 | 
				
			||||||
 | 
					from calamari_ocr.proto import VoterParams
 | 
				
			||||||
 | 
					from ocrd import Processor
 | 
				
			||||||
 | 
					from ocrd_modelfactory import page_from_file
 | 
				
			||||||
 | 
					from ocrd_models.ocrd_page import to_xml
 | 
				
			||||||
 | 
					from ocrd_models.ocrd_page_generateds import TextEquivType
 | 
				
			||||||
 | 
					from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					log = getLogger('processor.CalamariRecognize')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class CalamariRecognize(Processor):
					    """OCR-D processor that recognizes text lines with Calamari OCR.

					    All checkpoints matched by the ``checkpoint`` parameter are loaded into
					    one ``MultiPredictor``; the per-model results for a line are combined
					    by the voter selected through the ``voter`` parameter.
					    """

					    def __init__(self, *args, **kwargs):
					        # Always run with this tool's own section of ocrd-tool.json.
					        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
					        super(CalamariRecognize, self).__init__(*args, **kwargs)

					    def _init_calamari(self):
					        """Create ``self.predictor`` and ``self.voter`` from the parameters."""
					        # Export the TensorFlow C++ log level configured in ocrd_calamari.config.
					        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL

					        # The parameter is a glob pattern and may match several models.
					        # glob() yields matches in arbitrary order, so sort them to keep
					        # the model order (and thus the "fold_<i>" numbering) reproducible.
					        checkpoints = sorted(glob(self.parameter['checkpoint']))
					        self.predictor = MultiPredictor(checkpoints=checkpoints)

					        # The voter name is given in lower case; the enum lookup is done
					        # on its upper-cased form.
					        voter_params = VoterParams()
					        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
					        self.voter = voter_from_proto(voter_params)

					    def resolve_image_as_np(self, image_url, coords):
					        """Return the image resolved by the workspace as a ``uint8`` array.

					        ``image_url`` and ``coords`` are passed unchanged to
					        ``self.workspace.resolve_image_as_pil``.
					        """
					        return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8)

					    def _make_file_id(self, input_file, n):
					        """Derive the ID of the output file belonging to ``input_file``.

					        The input file group name inside the input ID is replaced by the
					        output file group name. If that leaves the ID unchanged, the ID is
					        built with ``concat_padded`` from the output file group and ``n``.
					        """
					        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
					        if file_id == input_file.ID:
					            file_id = concat_padded(self.output_file_grp, n)
					        return file_id

					    def process(self):
					        """
					        Performs the recognition.

					        For every input PAGE file, each text line of each text region is
					        recognized by all loaded models, the results are voted into one
					        prediction, and the line's text result is set to it. The region
					        texts are then rebuilt from the lines and the PAGE document is
					        added to the output file group.
					        """

					        self._init_calamari()

					        for (n, input_file) in enumerate(self.input_files):
					            log.info("INPUT FILE %i / %s", n, input_file)
					            pcgts = page_from_file(self.workspace.download_file(input_file))
					            image_url = pcgts.get_Page().imageFilename
					            log.info("pcgts %s", pcgts)
					            for region in pcgts.get_Page().get_TextRegion():
					                textlines = region.get_TextLine()
					                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
					                for (line_no, line) in enumerate(textlines):
					                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)

					                    image = self.resolve_image_as_np(image_url, polygon_from_points(line.get_Coords().points))

					                    # One image goes in, so only the first result is used;
					                    # it is iterated below as one prediction per model.
					                    raw_results = list(self.predictor.predict_raw([image], progress_bar=False))[0]
					                    for i, p in enumerate(raw_results):
					                        p.prediction.id = "fold_{}".format(i)

					                    prediction = self.voter.vote_prediction_result(raw_results)
					                    prediction.id = "voted"

					                    line_text = prediction.sentence
					                    line_conf = prediction.avg_char_probability

					                    # Replace instead of append:
					                    # _page_update_higher_textequiv_levels() reads only the
					                    # first TextEquiv of a line, so a result already present
					                    # in the input would otherwise shadow the new one when
					                    # the region text is rebuilt.
					                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

					            _page_update_higher_textequiv_levels('line', pcgts)

					            file_id = self._make_file_id(input_file, n)
					            self.workspace.add_file(
					                ID=file_id,
					                file_grp=self.output_file_grp,
					                pageId=input_file.pageId,
					                mimetype=MIMETYPE_PAGE,
					                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
					                content=to_xml(pcgts))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to an ocrd lib
 | 
				
			||||||
 | 
					def _page_update_higher_textequiv_levels(level, pcgts):
					    """Make the TextEquivs above `level` consistent with those at `level`.

					    Beginning at the processed hierarchy level, the first TextEquiv of each
					    element is joined into a single new TextEquiv on its parent, which
					    replaces whatever the parent had: glyphs are concatenated into words,
					    words are joined by spaces into lines, and lines are joined by newlines
					    into regions. An element without any TextEquiv contributes an empty
					    string. Nothing is changed when `level` is 'region'.
					    """

					    def first_unicode(element):
					        # Text of the element's first TextEquiv, or u'' if it has none.
					        equivs = element.get_TextEquiv()
					        return equivs[0].Unicode if equivs else u''

					    regions = pcgts.get_Page().get_TextRegion()
					    if level == 'region':
					        return

					    for region in regions:
					        lines = region.get_TextLine()
					        if level != 'line':
					            for line in lines:
					                words = line.get_Word()
					                if level != 'word':
					                    for word in words:
					                        word_text = u''.join(
					                            first_unicode(glyph) for glyph in word.get_Glyph())
					                        # Setting a one-element list drops the old TextEquivs.
					                        word.set_TextEquiv([TextEquivType(Unicode=word_text)])
					                line_text = u' '.join(first_unicode(word) for word in words)
					                line.set_TextEquiv([TextEquivType(Unicode=line_text)])
					        region_text = u'\n'.join(first_unicode(line) for line in lines)
					        region.set_TextEquiv([TextEquivType(Unicode=region_text)])
 | 
				
			||||||
							
								
								
									
										5
									
								
								requirements.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								requirements.txt
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,5 @@
 | 
				
			||||||
 | 
					numpy
 | 
				
			||||||
 | 
					calamari-ocr
 | 
				
			||||||
 | 
					tensorflow-gpu
 | 
				
			||||||
 | 
					click
 | 
				
			||||||
 | 
					ocrd >= 1.0.0b11
 | 
				
			||||||
							
								
								
									
										25
									
								
								setup.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								setup.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,25 @@
 | 
				
			||||||
 | 
					# -*- coding: utf-8 -*-
 | 
				
			||||||
 | 
					import codecs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from setuptools import setup, find_packages
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Read both files through context managers so their handles are closed
					# right away instead of being left open until garbage collection.
					with codecs.open('README.md', encoding='utf-8') as readme_file:
					    long_description = readme_file.read()

					# One requirement per line. Blank lines (such as the one a trailing
					# newline produces) are dropped so no empty requirement is passed on.
					with open('requirements.txt') as requirements_file:
					    install_requires = [line.strip() for line in requirements_file if line.strip()]

					setup(
					    name='ocrd_calamari',
					    version='0.0.1',
					    description='Calamari bindings',
					    long_description=long_description,
					    author='Konstantin Baierer, Mike Gerber',
					    author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
					    url='https://github.com/kba/ocrd_calamari',
					    license='Apache License 2.0',
					    packages=find_packages(exclude=('tests', 'docs')),
					    install_requires=install_requires,
					    # Ship the JSON/YAML files (e.g. ocrd-tool.json) inside the packages.
					    package_data={
					        '': ['*.json', '*.yml', '*.yaml'],
					    },
					    entry_points={
					        'console_scripts': [
					            'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize',
					        ]
					    },
					)
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue