commit
58f2adcd1c
@ -1 +1,32 @@
|
|||||||
# ocrd_calamari
|
# ocrd_calamari
|
||||||
|
|
||||||
|
Recognize text using [Calamari OCR](https://github.com/Calamari-OCR/calamari).
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
-------------
|
||||||
|
|
||||||
|
This offers an OCR-D-compliant workspace processor for some of the functionality of Calamari OCR.
|
||||||
|
|
||||||
|
This processor only operates on the text line level and so needs a line segmentation (and by extension a binarized
|
||||||
|
image) as its input.
|
||||||
|
|
||||||
|
Example Usage
|
||||||
|
-------------
|
||||||
|
|
||||||
|
~~~
|
||||||
|
ocrd-calamari-recognize -p test-parameters.json -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI
|
||||||
|
~~~
|
||||||
|
|
||||||
|
With `test-parameters.json`:
|
||||||
|
~~~
|
||||||
|
{
|
||||||
|
"checkpoint": "/path/to/some/trained/models/*.ckpt.json"
|
||||||
|
}
|
||||||
|
~~~
|
||||||
|
|
||||||
|
TODO
|
||||||
|
----
|
||||||
|
|
||||||
|
* Support Calamari's "extended prediction data" output
|
||||||
|
* Currently, the processor only supports a prediction using confidence voting of multiple models. While this is
|
||||||
|
superior, it makes sense to support single model prediction, too.
|
||||||
|
@ -0,0 +1,10 @@
|
|||||||
|
import click

from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_calamari.recognize import CalamariRecognize


@click.command()
@ocrd_cli_options
def ocrd_calamari_recognize(*args, **kwargs):
    """CLI entry point for the Calamari recognition processor.

    Delegates all argument parsing and processor setup to the standard
    OCR-D CLI wrapper, which instantiates and runs CalamariRecognize.
    """
    return ocrd_cli_wrap_processor(CalamariRecognize, *args, **kwargs)
|
@ -0,0 +1,5 @@
|
|||||||
|
import json
from pkg_resources import resource_string

# Tool description shared by the processors in this package, loaded from the
# bundled ocrd-tool.json package resource.
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

# Value exported into the TF_CPP_MIN_LOG_LEVEL environment variable before
# TensorFlow is imported, to silence its C++ logging below ERROR.
TF_CPP_MIN_LOG_LEVEL = '3'  # '3' == ERROR
|
@ -1,39 +0,0 @@
|
|||||||
from __future__ import absolute_import

from calamari_ocr.scripts.predict import run

# NOTE(review): dead/leftover code (removed in this commit) — this module
# references several names that are never imported here (getLogger, Processor,
# ocrd_page, kraken, io, OCRD_TOOL, concat_padded, polygon_from_points) and
# would fail at import/run time as-is.

log = getLogger('processor.KrakenOcr')


class KrakenOcr(Processor):
    # NOTE(review): despite living in ocrd_calamari, this class wraps kraken
    # binarization, not Calamari recognition — presumably copied as a template.

    def __init__(self, *args, **kwargs):
        # Selects the 'ocrd-calamari-ocr' tool description from ocrd-tool.json.
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-ocr']
        super(KrakenOcr, self).__init__(*args, **kwargs)

    def process(self):
        """
        Performs the binarization.
        """
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            # Download and parse the PAGE XML of the input file.
            pcgts = ocrd_page.from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                    # Crop the line polygon out of the page image.
                    image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                    print(dir(kraken.binarization))  # NOTE(review): debugging leftover
                    bin_image = kraken.binarization.nlbin(image)
                    bin_image_bytes = io.BytesIO()
                    bin_image.save(bin_image_bytes, format='PNG')
                    # Derive a padded sequential file ID in the output group.
                    ID = concat_padded(self.output_file_grp, n)
                    self.add_output_file(
                        ID=ID,
                        file_grp=self.output_file_grp,
                        basename="%s.bin.png" % ID,
                        mimetype='image/png',
                        content=bin_image_bytes.getvalue()
                    )
|
@ -0,0 +1,122 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import os
|
||||||
|
from glob import glob
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from calamari_ocr.ocr import MultiPredictor
|
||||||
|
from calamari_ocr.ocr.voting import voter_from_proto
|
||||||
|
from calamari_ocr.proto import VoterParams
|
||||||
|
from ocrd import Processor
|
||||||
|
from ocrd_modelfactory import page_from_file
|
||||||
|
from ocrd_models.ocrd_page import to_xml
|
||||||
|
from ocrd_models.ocrd_page_generateds import TextEquivType
|
||||||
|
from ocrd_utils import getLogger, concat_padded, polygon_from_points, MIMETYPE_PAGE
|
||||||
|
|
||||||
|
from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL
|
||||||
|
|
||||||
|
log = getLogger('processor.CalamariRecognize')
|
||||||
|
|
||||||
|
|
||||||
|
class CalamariRecognize(Processor):
    """OCR-D processor that recognizes text lines with Calamari.

    Runs a Calamari MultiPredictor over the image of every TextLine, votes
    the per-model predictions into a single result, and writes the voted
    text (with its average character confidence) back into the PAGE XML.
    """

    def __init__(self, *args, **kwargs):
        # Select this processor's tool description from ocrd-tool.json.
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-calamari-recognize']
        super(CalamariRecognize, self).__init__(*args, **kwargs)

    def _init_calamari(self):
        # Quiet TensorFlow's C++ logging before Calamari pulls it in.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL

        # The 'checkpoint' parameter is a glob over model checkpoint files;
        # every match is loaded into one multi-model predictor.
        checkpoints = glob(self.parameter['checkpoint'])
        self.predictor = MultiPredictor(checkpoints=checkpoints)

        # Build the voter that merges the individual model predictions;
        # the 'voter' parameter names the voting strategy (case-insensitive).
        voter_params = VoterParams()
        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
        self.voter = voter_from_proto(voter_params)

    def resolve_image_as_np(self, image_url, coords):
        # Calamari expects a numpy uint8 array rather than a PIL image.
        return np.array(self.workspace.resolve_image_as_pil(image_url, coords), dtype=np.uint8)

    def _make_file_id(self, input_file, n):
        # Derive the output file ID from the input ID by swapping the file
        # group name; if that changes nothing, fall back to a padded
        # sequential ID in the output group.
        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        return file_id

    def process(self):
        """
        Performs the recognition.
        """
        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            # Download and parse the PAGE XML of the input file.
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)

                    # Crop the line polygon out of the page image.
                    image = self.resolve_image_as_np(image_url, polygon_from_points(line.get_Coords().points))

                    # One raw prediction per loaded model ("fold").
                    raw_results = list(self.predictor.predict_raw([image], progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    # Merge the fold predictions via the configured voter.
                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    line_text = prediction.sentence
                    line_conf = prediction.avg_char_probability

                    line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))

            # Propagate the new line texts up to the region level.
            _page_update_higher_textequiv_levels('line', pcgts)

            file_id = self._make_file_id(input_file, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib
|
||||||
|
def _page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv (by the rules governing the respective level)
    into TextEquiv of the next higher level, replacing them.
    """
    def _first_text(element):
        # Unicode of the element's first TextEquiv, or '' if it has none.
        equivs = element.get_TextEquiv()
        return equivs[0].Unicode if equivs else u''

    if level == 'region':
        return
    for region in pcgts.get_Page().get_TextRegion():
        lines = region.get_TextLine()
        if level != 'line':
            for line in lines:
                words = line.get_Word()
                if level != 'word':
                    for word in words:
                        glyph_text = u''.join(_first_text(g) for g in word.get_Glyph())
                        word.set_TextEquiv(
                            [TextEquivType(Unicode=glyph_text)])  # remove old
                word_text = u' '.join(_first_text(w) for w in words)
                line.set_TextEquiv(
                    [TextEquivType(Unicode=word_text)])  # remove old
        line_text = u'\n'.join(_first_text(l) for l in lines)
        region.set_TextEquiv(
            [TextEquivType(Unicode=line_text)])  # remove old
|
@ -0,0 +1,5 @@
|
|||||||
|
numpy
|
||||||
|
calamari-ocr
|
||||||
|
tensorflow-gpu
|
||||||
|
click
|
||||||
|
ocrd >= 1.0.0b11
|
@ -0,0 +1,25 @@
|
|||||||
|
# -*- coding: utf-8 -*-
"""Package configuration for ocrd_calamari."""
import codecs

from setuptools import setup, find_packages


def _read_requirements(path='requirements.txt'):
    # One requirement per line; skip blank lines so the trailing newline does
    # not inject an empty-string requirement (the previous
    # open(...).read().split('\n') did exactly that, and also leaked the
    # file handle).
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


setup(
    name='ocrd_calamari',
    version='0.0.1',
    description='Calamari bindings',
    long_description=codecs.open('README.md', encoding='utf-8').read(),
    author='Konstantin Baierer, Mike Gerber',
    author_email='unixprog@gmail.com, mike.gerber@sbb.spk-berlin.de',
    url='https://github.com/kba/ocrd_calamari',
    license='Apache License 2.0',
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=_read_requirements(),
    package_data={
        # Ship tool/config descriptions (e.g. ocrd-tool.json) with the package.
        '': ['*.json', '*.yml', '*.yaml'],
    },
    entry_points={
        'console_scripts': [
            'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize',
        ]
    },
)
|
Loading…
Reference in New Issue