ocrd_calamari/ocrd_calamari/recognize.py

from __future__ import absolute_import

import os
from glob import glob

import numpy as np
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
        LabelType, LabelsType,
        MetadataItemType,
        TextEquivType,
        WordType, GlyphType, CoordsType,
        to_xml
)
from ocrd_utils import (
        getLogger, concat_padded,
        coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,
        MIMETYPE_PAGE
)

from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL

TOOL = 'ocrd-calamari-recognize'
log = getLogger('processor.CalamariRecognize')


class CalamariRecognize(Processor):

    def __init__(self, *args, **kwargs):
        """
        Construct the processor.

        The tool description and the version are always taken from ``OCRD_TOOL`` and override any ``ocrd_tool`` /
        ``version`` keyword arguments given by the caller; everything else is passed on to the base class unchanged.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(CalamariRecognize, self).__init__(*args, **kwargs)

    def _init_calamari(self):
        """
        Set up the Calamari predictor and voter from the processor parameters.

        The ``checkpoint`` parameter is expanded as a glob pattern and every matching file is handed to a
        ``MultiPredictor``. The ``voter`` parameter is upper-cased and looked up in ``VoterParams.Type`` to select
        the voter. The results are stored in ``self.predictor`` and ``self.voter``.

        Raises:
            ValueError: if the ``checkpoint`` pattern does not match any file.
        """
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL

        checkpoint_pattern = self.parameter['checkpoint']
        # glob() returns its matches in arbitrary order; sort them so the models are always passed in the same order.
        checkpoints = sorted(glob(checkpoint_pattern))
        if not checkpoints:
            # Fail early and name the offending pattern instead of passing an empty model list on to Calamari.
            raise ValueError("No Calamari checkpoints found matching '%s'" % checkpoint_pattern)
        self.predictor = MultiPredictor(checkpoints=checkpoints)

        voter_params = VoterParams()
        voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())
        self.voter = voter_from_proto(voter_params)

    def _make_file_id(self, input_file, n):
        """
        Derive the ID of the output file from the ID of `input_file`.

        The input file group name is substituted by the output file group name inside the input ID. If that
        substitution changes nothing, the ID is built from the output file group name and the number `n` instead.
        """
        source_id = input_file.ID
        candidate = source_id.replace(self.input_file_grp, self.output_file_grp)
        if candidate != source_id:
            return candidate
        return concat_padded(self.output_file_grp, n)

    def process(self):
        """
        Performs the recognition.

        For every input file, the PAGE document is loaded and each text line of each text region is recognized
        with the Calamari predictor and voter. Existing text results and word segmentation of a line are replaced.
        Afterwards the region texts are updated from the line texts, a metadata item describing this processing
        step and its parameters is added, and the result is added to the workspace as a PAGE-XML file in the
        output file group.
        """

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            # The third return value (image info) is not needed here.
            page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id)

            for region in page.get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)
                    self._recognize_line(line, region_image, region_xywh)

            _page_update_higher_textequiv_levels('line', pcgts)

            self._add_processing_metadata(pcgts)

            file_id = self._make_file_id(input_file, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))

    def _recognize_line(self, line, region_image, region_xywh):
        """Recognize one text line and store the result (and, if configured, words and glyphs) in `line`."""
        line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh)
        line_image_np = np.array(line_image, dtype=np.uint8)

        # Only one image is passed in, so only the first (and only) result is of interest.
        raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
        for fold_no, fold_result in enumerate(raw_results):
            fold_result.prediction.id = "fold_{}".format(fold_no)

        prediction = self.voter.vote_prediction_result(raw_results)
        prediction.id = "voted"

        # Delete existing results
        if line.get_TextEquiv():
            log.warning("Line '%s' already contained text results", line.id)
        line.set_TextEquiv([])
        if line.get_Word():
            log.warning("Line '%s' already contained word segmentation", line.id)
        line.set_Word([])

        # Save line results
        line.set_TextEquiv([TextEquivType(Unicode=prediction.sentence, conf=prediction.avg_char_probability)])

        # Save word (and glyph) results only if requested
        if self.parameter['textequiv_level'] in ['word', 'glyph']:
            self._add_words(line, prediction, line_image.height, line_coords)

    def _add_words(self, line, prediction, line_height, line_coords):
        """Add a Word (plus Glyphs at level 'glyph') to `line` for every non-space run of the predicted text.

        Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
        and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
        hierarchy of lines > words > glyphs.
        """
        with_glyphs = self.parameter['textequiv_level'] == 'glyph'
        word_no = 0
        offset = 0  # index of the current run's first character in prediction.positions

        for word_text in self._split_words(prediction.sentence):
            word_length = len(word_text)
            # Runs of spaces only advance the offset; they do not become words.
            if any(c != ' ' for c in word_text):
                word_positions = prediction.positions[offset:offset + word_length]
                points = self._segment_points(
                    word_positions[0].global_start, word_positions[-1].global_end, line_height, line_coords)
                # XXX Crop to line polygon?

                word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
                word.add_TextEquiv(TextEquivType(Unicode=word_text))

                if with_glyphs:
                    for glyph_no, position in enumerate(word_positions):
                        word.add_Glyph(self._make_glyph(word.id, glyph_no, position, line_height, line_coords))

                line.add_Word(word)
                word_no += 1

            offset += word_length

    def _make_glyph(self, word_id, glyph_no, position, line_height, line_coords):
        """Build the Glyph for one character position, with its character alternatives as TextEquivs."""
        points = self._segment_points(position.global_start, position.global_end, line_height, line_coords)
        glyph = GlyphType(id='%s_glyph%04d' % (word_id, glyph_no), Coords=CoordsType(points))

        # Most probable alternative first
        chars = sorted(position.chars, key=lambda k: k.probability, reverse=True)
        char_index = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
        for char in chars:
            # Alternatives with an empty character are left out and do not consume an index.
            if char.char:
                glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
                char_index += 1
                # XXX Note that omission probabilities are not normalized?!
        return glyph

    @staticmethod
    def _segment_points(x_start, x_end, line_height, line_coords):
        """Return the PAGE points string for the full-height box from `x_start` to `x_end` in the line image."""
        polygon = polygon_from_x0y0x1y1([x_start, 0, x_end, line_height])
        return points_from_polygon(coordinates_for_segment(polygon, None, line_coords))

    @staticmethod
    def _split_words(s):
        """Split words based on spaces and include spaces as 'words'

        Yields the maximal runs of space / non-space characters of `s` in order, so the lengths of the yielded
        strings add up to ``len(s)``. Nothing is yielded for an empty string.
        """
        spaces = None  # whether the current run consists of spaces; None before the first character
        word = ''
        for c in s:
            is_space = (c == ' ')
            if is_space == spaces:
                word += c
            else:
                if word:
                    yield word
                word = c
                spaces = is_space
        if word:
            yield word

    def _add_processing_metadata(self, pcgts):
        """Add metadata about this operation and its runtime parameters to `pcgts`."""
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name, value=self.parameter[name])
                                        for name in self.parameter.keys()])]))


# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to an ocrd lib
def _page_update_higher_textequiv_levels(level, pcgts):
    """Make the text of all PAGE-XML hierarchy levels above `level` consistent with `level`.

    `level` is one of 'region', 'line', 'word' or 'glyph'. Working upwards from that level, the first TextEquiv of
    each child element is concatenated into a single new TextEquiv of its parent, which replaces the parent's
    existing TextEquivs: glyphs are joined without separator into words, words with a space into lines, and
    lines with a newline into regions. Children without any TextEquiv contribute an empty string.
    """
    def _first_text(segment):
        # Text of the first TextEquiv, or an empty string if the segment has none
        equivs = segment.get_TextEquiv()
        return equivs[0].Unicode if equivs else u''

    def _replace_text(segment, text):
        # Replaces (i.e. removes) all old TextEquivs of the segment
        segment.set_TextEquiv([TextEquivType(Unicode=text)])

    text_regions = pcgts.get_Page().get_TextRegion()
    if level == 'region':
        # Nothing above the region level carries text
        return

    rebuild_lines = level != 'line'
    rebuild_words = rebuild_lines and level != 'word'

    for text_region in text_regions:
        text_lines = text_region.get_TextLine()
        if rebuild_lines:
            for text_line in text_lines:
                line_words = text_line.get_Word()
                if rebuild_words:
                    for line_word in line_words:
                        _replace_text(line_word, u''.join(_first_text(glyph) for glyph in line_word.get_Glyph()))
                _replace_text(text_line, u' '.join(_first_text(line_word) for line_word in line_words))
        _replace_text(text_region, u'\n'.join(_first_text(text_line) for text_line in text_lines))

# vim:tw=120:
. 2018-07-26 19:09:07 +02:00			`from __future__ import absolute_import`

🚧 Save results in the workspace 2019-08-08 11:12:23 +02:00			`import os`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`from glob import glob`
. 2018-07-26 19:09:07 +02:00
🚧 Process lines 2019-08-08 10:41:55 +02:00			`import numpy as np`
			`from calamari_ocr.ocr import MultiPredictor`
			`from calamari_ocr.ocr.voting import voter_from_proto`
			`from calamari_ocr.proto import VoterParams`
🚧 Update to ocrd >= 1.0.0b11 2019-08-08 12:31:46 +02:00			`from ocrd import Processor`
			`from ocrd_modelfactory import page_from_file`
✨ Add metadata about the recognition operation w/ parameter info 2019-12-19 16:24:34 +01:00			`from ocrd_models.ocrd_page import (`
			`LabelType, LabelsType,`
			`MetadataItemType,`
			`TextEquivType,`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00			`WordType, GlyphType, CoordsType,`
✨ Add metadata about the recognition operation w/ parameter info 2019-12-19 16:24:34 +01:00			`to_xml`
			`)`
🎨 Use polygon_from_x0y0x1y1 to build word/glyph polygon 2020-02-03 14:03:01 +01:00			`from ocrd_utils import (`
			`getLogger, concat_padded,`
			`coordinates_for_segment, points_from_polygon, polygon_from_x0y0x1y1,`
			`MIMETYPE_PAGE`
			`)`
🚧 Process lines 2019-08-08 10:41:55 +02:00
🚧 Set TensorFlow minimum log level 2019-08-08 12:50:11 +02:00			`from ocrd_calamari.config import OCRD_TOOL, TF_CPP_MIN_LOG_LEVEL`
🚧 Process lines 2019-08-08 10:41:55 +02:00
🎨 Use TOOL constant convention from the other OCR-D processors 2019-12-19 16:23:16 +01:00			`TOOL = 'ocrd-calamari-recognize'`
🚧 s/Ocr/Recognize 2019-08-08 13:48:58 +02:00			`log = getLogger('processor.CalamariRecognize')`
🚧 Process lines 2019-08-08 10:41:55 +02:00

🚧 s/Ocr/Recognize 2019-08-08 13:48:58 +02:00			`class CalamariRecognize(Processor):`
. 2018-07-26 19:09:07 +02:00
			`def __init__(self, *args, **kwargs):`
🎨 Use TOOL constant convention from the other OCR-D processors 2019-12-19 16:23:16 +01:00			`kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]`
pass version to processor base constructor, fix #14 2019-12-02 18:02:01 +01:00			`kwargs['version'] = OCRD_TOOL['version']`
🚧 s/Ocr/Recognize 2019-08-08 13:48:58 +02:00			`super(CalamariRecognize, self).__init__(*args, **kwargs)`
🚧 Process lines 2019-08-08 10:41:55 +02:00
			`def _init_calamari(self):`
🚧 Set TensorFlow minimum log level 2019-08-08 12:50:11 +02:00			`os.environ['TF_CPP_MIN_LOG_LEVEL'] = TF_CPP_MIN_LOG_LEVEL`

🚧 Do not hardcode used models 2019-08-08 17:26:02 +02:00			`checkpoints = glob(self.parameter['checkpoint'])`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`self.predictor = MultiPredictor(checkpoints=checkpoints)`

			`voter_params = VoterParams()`
🚧 Use voter from JSON config 2019-08-08 13:38:35 +02:00			`voter_params.type = VoterParams.Type.Value(self.parameter['voter'].upper())`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`self.voter = voter_from_proto(voter_params)`

🚧 Save results in the workspace 2019-08-08 11:12:23 +02:00			`def _make_file_id(self, input_file, n):`
			`file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)`
			`if file_id == input_file.ID:`
			`file_id = concat_padded(self.output_file_grp, n)`
			`return file_id`

. 2018-07-26 19:09:07 +02:00			`def process(self):`
			`"""`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`Performs the recognition.`
. 2018-07-26 19:09:07 +02:00			`"""`
🚧 Process lines 2019-08-08 10:41:55 +02:00
			`self._init_calamari()`

. 2018-07-26 19:09:07 +02:00			`for (n, input_file) in enumerate(self.input_files):`
⬆ Use image_from_segment instead of deprecated resolve_image_as_pil 2019-09-27 14:10:19 +02:00			`page_id = input_file.pageId or input_file.ID`
			`log.info("INPUT FILE %i / %s", n, page_id)`
🚧 Update to ocrd >= 1.0.0b11 2019-08-08 12:31:46 +02:00			`pcgts = page_from_file(self.workspace.download_file(input_file))`
⬆ Use image_from_segment instead of deprecated resolve_image_as_pil 2019-09-27 14:10:19 +02:00
			`page = pcgts.get_Page()`
			`page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id)`

. 2018-07-26 19:09:07 +02:00			`for region in pcgts.get_Page().get_TextRegion():`
⬆ Use image_from_segment instead of deprecated resolve_image_as_pil 2019-09-27 14:10:19 +02:00			`region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)`

. 2018-07-26 19:09:07 +02:00			`textlines = region.get_TextLine()`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)`
. 2018-07-26 19:09:07 +02:00			`for (line_no, line) in enumerate(textlines):`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)`

🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00			`line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh)`
⬆ Use image_from_segment instead of deprecated resolve_image_as_pil 2019-09-27 14:10:19 +02:00			`line_image_np = np.array(line_image, dtype=np.uint8)`
🚧 Process lines 2019-08-08 10:41:55 +02:00
⬆ Use image_from_segment instead of deprecated resolve_image_as_pil 2019-09-27 14:10:19 +02:00			`raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]`
🚧 Process lines 2019-08-08 10:41:55 +02:00			`for i, p in enumerate(raw_results):`
			`p.prediction.id = "fold_{}".format(i)`

			`prediction = self.voter.vote_prediction_result(raw_results)`
			`prediction.id = "voted"`
. 2018-07-26 19:09:07 +02:00
🚧 Save results in the workspace 2019-08-08 11:12:23 +02:00			`line_text = prediction.sentence`
			`line_conf = prediction.avg_char_probability`

🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00			`# Delete existing results`
remove existing annotation below the line level to avoid inconsistency 2019-11-06 00:46:40 +01:00			`if line.get_TextEquiv():`
			`log.warning("Line '%s' already contained text results", line.id)`
🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00			`line.set_TextEquiv([])`
remove existing annotation below the line level to avoid inconsistency 2019-11-06 00:46:40 +01:00			`if line.get_Word():`
			`log.warning("Line '%s' already contained word segmentation", line.id)`
			`line.set_Word([])`
🚧 Save results in the workspace 2019-08-08 11:12:23 +02:00
🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00			`# Save line results`
			`line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])`

			`# Save word results`
📝 Document why we are using Unicode text segmentation to produce word results 2020-02-03 15:33:11 +01:00			`#`
✨ Do word segmentation as expected by OCR-D PAGE specs 2020-02-03 19:10:16 +01:00			`# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation`
			`# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict`
			`# hierarchy of lines > words > glyphs.`

			`def _words(s):`
			`"""Split words based on spaces and include spaces as 'words'"""`
			`spaces = None`
			`word = ''`
			`for c in s:`
			`if c == ' ' and spaces is True:`
			`word += c`
			`elif c != ' ' and spaces is False:`
			`word += c`
			`else:`
			`if word:`
			`yield word`
			`word = c`
			`spaces = (c == ' ')`
			`yield word`
🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`if self.parameter['textequiv_level'] in ['word', 'glyph']:`
			`word_no = 0`
			`i = 0`

			`for word_text in _words(prediction.sentence):`
			`word_length = len(word_text)`
			`if not all(c == ' ' for c in word_text):`
			`word_positions = prediction.positions[i:i+word_length]`
			`word_start = word_positions[0].global_start`
			`word_end = word_positions[-1].global_end`

			`polygon = polygon_from_x0y0x1y1([word_start, 0, word_end, line_image.height])`
			`points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))`
			`# XXX Crop to line polygon?`
🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))`
			`word.add_TextEquiv(TextEquivType(Unicode=word_text))`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`if self.parameter['textequiv_level'] == 'glyph':`
			`for glyph_no, p in enumerate(word_positions):`
			`glyph_start = p.global_start`
			`glyph_end = p.global_end`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`polygon = polygon_from_x0y0x1y1([glyph_start, 0, glyph_end, line_image.height])`
			`points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`chars = sorted(p.chars, key=lambda k: k.probability, reverse=True)`
			`char_index = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs`
			`for char in chars:`
			`if char.char:`
			`glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))`
			`char_index += 1`
			`# XXX Note that omission probabilities are not normalized?!`
🧹 Add whitespace 2020-02-03 14:03:54 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`word.add_Glyph(glyph)`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`line.add_Word(word)`
			`word_no += 1`
✨ Include proper word + glyph segmentation 2020-02-03 12:22:01 +01:00
✨ Allow controlling of output hierarchy level, e.g. only line, not words+glyphs 2020-02-05 13:02:10 +01:00			`i += word_length`
🚧 Use character positions as word segmentation 2020-01-31 17:45:00 +01:00

🚧 Update higher TextEquiv levels 2019-08-08 16:28:08 +02:00			`_page_update_higher_textequiv_levels('line', pcgts)`

✨ Add metadata about the recognition operation w/ parameter info 2019-12-19 16:24:34 +01:00
			`# Add metadata about this operation and its runtime parameters:`
			`metadata = pcgts.get_Metadata() # ensured by from_file()`
			`metadata.add_MetadataItem(`
			`MetadataItemType(type_="processingStep",`
			`name=self.ocrd_tool['steps'][0],`
			`value=TOOL,`
			`Labels=[LabelsType(`
			`externalModel="ocrd-tool",`
			`externalId="parameters",`
			`Label=[LabelType(type_=name, value=self.parameter[name])`
			`for name in self.parameter.keys()])]))`


🚧 Save results in the workspace 2019-08-08 11:12:23 +02:00			`file_id = self._make_file_id(input_file, n)`
			`self.workspace.add_file(`
			`ID=file_id,`
			`file_grp=self.output_file_grp,`
			`pageId=input_file.pageId,`
			`mimetype=MIMETYPE_PAGE,`
			`local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),`
			`content=to_xml(pcgts))`
🚧 Update higher TextEquiv levels 2019-08-08 16:28:08 +02:00

			`# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib`
			`def _page_update_higher_textequiv_levels(level, pcgts):`
			"""Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

			`Starting with the hierarchy level chosen for processing,`
			`join all first TextEquiv (by the rules governing the respective level)`
			`into TextEquiv of the next higher level, replacing them.`
			`"""`
			`regions = pcgts.get_Page().get_TextRegion()`
			`if level != 'region':`
			`for region in regions:`
			`lines = region.get_TextLine()`
			`if level != 'line':`
			`for line in lines:`
			`words = line.get_Word()`
			`if level != 'word':`
			`for word in words:`
			`glyphs = word.get_Glyph()`
			`word_unicode = u''.join(glyph.get_TextEquiv()[0].Unicode`
			`if glyph.get_TextEquiv()`
			`else u'' for glyph in glyphs)`
			`word.set_TextEquiv(`
			`[TextEquivType(Unicode=word_unicode)]) # remove old`
			`line_unicode = u' '.join(word.get_TextEquiv()[0].Unicode`
			`if word.get_TextEquiv()`
			`else u'' for word in words)`
			`line.set_TextEquiv(`
			`[TextEquivType(Unicode=line_unicode)]) # remove old`
			`region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode`
			`if line.get_TextEquiv()`
			`else u'' for line in lines)`
			`region.set_TextEquiv(`
			`[TextEquivType(Unicode=region_unicode)]) # remove old`
🎨 Set vim textwidth 2020-02-03 12:22:22 +01:00
			`# vim:tw=120:`