Add CNN–RNN–OCR inference and adapt the CNN–RNN–OCR model so that inference can run on both CPU and GPU

2026-02-20 16:32:03 +01:00 · 2025-12-17 15:12:39 +01:00 · 2025-12-17 15:12:39 +01:00 · 49261fa99b
commit 49261fa99b
parent 6ee79c7320
2 changed files with 61 additions and 28 deletions
--- a/src/eynollah/training/inference.py
+++ b/src/eynollah/training/inference.py
@ -25,6 +25,9 @@ from .models import (
    Patches
 )

+from.utils import (scale_padd_image_for_ocr)
+from eynollah.utils.utils_ocr import (decode_batch_predictions)
+
 with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
@ -34,7 +37,7 @@ Tool to load model and predict for given image.
 """

 class sbb_predict:
-    def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area):
+    def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area):
        self.image=image
        self.dir_in=dir_in
        self.patches=patches
@ -46,6 +49,7 @@ class sbb_predict:
        self.config_params_model=config_params_model
        self.xml_file = xml_file
        self.out = out
+        self.cpu = cpu
        if min_area:
            self.min_area = float(min_area)
        else:
@ -157,25 +161,21 @@ class sbb_predict:
            return mIoU
            
    def start_new_session_and_model(self):
-        
+        if self.task == "cnn-rnn-ocr":
+            if self.cpu:
+                os.environ['CUDA_VISIBLE_DEVICES']='-1'
+            self.model = load_model(self.model_dir)
+            self.model = tf.keras.models.Model(
+                            self.model.get_layer(name = "image").input, 
+                            self.model.get_layer(name = "dense2").output)
+        else:
            config = tf.compat.v1.ConfigProto()
            config.gpu_options.allow_growth = True

            session = tf.compat.v1.Session(config=config)  # tf.InteractiveSession()
            tensorflow_backend.set_session(session)
-        #tensorflow.keras.layers.custom_layer = PatchEncoder
-        #tensorflow.keras.layers.custom_layer = Patches
+
            self.model = load_model(self.model_dir , compile=False,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches})
-        #config = tf.ConfigProto()
-        #config.gpu_options.allow_growth=True
-    
-        #self.session = tf.InteractiveSession()
-        #keras.losses.custom_loss = self.weighted_categorical_crossentropy
-        #self.model = load_model(self.model_dir , compile=False)
-
-        
-        ##if self.weights_dir!=None:
-            ##self.model.load_weights(self.weights_dir)
                
            if self.task != 'classification' and self.task != 'reading_order':
                self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1]
@ -244,6 +244,30 @@ class sbb_predict:
            index_class = np.argmax(label_p_pred[0])
            
            print("Predicted Class: {}".format(classes_names[str(int(index_class))]))
+        elif self.task == "cnn-rnn-ocr":
+            img=cv2.imread(image_dir)
+            img = scale_padd_image_for_ocr(img, self.config_params_model['input_height'], self.config_params_model['input_width'])
+            
+            img = img / 255.
+            
+            with open(os.path.join(self.model_dir, "characters_org.txt"), 'r') as char_txt_f:
+                characters = json.load(char_txt_f)
+                
+            AUTOTUNE = tf.data.AUTOTUNE
+
+            # Mapping characters to integers.
+            char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
+            
+            # Mapping integers back to original characters.
+            num_to_char = StringLookup(
+                vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
+            )
+            preds = self.model.predict(img.reshape(1, img.shape[0], img.shape[1], img.shape[2]), verbose=0)
+            pred_texts = decode_batch_predictions(preds, num_to_char)
+            pred_texts = pred_texts[0].replace("[UNK]", "")
+            return pred_texts
+            
+            
        elif self.task == 'reading_order':
            img_height = self.config_params_model['input_height']
            img_width = self.config_params_model['input_width']
@ -569,6 +593,8 @@ class sbb_predict:
            elif self.task == 'enhancement':
                if self.save:
                    cv2.imwrite(self.save,res)
+            elif self.task == "cnn-rnn-ocr":
+                print(f"Detected text: {res}")
            else:
                img_seg_overlayed, only_layout  = self.visualize_model_output(res, self.img_org, self.task)
                if self.save:
@ -592,6 +618,8 @@ class sbb_predict:
                elif self.task == 'enhancement':
                    self.save = os.path.join(self.out, f_name+'.png')
                    cv2.imwrite(self.save,res)
+                elif self.task == "cnn-rnn-ocr":
+                    print(f"Detected text for file name {f_name} is: {res}")
                else:
                    img_seg_overlayed, only_layout  = self.visualize_model_output(res, self.img_org, self.task)
                    self.save = os.path.join(self.out, f_name+'_overlayed.png')
@ -657,24 +685,29 @@ class sbb_predict:
    "-xml",
    help="xml file with layout coordinates that reading order detection will be implemented on. The result will be written in the same xml file.",
 )
-
+@click.option(
+    "--cpu",
+    "-cpu",
+    help="For OCR, the default device is the GPU. If this parameter is set to true, inference will be performed on the CPU",
+    is_flag=True,
+)
@click.option(
    "--min_area",
    "-min",
    help="min area size of regions considered for reading order detection. The default value is zero and means that all text regions are considered for reading order.",
 )
-def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, out, min_area):
+def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area):
    assert image or dir_in, "Either a single image -i or a dir_in -di is required"
    with open(os.path.join(model,'config.json')) as f:
        config_params_model = json.load(f)
    task = config_params_model['task']
-    if task != 'classification' and task != 'reading_order':
+    if task != 'classification' and task != 'reading_order' and task != "cnn-rnn-ocr":
        if image and not save:
            print("Error: You used one of segmentation or binarization task with image input but not set -s, you need a filename to save visualized output with -s")
            sys.exit(1)
        if dir_in and not out:
            print("Error: You used one of segmentation or binarization task with dir_in but not set -out")
            sys.exit(1)
-    x=sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area)
+    x=sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area)
    x.run()

--- a/src/eynollah/training/models.py
+++ b/src/eynollah/training/models.py
@ -843,7 +843,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s
    
    addition_rnn = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(image_width, return_sequences=True, dropout=0.25))(addition)
    
-    out = tf.keras.layers.Conv1D(max_seq, 1, data_format="channels_first")(addition_rnn)
+    out = tf.keras.layers.Conv1D(max_seq, 1, data_format="channels_last")(addition_rnn)
    out = tf.keras.layers.BatchNormalization(name="bn9")(out)
    out = tf.keras.layers.Activation("relu", name="relu9")(out)
    #out = tf.keras.layers.Conv1D(n_classes, 1, activation='relu', data_format="channels_last")(out)