training.train: fix data batching for OCR in 27f43c17

2026-06-28 07:49:21 +02:00 · 2026-02-24 20:42:08 +01:00 · 2026-02-24 20:42:08 +01:00 · 92fc2bd815
commit 92fc2bd815
parent 86b009bc31
2 changed files with 63 additions and 65 deletions
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@ -618,11 +618,6 @@ def run(_config,
        padding_token = len(characters) + 5
        # Mapping characters to integers.
        char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
-
-        # Mapping integers back to original characters.
-        ##num_to_char = StringLookup(
-            ##vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
-        ##)
        n_classes = len(char_to_num.get_vocabulary()) + 2

        if continue_training:
@ -649,21 +644,23 @@ def run(_config,
                                   char_to_num=char_to_num,
                                   padding_token=padding_token
            )
-        train_ds = tf.data.Dataset.from_generator(gen)
-        train_ds = train_ds.padded_batch(n_batch,
-                                         padded_shapes=([input_height, input_width, 3], [None]),
-                                         padding_values=(0, padding_token),
-                                         drop_remainder=True,
-                                         #num_parallel_calls=tf.data.AUTOTUNE,
+        train_ds = (tf.data.Dataset.from_generator(gen, (tf.float32, tf.int64))
+                    .padded_batch(n_batch,
+                                  padded_shapes=([input_height, input_width, 3], [None]),
+                                  padding_values=(None, tf.constant(padding_token, dtype=tf.int64)),
+                                  drop_remainder=True,
+                                  #num_parallel_calls=tf.data.AUTOTUNE,
+                    )
+                    .map(lambda x, y: {"image": x, "label": y})
+                    .prefetch(tf.data.AUTOTUNE)
        )
-        train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

        #initial_learning_rate = 1e-4
        #decay_steps = int (n_epochs * ( len_dataset / n_batch ))
        #alpha = 0.01
        #lr_schedule = 1e-4
        #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, alpha)
-        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
+        opt = Adam(learning_rate=learning_rate)
        model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer

        callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False),
--- a/src/eynollah/training/utils.py
+++ b/src/eynollah/training/utils.py
@ -1073,58 +1073,59 @@ def preprocess_img(img,
                                                             scaler=sc_ind)
                            
 def preprocess_img_ocr(
-    img,
-    img_name,
-    lab,
-    char_to_num=None,
-    padding_token=-1,
-    max_len=500,
-    n_batch=1,
-    input_height=None,
-    input_width=None,
-    augmentation=False,
-    color_padding_rotation=None,
-    thetha_padd=None,
-    padd_colors=None,
-    rotation_not_90=None,
-    thetha=None,
-    padding_white=None,
-    white_padds=None,
-    degrading=False,
-    bin_deg=None,
-    degrade_scales=None,
-    blur_aug=False,
-    blur_k=None,
-    brightening=False,
-    brightness=None,
-    binarization=False,
-    image_inversion=False,
-    channels_shuffling=False,
-    shuffle_indexes=None,
-    white_noise_strap=False,
-    textline_skewing=False,
-    textline_skewing_bin=False,
-    skewing_amplitudes=None,
-    textline_left_in_depth=False,
-    textline_left_in_depth_bin=False,
-    textline_right_in_depth=False,
-    textline_right_in_depth_bin=False,
-    textline_up_in_depth=False,
-    textline_up_in_depth_bin=False,
-    textline_down_in_depth=False,
-    textline_down_in_depth_bin=False,
-    pepper_aug=False,
-    pepper_bin_aug=False,
-    pepper_indexes=None,
-    dir_img_bin=None,
-    add_red_textlines=False,
-    adding_rgb_background=False,
-    dir_rgb_backgrounds=None,
-    adding_rgb_foreground=False,
-    dir_rgb_foregrounds=None,
-    number_of_backgrounds_per_image=None,
-    list_all_possible_background_images=None,
-    list_all_possible_foreground_rgbs=None,
+        img,
+        img_name,
+        lab,
+        char_to_num=None,
+        padding_token=-1,
+        max_len=500,
+        n_batch=1,
+        input_height=None,
+        input_width=None,
+        augmentation=False,
+        color_padding_rotation=None,
+        thetha_padd=None,
+        padd_colors=None,
+        rotation_not_90=None,
+        thetha=None,
+        padding_white=None,
+        white_padds=None,
+        degrading=False,
+        bin_deg=None,
+        degrade_scales=None,
+        blur_aug=False,
+        blur_k=None,
+        brightening=False,
+        brightness=None,
+        binarization=False,
+        image_inversion=False,
+        channels_shuffling=False,
+        shuffle_indexes=None,
+        white_noise_strap=False,
+        textline_skewing=False,
+        textline_skewing_bin=False,
+        skewing_amplitudes=None,
+        textline_left_in_depth=False,
+        textline_left_in_depth_bin=False,
+        textline_right_in_depth=False,
+        textline_right_in_depth_bin=False,
+        textline_up_in_depth=False,
+        textline_up_in_depth_bin=False,
+        textline_down_in_depth=False,
+        textline_down_in_depth_bin=False,
+        pepper_aug=False,
+        pepper_bin_aug=False,
+        pepper_indexes=None,
+        dir_img_bin=None,
+        add_red_textlines=False,
+        adding_rgb_background=False,
+        dir_rgb_backgrounds=None,
+        adding_rgb_foreground=False,
+        dir_rgb_foregrounds=None,
+        number_of_backgrounds_per_image=None,
+        list_all_possible_background_images=None,
+        list_all_possible_foreground_rgbs=None,
+        **kwargs
 ):
    def scale_image(img):
        return scale_padd_image_for_ocr(img, input_height, input_width).astype(np.float32) / 255.