From 3a73ccca2e4fa09c8026f64446c0477d552de128 Mon Sep 17 00:00:00 2001
From: kba
Date: Fri, 17 Oct 2025 13:45:14 +0200
Subject: [PATCH 1/3] training/models.py: make imports explicit

---
 pyproject.toml                  |  2 -
 src/eynollah/training/models.py | 81 +++++++++++++++++++++-------
 2 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e7744a1..39992ab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,8 +58,6 @@ source = ["eynollah"]
 
 [tool.ruff]
 line-length = 120
-# TODO: Reenable and fix after release v0.6.0
-exclude = ['src/eynollah/training']
 
 [tool.ruff.lint]
 ignore = [
diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py
index fdc5437..7fc34b6 100644
--- a/src/eynollah/training/models.py
+++ b/src/eynollah/training/models.py
@@ -1,9 +1,29 @@
-import tensorflow as tf
 from tensorflow import keras
-from tensorflow.keras.models import *
-from tensorflow.keras.layers import *
-from tensorflow.keras import layers
-from tensorflow.keras.regularizers import l2
+from keras.layers import (
+    Activation,
+    Add,
+    AveragePooling2D,
+    BatchNormalization,
+    Conv2D,
+    Dense,
+    Dropout,
+    Embedding,
+    Flatten,
+    Input,
+    Lambda,
+    Layer,
+    LayerNormalization,
+    MaxPooling2D,
+    MultiHeadAttention,
+    UpSampling2D,
+    ZeroPadding2D,
+    add,
+    concatenate
+)
+from keras.models import Model
+import tensorflow as tf
+# from keras import layers, models
+from keras.regularizers import l2
 
 ##mlp_head_units = [512, 256]#[2048, 1024]
 ###projection_dim = 64
@@ -15,13 +35,13 @@ MERGE_AXIS = -1
 
 def mlp(x, hidden_units, dropout_rate):
     for units in hidden_units:
-        x = layers.Dense(units, activation=tf.nn.gelu)(x)
-        x = layers.Dropout(dropout_rate)(x)
+        x = Dense(units, activation=tf.nn.gelu)(x)
+        x = Dropout(dropout_rate)(x)
     return x
 
-class Patches(layers.Layer):
+class Patches(Layer):
     def __init__(self, patch_size_x, patch_size_y):#__init__(self, **kwargs):#:__init__(self, patch_size):#__init__(self, **kwargs):
-        super(Patches, self).__init__()
+        super().__init__()
         self.patch_size_x = patch_size_x
         self.patch_size_y = patch_size_y
 
@@ -49,9 +69,9 @@ class Patches(layers.Layer):
         })
         return config
 
-class Patches_old(layers.Layer):
+class Patches_old(Layer):
     def __init__(self, patch_size):#__init__(self, **kwargs):#:__init__(self, patch_size):#__init__(self, **kwargs):
-        super(Patches, self).__init__()
+        super().__init__()
         self.patch_size = patch_size
 
     def call(self, images):
@@ -69,8 +89,8 @@ class Patches_old(layers.Layer):
         #print(patches.shape,patch_dims,'patch_dims')
         patches = tf.reshape(patches, [batch_size, -1, patch_dims])
         return patches
-    def get_config(self):
 
+    def get_config(self):
         config = super().get_config().copy()
         config.update({
             'patch_size': self.patch_size,
@@ -78,12 +98,12 @@ class Patches_old(layers.Layer):
         })
         return config
 
 
-class PatchEncoder(layers.Layer):
+class PatchEncoder(Layer):
     def __init__(self, num_patches, projection_dim):
         super(PatchEncoder, self).__init__()
         self.num_patches = num_patches
-        self.projection = layers.Dense(units=projection_dim)
-        self.position_embedding = layers.Embedding(
+        self.projection = Dense(units=projection_dim)
+        self.position_embedding = Embedding(
             input_dim=num_patches, output_dim=projection_dim
         )
@@ -144,7 +164,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
     x = Conv2D(filters3, (1, 1), data_format=IMAGE_ORDERING, name=conv_name_base + '2c')(x)
     x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
 
-    x = layers.add([x, input_tensor])
+    x = add([x, input_tensor])
     x = Activation('relu')(x)
     return x
 
@@ -189,12 +209,12 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2))
                       name=conv_name_base + '1')(input_tensor)
     shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
 
-    x = layers.add([x, shortcut])
+    x = add([x, shortcut])
     x = Activation('relu')(x)
     return x
 
 
-def resnet50_unet_light(n_classes, input_height=224, input_width=224, taks="segmentation", weight_decay=1e-6, pretraining=False):
+def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False):
     assert input_height % 32 == 0
     assert input_width % 32 == 0
@@ -397,7 +417,7 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati
 def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False):
     if mlp_head_units is None:
         mlp_head_units = [128, 64]
-    inputs = layers.Input(shape=(input_height, input_width, 3))
+    inputs = Input(shape=(input_height, input_width, 3))
 
     #transformer_units = [
     #projection_dim * 2,
@@ -452,20 +472,21 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he
 
     for _ in range(transformer_layers):
         # Layer normalization 1.
-        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
+        x1 = LayerNormalization(epsilon=1e-6)(encoded_patches)
         # Create a multi-head attention layer.
-        attention_output = layers.MultiHeadAttention(
+        attention_output = MultiHeadAttention(
             num_heads=num_heads, key_dim=projection_dim, dropout=0.1
         )(x1, x1)
         # Skip connection 1.
-        x2 = layers.Add()([attention_output, encoded_patches])
+        x2 = Add()([attention_output, encoded_patches])
         # Layer normalization 2.
-        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
+        x3 = LayerNormalization(epsilon=1e-6)(x2)
         # MLP.
         x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1)
         # Skip connection 2.
-        encoded_patches = layers.Add()([x3, x2])
+        encoded_patches = Add()([x3, x2])
 
+    assert x is not None  # for type checkers: x is the last CNN feature map (a tensor; an isinstance(x, Layer) check would always fail here)
     encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2] , int( projection_dim / (patch_size_x * patch_size_y) )])
     v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches)
 
@@ -521,7 +542,7 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he
 def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False):
     if mlp_head_units is None:
         mlp_head_units = [128, 64]
-    inputs = layers.Input(shape=(input_height, input_width, 3))
+    inputs = Input(shape=(input_height, input_width, 3))
 
     ##transformer_units = [
     ##projection_dim * 2,
@@ -536,19 +557,19 @@ def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size
 
     for _ in range(transformer_layers):
         # Layer normalization 1.
-        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
+        x1 = LayerNormalization(epsilon=1e-6)(encoded_patches)
         # Create a multi-head attention layer.
-        attention_output = layers.MultiHeadAttention(
+        attention_output = MultiHeadAttention(
             num_heads=num_heads, key_dim=projection_dim, dropout=0.1
         )(x1, x1)
         # Skip connection 1.
-        x2 = layers.Add()([attention_output, encoded_patches])
+        x2 = Add()([attention_output, encoded_patches])
         # Layer normalization 2.
-        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
+        x3 = LayerNormalization(epsilon=1e-6)(x2)
         # MLP.
         x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1)
         # Skip connection 2.
-        encoded_patches = layers.Add()([x3, x2])
+        encoded_patches = Add()([x3, x2])
 
     encoded_patches = tf.reshape(encoded_patches, [-1, input_height, input_width , int( projection_dim / (patch_size_x * patch_size_y) )])
 

From af74890b2edafc7256d122ef078ce0e8dfed35ed Mon Sep 17 00:00:00 2001
From: kba
Date: Fri, 17 Oct 2025 14:07:43 +0200
Subject: [PATCH 2/3] training/inference.py: add typing info, organize imports

---
 src/eynollah/training/inference.py | 53 ++++++++++++++++++------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py
index 3fa8fd6..10fca6c 100644
--- a/src/eynollah/training/inference.py
+++ b/src/eynollah/training/inference.py
@@ -1,14 +1,15 @@
 import sys
 import os
+from typing import Tuple
 import warnings
 import json
 
 import numpy as np
 import cv2
-from tensorflow.keras.models import load_model
+from numpy.typing import NDArray
 import tensorflow as tf
-from tensorflow.keras import backend as K
-from tensorflow.keras.layers import *
+from keras.models import Model, load_model
+from keras import backend as K
 import click
 from tensorflow.python.keras import backend as tensorflow_backend
 import xml.etree.ElementTree as ET
@@ -34,6 +35,7 @@
 Tool to load model and predict for given image.
 """
 class sbb_predict:
+
     def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area):
         self.image=image
         self.dir_in=dir_in
@@ -77,7 +79,7 @@ class sbb_predict:
         #print(img[:,:,0].min())
         #blur = cv2.GaussianBlur(img,(5,5))
         #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
-        retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
+        _, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
@@ -116,19 +118,19 @@ class sbb_predict:
         denominator = K.sum(K.square(y_pred) + K.square(y_true), axes)
         return 1.00 - K.mean(numerator / (denominator + epsilon)) # average over classes and batch
 
-    def weighted_categorical_crossentropy(self,weights=None):
-
-        def loss(y_true, y_pred):
-            labels_floats = tf.cast(y_true, tf.float32)
-            per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_floats,logits=y_pred)
-
-            if weights is not None:
-                weight_mask = tf.maximum(tf.reduce_max(tf.constant(
-                    np.array(weights, dtype=np.float32)[None, None, None])
-                    * labels_floats, axis=-1), 1.0)
-                per_pixel_loss = per_pixel_loss * weight_mask[:, :, :, None]
-            return tf.reduce_mean(per_pixel_loss)
-        return self.loss
+    # def weighted_categorical_crossentropy(self,weights=None):
+    #
+    #     def loss(y_true, y_pred):
+    #         labels_floats = tf.cast(y_true, tf.float32)
+    #         per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_floats,logits=y_pred)
+    #
+    #         if weights is not None:
+    #             weight_mask = tf.maximum(tf.reduce_max(tf.constant(
+    #                 np.array(weights, dtype=np.float32)[None, None, None])
+    #                 * labels_floats, axis=-1), 1.0)
+    #             per_pixel_loss = per_pixel_loss * weight_mask[:, :, :, None]
+    #         return tf.reduce_mean(per_pixel_loss)
+    #     return self.loss
 
     def IoU(self,Yi,y_predi):
@@ -177,12 +179,13 @@ class sbb_predict:
         ##if self.weights_dir!=None:
             ##self.model.load_weights(self.weights_dir)
 
+        assert isinstance(self.model, Model)
         if self.task != 'classification' and self.task != 'reading_order':
             self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1]
             self.img_width=self.model.layers[len(self.model.layers)-1].output_shape[2]
             self.n_classes=self.model.layers[len(self.model.layers)-1].output_shape[3]
 
-    def visualize_model_output(self, prediction, img, task):
+    def visualize_model_output(self, prediction, img, task) -> Tuple[NDArray, NDArray]:
         if task == "binarization":
             prediction = prediction * -1
             prediction = prediction + 1
@@ -226,9 +229,12 @@ class sbb_predict:
 
         added_image = cv2.addWeighted(img,0.5,layout_only,0.1,0)
 
+        assert isinstance(added_image, np.ndarray)
+        assert isinstance(layout_only, np.ndarray)
         return added_image, layout_only
 
     def predict(self, image_dir):
+        assert isinstance(self.model, Model)
         if self.task == 'classification':
            classes_names = self.config_params_model['classification_classes_name']
            img_1ch = img=cv2.imread(image_dir, 0)
@@ -240,7 +246,7 @@ class sbb_predict:
             img_in[0, :, :, 1] = img_1ch[:, :]
             img_in[0, :, :, 2] = img_1ch[:, :]
 
-            label_p_pred = self.model.predict(img_in, verbose=0)
+            label_p_pred = self.model.predict(img_in, verbose='0')
             index_class = np.argmax(label_p_pred[0])
 
             print("Predicted Class: {}".format(classes_names[str(int(index_class))]))
@@ -361,7 +367,7 @@ class sbb_predict:
                     #input_1[:,:,1] = img3[:,:,0]/5.
 
                     if batch_counter==inference_bs or ( (tot_counter//inference_bs)==full_bs_ite and tot_counter%inference_bs==last_bs):
-                        y_pr = self.model.predict(input_1 , verbose=0)
+                        y_pr = self.model.predict(input_1 , verbose='0')
                         scalibility_num = scalibility_num+1
 
                         if batch_counter==inference_bs:
@@ -395,6 +401,7 @@ class sbb_predict:
             name_space = name_space.split('{')[1]
 
             page_element = root_xml.find(link+'Page')
+            assert isinstance(page_element, ET.Element)
 
             """
             ro_subelement = ET.SubElement(page_element, 'ReadingOrder')
@@ -489,7 +496,7 @@ class sbb_predict:
                         img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :]
 
                         label_p_pred = self.model.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2]),
-                                                          verbose=0)
+                                                          verbose='0')
 
                         if self.task == 'enhancement':
                             seg = label_p_pred[0, :, :, :]
@@ -496,5 +503,7 @@ class sbb_predict:
                         elif self.task == 'segmentation' or self.task == 'binarization':
                             seg = np.argmax(label_p_pred, axis=3)[0]
                             seg = np.repeat(seg[:, :, np.newaxis], 3, axis=2)
+                        else:
+                            raise ValueError(f"Unhandled task {self.task}")
 
                         if i == 0 and j == 0:
@@ -548,5 +557,7 @@ class sbb_predict:
                 elif self.task == 'segmentation' or self.task == 'binarization':
                     seg = np.argmax(label_p_pred, axis=3)[0]
                     seg = np.repeat(seg[:, :, np.newaxis], 3, axis=2)
+                else:
+                    raise ValueError(f"Unhandled task {self.task}")
 
                 prediction_true = seg.astype(int)

From 557fb227f3b0e51433ee20c610c197f394f6fd5d Mon Sep 17 00:00:00 2001
From: kba
Date: Fri, 17 Oct 2025 14:21:05 +0200
Subject: [PATCH 3/3] training/gt_gen_utils.py: fix type errors, comment out
 dead code

---
 src/eynollah/training/gt_gen_utils.py | 29 +++++++++++++--------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py
index 2e3428b..28ab422 100644
--- a/src/eynollah/training/gt_gen_utils.py
+++ b/src/eynollah/training/gt_gen_utils.py
@@ -252,6 +252,7 @@ def get_textline_contours_for_visualization(xml_file):
 
 
 
+    x_len, y_len = 0, 0
     for jj in root1.iter(link+'Page'):
         y_len=int(jj.attrib['imageHeight'])
         x_len=int(jj.attrib['imageWidth'])
@@ -293,6 +294,7 @@ def get_textline_contours_and_ocr_text(xml_file):
 
 
 
+    x_len, y_len = 0, 0
     for jj in root1.iter(link+'Page'):
         y_len=int(jj.attrib['imageHeight'])
         x_len=int(jj.attrib['imageWidth'])
@@ -362,7 +364,7 @@ def get_layout_contours_for_visualization(xml_file):
 
     link=alltags[0].split('}')[0]+'}'
 
-
+    x_len, y_len = 0, 0
     for jj in root1.iter(link+'Page'):
         y_len=int(jj.attrib['imageHeight'])
         x_len=int(jj.attrib['imageWidth'])
@@ -637,7 +639,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
 
         link=alltags[0].split('}')[0]+'}'
 
-
+        x_len, y_len = 0, 0
         for jj in root1.iter(link+'Page'):
             y_len=int(jj.attrib['imageHeight'])
             x_len=int(jj.attrib['imageWidth'])
@@ -645,15 +647,12 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
         if 'columns_width' in list(config_params.keys()):
             columns_width_dict = config_params['columns_width']
             metadata_element = root1.find(link+'Metadata')
-            comment_is_sub_element = False
+            num_col = None
             for child in metadata_element:
                 tag2 = child.tag
                 if tag2.endswith('}Comments') or tag2.endswith('}comments'):
                     text_comments = child.text
                     num_col = int(text_comments.split('num_col')[1])
-                    comment_is_sub_element = True
-            if not comment_is_sub_element:
-                num_col = None
 
             if num_col:
                 x_new = columns_width_dict[str(num_col)]
@@ -1739,15 +1738,15 @@ tot_region_ref,x_len, y_len,index_tot_regions, img_poly
 
 
-def bounding_box(cnt,color, corr_order_index ):
-    x, y, w, h = cv2.boundingRect(cnt)
-    x = int(x*scale_w)
-    y = int(y*scale_h)
-
-    w = int(w*scale_w)
-    h = int(h*scale_h)
-
-    return [x,y,w,h,int(color), int(corr_order_index)+1]
+# def bounding_box(cnt,color, corr_order_index ):
+#     x, y, w, h = cv2.boundingRect(cnt)
+#     x = int(x*scale_w)
+#     y = int(y*scale_h)
+#
+#     w = int(w*scale_w)
+#     h = int(h*scale_h)
+#
+#     return [x,y,w,h,int(color), int(corr_order_index)+1]
 
 def resize_image(seg_in,input_height,input_width):
     return cv2.resize(seg_in,(input_width,input_height),interpolation=cv2.INTER_NEAREST)
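
A quick sanity check for the import changes in PATCH 1/3: a minimal
functional-API model built only from the explicitly imported names, with a
residual merge mirroring identity_block. This is a sketch, not part of the
series; it assumes a TF 2.x environment where the standalone `keras` package
is the one bundled with tensorflow, and the layer sizes are arbitrary:

    # hedged smoke test, not included in the patches; sizes are arbitrary
    from keras.layers import Activation, BatchNormalization, Conv2D, Input, add
    from keras.models import Model

    inp = Input(shape=(32, 32, 3))              # arbitrary input size
    x = Conv2D(8, (3, 3), padding='same')(inp)  # main branch
    x = BatchNormalization()(x)
    x = add([x, Conv2D(8, (1, 1))(inp)])        # residual merge, as in identity_block
    model = Model(inp, Activation('relu')(x))
    model.summary()                             # builds without any star imports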