From d27647a0f1c96b12b475add13142fdb91e7888de Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 16 Apr 2024 01:00:48 +0200 Subject: [PATCH 01/62] first working update of branch --- config_params.json | 19 +++- models.py | 179 +++++++++++++++++++++++++++++ train.py | 132 ++++++++++++++-------- utils.py | 273 +++++++++++++++++++++++++++++---------------- 4 files changed, 452 insertions(+), 151 deletions(-) diff --git a/config_params.json b/config_params.json index 7505a81..bd47a52 100644 --- a/config_params.json +++ b/config_params.json @@ -1,8 +1,9 @@ { - "n_classes" : 3, + "model_name" : "hybrid_transformer_cnn", + "n_classes" : 2, "n_epochs" : 2, "input_height" : 448, - "input_width" : 672, + "input_width" : 448, "weight_decay" : 1e-6, "n_batch" : 2, "learning_rate": 1e-4, @@ -18,13 +19,21 @@ "scaling_flip" : false, "rotation": false, "rotation_not_90": false, + "num_patches_xy": [28, 28], + "transformer_patchsize": 1, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], "continue_training": false, - "index_start": 0, - "dir_of_start_model": " ", + "index_start" : 0, + "dir_of_start_model" : " ", "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, "dir_train": "/train", "dir_eval": "/eval", - "dir_output": "/output" + "dir_output": "/out" } diff --git a/models.py b/models.py index f06823e..f7a7ad8 100644 --- a/models.py +++ b/models.py @@ -1,13 +1,81 @@ +import tensorflow as tf +from tensorflow import keras from tensorflow.keras.models import * from tensorflow.keras.layers import * from tensorflow.keras import layers from tensorflow.keras.regularizers import l2 +mlp_head_units = [2048, 1024] +projection_dim = 64 +transformer_layers = 8 +num_heads = 4 resnet50_Weights_path = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' IMAGE_ORDERING = 'channels_last' MERGE_AXIS = -1 +transformer_units = [ + projection_dim * 2, + projection_dim, +] # Size of the transformer layers +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + +class Patches(layers.Layer): + def __init__(self, patch_size):#__init__(self, **kwargs):#:__init__(self, patch_size):#__init__(self, **kwargs): + super(Patches, self).__init__() + self.patch_size = patch_size + + def call(self, images): + print(tf.shape(images)[1],'images') + print(self.patch_size,'self.patch_size') + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size, self.patch_size, 1], + strides=[1, self.patch_size, self.patch_size, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + patch_dims = patches.shape[-1] + print(patches.shape,patch_dims,'patch_dims') + patches = tf.reshape(patches, [batch_size, -1, patch_dims]) + return patches + def get_config(self): + + config = super().get_config().copy() + config.update({ + 'patch_size': self.patch_size, + }) + return config + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super(PatchEncoder, self).__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + def call(self, patch): + positions = tf.range(start=0, limit=self.num_patches, delta=1) 
+ encoded = self.projection(patch) + self.position_embedding(positions) + return encoded + def get_config(self): + + config = super().get_config().copy() + config.update({ + 'num_patches': self.num_patches, + 'projection': self.projection, + 'position_embedding': self.position_embedding, + }) + return config + + def one_side_pad(x): x = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(x) if IMAGE_ORDERING == 'channels_first': @@ -292,3 +360,114 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, weight_decay=1e- model = Model(img_input, o) return model + + +def vit_resnet50_unet(n_classes,patch_size, num_patches, input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): + inputs = layers.Input(shape=(input_height, input_width, 3)) + IMAGE_ORDERING = 'channels_last' + bn_axis=3 + + x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) + x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) + f1 = x + + x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) + x = Activation('relu')(x) + x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) + + x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) + x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') + x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') + f2 = one_side_pad(x) + + x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') + f3 = x + + x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') + f4 = x + + x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') + f5 = x + + if pretraining: + model = keras.Model(inputs, x).load_weights(resnet50_Weights_path) + + num_patches = x.shape[1]*x.shape[2] + patches = Patches(patch_size)(x) + # Encode patches. + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + # Skip connection 2. 
+ encoded_patches = layers.Add()([x3, x2]) + + encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2], 64]) + + v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) + v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) + v1024_2048 = Activation('relu')(v1024_2048) + + o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) + o = (concatenate([o, f4],axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o ,f3], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, f2], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, f1], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, inputs],axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = (Activation('softmax'))(o) + + model = keras.Model(inputs=inputs, outputs=o) + + return model diff --git a/train.py b/train.py index 03faf46..6e6a172 100644 --- a/train.py +++ b/train.py @@ -10,6 +10,7 @@ from utils import * from metrics import * from tensorflow.keras.models import load_model from tqdm import tqdm +import json def configuration(): @@ -42,9 +43,13 @@ def config_params(): learning_rate = 1e-4 # Set the learning rate. patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false. augmentation = False # To apply any kind of augmentation, this parameter must be set to true. - flip_aug = False # If true, different types of flipping will be applied to the image. Types of flips are defined with "flip_index" in train.py. - blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in train.py. - scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in train.py. + flip_aug = False # If true, different types of flipping will be applied to the image. 
Types of flips are defined with "flip_index" in config_params.json. + blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in config_params.json. + padding_white = False # If true, white padding will be applied to the image. + padding_black = False # If true, black padding will be applied to the image. + scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in config_params.json. + degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. + brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". @@ -52,13 +57,18 @@ def config_params(): pretraining = False # Set to true to load pretrained weights of ResNet50 encoder. scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. + scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. - thetha = [10, -10] # Rotate image by these angles for augmentation. - blur_k = ['blur', 'gauss', 'median'] # Blur image for augmentation. - scales = [0.5, 2] # Scale patches for augmentation. - flip_index = [0, 1, -1] # Flip image for augmentation. + thetha = None # Rotate image by these angles for augmentation. + blur_k = None # Blur image for augmentation. + scales = None # Scale patches for augmentation. + degrade_scales = None # Degrade image for augmentation. + brightness = None # Brighten image for augmentation. + flip_index = None # Flip image for augmentation. continue_training = False # Set to true if you would like to continue training an already trained a model. - index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. + transformer_patchsize = None # Patch size of vision transformer patches. + num_patches_xy = None # Number of patches for vision transformer. + index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. 
@@ -66,15 +76,19 @@ def config_params(): @ex.automain -def run(n_classes, n_epochs, input_height, +def run(_config, n_classes, n_epochs, input_height, input_width, weight_decay, weighted_loss, index_start, dir_of_start_model, is_loss_soft_dice, n_batch, patches, augmentation, flip_aug, - blur_aug, scaling, binarization, - blur_k, scales, dir_train, data_is_provided, - scaling_bluring, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, continue_training, - flip_index, dir_eval, dir_output, pretraining, learning_rate): + blur_aug, padding_white, padding_black, scaling, degrading, + brightening, binarization, blur_k, scales, degrade_scales, + brightness, dir_train, data_is_provided, scaling_bluring, + scaling_brightness, scaling_binarization, rotation, rotation_not_90, + thetha, scaling_flip, continue_training, transformer_patchsize, + num_patches_xy, model_name, flip_index, dir_eval, dir_output, + pretraining, learning_rate): + + num_patches = num_patches_xy[0]*num_patches_xy[1] if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') @@ -121,23 +135,28 @@ def run(n_classes, n_epochs, input_height, # set the gpu configuration configuration() + + imgs_list=np.array(os.listdir(dir_img)) + segs_list=np.array(os.listdir(dir_seg)) + + imgs_list_test=np.array(os.listdir(dir_img_val)) + segs_list_test=np.array(os.listdir(dir_seg_val)) # writing patches into a sub-folder in order to be flowed from directory. - provide_patches(dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, - input_height, input_width, blur_k, blur_aug, - flip_aug, binarization, scaling, scales, flip_index, - scaling_bluring, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, - augmentation=augmentation, patches=patches) - - provide_patches(dir_img_val, dir_seg_val, dir_flow_eval_imgs, - dir_flow_eval_labels, - input_height, input_width, blur_k, blur_aug, - flip_aug, binarization, scaling, scales, flip_index, - scaling_bluring, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, - augmentation=False, patches=patches) + provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, + dir_flow_train_labels, input_height, input_width, blur_k, + blur_aug, padding_white, padding_black, flip_aug, binarization, + scaling, degrading, brightening, scales, degrade_scales, brightness, + flip_index, scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, scaling_flip, augmentation=augmentation, + patches=patches) + + provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, + dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, + blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, + scaling, degrading, brightening, scales, degrade_scales, brightness, + flip_index, scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, scaling_flip, augmentation=False, patches=patches) if weighted_loss: weights = np.zeros(n_classes) @@ -166,38 +185,50 @@ def run(n_classes, n_epochs, input_height, weights = weights / float(np.sum(weights)) if continue_training: - if is_loss_soft_dice: - model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss: - model = load_model(dir_of_start_model, compile=True, - custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not 
weighted_loss: - model = load_model(dir_of_start_model, compile=True) + if model_name=='resnet50_unet': + if is_loss_soft_dice: + model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) + if weighted_loss: + model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + if not is_loss_soft_dice and not weighted_loss: + model = load_model(dir_of_start_model , compile=True) + elif model_name=='hybrid_transformer_cnn': + if is_loss_soft_dice: + model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) + if weighted_loss: + model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + if not is_loss_soft_dice and not weighted_loss: + model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: - # get our model. index_start = 0 - model = resnet50_unet(n_classes, input_height, input_width, weight_decay, pretraining) - - # if you want to see the model structure just uncomment model summary. - # model.summary() + if model_name=='resnet50_unet': + model = resnet50_unet(n_classes, input_height, input_width,weight_decay,pretraining) + elif model_name=='hybrid_transformer_cnn': + model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width,weight_decay,pretraining) + + #if you want to see the model structure just uncomment model summary. + #model.summary() + if not is_loss_soft_dice and not weighted_loss: model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if is_loss_soft_dice: + if is_loss_soft_dice: model.compile(loss=soft_dice_loss, optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if weighted_loss: model.compile(loss=weighted_categorical_crossentropy(weights), optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - + # generating train and evaluation data train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, input_height=input_height, input_width=input_width, n_classes=n_classes) val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, input_height=input_height, input_width=input_width, n_classes=n_classes) - + + ##img_validation_patches = os.listdir(dir_flow_eval_imgs) + ##score_best=[] + ##score_best.append(0) for i in tqdm(range(index_start, n_epochs + index_start)): model.fit_generator( train_gen, @@ -205,9 +236,12 @@ def run(n_classes, n_epochs, input_height, validation_data=val_gen, validation_steps=1, epochs=1) - model.save(dir_output + '/' + 'model_' + str(i)) + model.save(dir_output+'/'+'model_'+str(i)) + + with open(dir_output+'/'+'model_'+str(i)+'/'+"config.json", "w") as fp: + json.dump(_config, fp) # encode dict into JSON - # os.system('rm -rf '+dir_train_flowing) - # os.system('rm -rf '+dir_eval_flowing) + #os.system('rm -rf '+dir_train_flowing) + #os.system('rm -rf '+dir_eval_flowing) - # model.save(dir_output+'/'+'model'+'.h5') + #model.save(dir_output+'/'+'model'+'.h5') diff --git a/utils.py b/utils.py index 7c65f18..c2786ec 100644 --- a/utils.py +++ b/utils.py @@ -9,6 +9,15 @@ from tqdm import tqdm import imutils import math +def do_brightening(img_in_dir, factor): + im = Image.open(img_in_dir) + enhancer = ImageEnhance.Brightness(im) + out_img = enhancer.enhance(factor) + out_img = 
out_img.convert('RGB') + opencv_img = np.array(out_img) + opencv_img = opencv_img[:,:,::-1].copy() + return opencv_img + def bluring(img_in, kind): if kind == 'gauss': @@ -138,11 +147,11 @@ def IoU(Yi, y_predi): FP = np.sum((Yi != c) & (y_predi == c)) FN = np.sum((Yi == c) & (y_predi != c)) IoU = TP / float(TP + FP + FN) - print("class {:02.0f}: #TP={:6.0f}, #FP={:6.0f}, #FN={:5.0f}, IoU={:4.3f}".format(c, TP, FP, FN, IoU)) + #print("class {:02.0f}: #TP={:6.0f}, #FP={:6.0f}, #FN={:5.0f}, IoU={:4.3f}".format(c, TP, FP, FN, IoU)) IoUs.append(IoU) mIoU = np.mean(IoUs) - print("_________________") - print("Mean IoU: {:4.3f}".format(mIoU)) + #print("_________________") + #print("Mean IoU: {:4.3f}".format(mIoU)) return mIoU @@ -241,124 +250,170 @@ def get_patches(dir_img_f, dir_seg_f, img, label, height, width, indexer): return indexer -def do_padding(img, label, height, width): - height_new = img.shape[0] - width_new = img.shape[1] +def do_padding_white(img): + img_org_h = img.shape[0] + img_org_w = img.shape[1] + + index_start_h = 4 + index_start_w = 4 + + img_padded = np.zeros((img.shape[0] + 2*index_start_h, img.shape[1]+ 2*index_start_w, img.shape[2])) + 255 + img_padded[index_start_h: index_start_h + img.shape[0], index_start_w: index_start_w + img.shape[1], :] = img[:, :, :] + + return img_padded.astype(float) + + +def do_degrading(img, scale): + img_org_h = img.shape[0] + img_org_w = img.shape[1] + + img_res = resize_image(img, int(img_org_h * scale), int(img_org_w * scale)) + + return resize_image(img_res, img_org_h, img_org_w) + + +def do_padding_black(img): + img_org_h = img.shape[0] + img_org_w = img.shape[1] + + index_start_h = 4 + index_start_w = 4 + + img_padded = np.zeros((img.shape[0] + 2*index_start_h, img.shape[1] + 2*index_start_w, img.shape[2])) + img_padded[index_start_h: index_start_h + img.shape[0], index_start_w: index_start_w + img.shape[1], :] = img[:, :, :] + + return img_padded.astype(float) + + +def do_padding_label(img): + img_org_h = img.shape[0] + img_org_w = img.shape[1] + + index_start_h = 4 + index_start_w = 4 + + img_padded = np.zeros((img.shape[0] + 2*index_start_h, img.shape[1] + 2*index_start_w, img.shape[2])) + img_padded[index_start_h: index_start_h + img.shape[0], index_start_w: index_start_w + img.shape[1], :] = img[:, :, :] + + return img_padded.astype(np.int16) +def do_padding(img, label, height, width): + height_new=img.shape[0] + width_new=img.shape[1] + h_start = 0 w_start = 0 - + if img.shape[0] < height: h_start = int(abs(height - img.shape[0]) / 2.) height_new = height - + if img.shape[1] < width: w_start = int(abs(width - img.shape[1]) / 2.) 
width_new = width - + img_new = np.ones((height_new, width_new, img.shape[2])).astype(float) * 255 label_new = np.zeros((height_new, width_new, label.shape[2])).astype(float) - + img_new[h_start:h_start + img.shape[0], w_start:w_start + img.shape[1], :] = np.copy(img[:, :, :]) label_new[h_start:h_start + label.shape[0], w_start:w_start + label.shape[1], :] = np.copy(label[:, :, :]) - - return img_new, label_new + + return img_new,label_new def get_patches_num_scale(dir_img_f, dir_seg_f, img, label, height, width, indexer, n_patches, scaler): if img.shape[0] < height or img.shape[1] < width: img, label = do_padding(img, label, height, width) - + img_h = img.shape[0] img_w = img.shape[1] - + height_scale = int(height * scaler) width_scale = int(width * scaler) - + + nxf = img_w / float(width_scale) nyf = img_h / float(height_scale) - + if nxf > int(nxf): nxf = int(nxf) + 1 if nyf > int(nyf): nyf = int(nyf) + 1 - + nxf = int(nxf) nyf = int(nyf) - + for i in range(nxf): for j in range(nyf): index_x_d = i * width_scale index_x_u = (i + 1) * width_scale - + index_y_d = j * height_scale index_y_u = (j + 1) * height_scale - + if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - width_scale if index_y_u > img_h: index_y_u = img_h index_y_d = img_h - height_scale - + + img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] label_patch = label[index_y_d:index_y_u, index_x_d:index_x_u, :] - + img_patch = resize_image(img_patch, height, width) label_patch = resize_image(label_patch, height, width) - + cv2.imwrite(dir_img_f + '/img_' + str(indexer) + '.png', img_patch) cv2.imwrite(dir_seg_f + '/img_' + str(indexer) + '.png', label_patch) indexer += 1 - + return indexer def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, indexer, scaler): img = resize_image(img, int(img.shape[0] * scaler), int(img.shape[1] * scaler)) label = resize_image(label, int(label.shape[0] * scaler), int(label.shape[1] * scaler)) - + if img.shape[0] < height or img.shape[1] < width: img, label = do_padding(img, label, height, width) - + img_h = img.shape[0] img_w = img.shape[1] - + height_scale = int(height * 1) width_scale = int(width * 1) - + nxf = img_w / float(width_scale) nyf = img_h / float(height_scale) - + if nxf > int(nxf): nxf = int(nxf) + 1 if nyf > int(nyf): nyf = int(nyf) + 1 - + nxf = int(nxf) nyf = int(nyf) - + for i in range(nxf): for j in range(nyf): index_x_d = i * width_scale index_x_u = (i + 1) * width_scale - + index_y_d = j * height_scale index_y_u = (j + 1) * height_scale - + if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - width_scale if index_y_u > img_h: index_y_u = img_h index_y_d = img_h - height_scale - + img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] label_patch = label[index_y_d:index_y_u, index_x_d:index_x_u, :] - - # img_patch=resize_image(img_patch,height,width) - # label_patch=resize_image(label_patch,height,width) - + cv2.imwrite(dir_img_f + '/img_' + str(indexer) + '.png', img_patch) cv2.imwrite(dir_seg_f + '/img_' + str(indexer) + '.png', label_patch) indexer += 1 @@ -366,78 +421,65 @@ def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, i return indexer -def provide_patches(dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, - input_height, input_width, blur_k, blur_aug, - flip_aug, binarization, scaling, scales, flip_index, - scaling_bluring, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, - augmentation=False, patches=False): - imgs_cv_train = 
np.array(os.listdir(dir_img)) - segs_cv_train = np.array(os.listdir(dir_seg)) - +def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow_train_imgs, + dir_flow_train_labels, input_height, input_width, blur_k, blur_aug, + padding_white, padding_black, flip_aug, binarization, scaling, degrading, + brightening, scales, degrade_scales, brightness, flip_index, + scaling_bluring, scaling_brightness, scaling_binarization, rotation, + rotation_not_90, thetha, scaling_flip, augmentation=False, patches=False): + indexer = 0 - for im, seg_i in tqdm(zip(imgs_cv_train, segs_cv_train)): + for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): img_name = im.split('.')[0] if not patches: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_img + '/' + im), input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_img + '/' + im), input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) indexer += 1 - + if augmentation: if flip_aug: for f_i in flip_index: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_img + '/' + im), f_i), input_height, - input_width)) - + resize_image(cv2.flip(cv2.imread(dir_img+'/'+im),f_i),input_height,input_width) ) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), - input_height, input_width)) + resize_image(cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), input_height, input_width)) indexer += 1 - - if blur_aug: + + if blur_aug: for blur_i in blur_k: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(bluring(cv2.imread(dir_img + '/' + im), blur_i), input_height, - input_width))) - + (resize_image(bluring(cv2.imread(dir_img + '/' + im), blur_i), input_height, input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, - input_width)) + resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) indexer += 1 - + if binarization: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) - + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) indexer += 1 - + + if patches: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, cv2.imread(dir_img + '/' + im), cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width, indexer=indexer) - + if augmentation: - if rotation: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - rotation_90(cv2.imread(dir_img + '/' + im)), - rotation_90(cv2.imread(dir_seg + '/' + img_name + '.png')), - input_height, input_width, indexer=indexer) - + rotation_90(cv2.imread(dir_img + '/' + im)), + rotation_90(cv2.imread(dir_seg + '/' + img_name + '.png')), + input_height, input_width, indexer=indexer) + if rotation_not_90: - for thetha_i in thetha: - img_max_rotated, 
label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/' + im), - cv2.imread( - dir_seg + '/' + img_name + '.png'), - thetha_i) + img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), + cv2.imread(dir_seg + '/'+img_name + '.png'), thetha_i) indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, img_max_rotated, label_max_rotated, @@ -448,47 +490,84 @@ def provide_patches(dir_img, dir_seg, dir_flow_train_imgs, cv2.flip(cv2.imread(dir_img + '/' + im), f_i), cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), input_height, input_width, indexer=indexer) - if blur_aug: + if blur_aug: for blur_i in blur_k: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, bluring(cv2.imread(dir_img + '/' + im), blur_i), cv2.imread(dir_seg + '/' + img_name + '.png'), - input_height, input_width, indexer=indexer) - - if scaling: + input_height, input_width, indexer=indexer) + if padding_black: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + do_padding_black(cv2.imread(dir_img + '/' + im)), + do_padding_label(cv2.imread(dir_seg + '/' + img_name + '.png')), + input_height, input_width, indexer=indexer) + + if padding_white: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + do_padding_white(cv2.imread(dir_img + '/'+im)), + do_padding_label(cv2.imread(dir_seg + '/' + img_name + '.png')), + input_height, input_width, indexer=indexer) + + if brightening: + for factor in brightness: + try: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + do_brightening(dir_img + '/' +im, factor), + cv2.imread(dir_seg + '/' + img_name + '.png'), + input_height, input_width, indexer=indexer) + except: + pass + if scaling: for sc_ind in scales: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im), + cv2.imread(dir_img + '/' + im) , cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width, indexer=indexer, scaler=sc_ind) + + if degrading: + for degrade_scale_ind in degrade_scales: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), + cv2.imread(dir_seg + '/' + img_name + '.png'), + input_height, input_width, indexer=indexer) + if binarization: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, otsu_copy(cv2.imread(dir_img + '/' + im)), cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width, indexer=indexer) - if scaling_bluring: + if scaling_brightness: + for sc_ind in scales: + for factor in brightness: + try: + indexer = get_patches_num_scale_new(dir_flow_train_imgs, + dir_flow_train_labels, + do_brightening(dir_img + '/' + im, factor) + ,cv2.imread(dir_seg + '/' + img_name + '.png') + ,input_height, input_width, indexer=indexer, scaler=sc_ind) + except: + pass + + if scaling_bluring: for sc_ind in scales: for blur_i in blur_k: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, bluring(cv2.imread(dir_img + '/' + im), blur_i), cv2.imread(dir_seg + '/' + img_name + '.png'), - input_height, input_width, indexer=indexer, - scaler=sc_ind) + input_height, input_width, indexer=indexer, scaler=sc_ind) - if scaling_binarization: + if scaling_binarization: for sc_ind in scales: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, otsu_copy(cv2.imread(dir_img + '/' + im)), cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width, indexer=indexer, 
scaler=sc_ind) - - if scaling_flip: + + if scaling_flip: for sc_ind in scales: for f_i in flip_index: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.flip(cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), - f_i), - input_height, input_width, indexer=indexer, - scaler=sc_ind) + cv2.flip( cv2.imread(dir_img + '/' + im), f_i), + cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), + input_height, input_width, indexer=indexer, scaler=sc_ind) From dbb84507ed4d56fa88e4a44b4487737d13a2ae43 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 29 Apr 2024 20:59:36 +0200 Subject: [PATCH 02/62] integrating first working classification training model --- config_params.json | 20 +-- models.py | 69 ++++++++- requirements.txt | 1 + train.py | 356 +++++++++++++++++++++++++++------------------ utils.py | 113 ++++++++++++++ 5 files changed, 410 insertions(+), 149 deletions(-) diff --git a/config_params.json b/config_params.json index bd47a52..43ad1bc 100644 --- a/config_params.json +++ b/config_params.json @@ -1,13 +1,15 @@ { - "model_name" : "hybrid_transformer_cnn", + "model_name" : "resnet50_unet", + "task": "classification", "n_classes" : 2, - "n_epochs" : 2, - "input_height" : 448, - "input_width" : 448, + "n_epochs" : 7, + "input_height" : 224, + "input_width" : 224, "weight_decay" : 1e-6, - "n_batch" : 2, + "n_batch" : 6, "learning_rate": 1e-4, - "patches" : true, + "f1_threshold_classification": 0.8, + "patches" : false, "pretraining" : true, "augmentation" : false, "flip_aug" : false, @@ -33,7 +35,7 @@ "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "/train", - "dir_eval": "/eval", - "dir_output": "/out" + "dir_train": "/home/vahid/Downloads/image_classification_data/train", + "dir_eval": "/home/vahid/Downloads/image_classification_data/eval", + "dir_output": "/home/vahid/Downloads/image_classification_data/output" } diff --git a/models.py b/models.py index f7a7ad8..a6de1ef 100644 --- a/models.py +++ b/models.py @@ -400,7 +400,7 @@ def vit_resnet50_unet(n_classes,patch_size, num_patches, input_height=224,input_ f5 = x if pretraining: - model = keras.Model(inputs, x).load_weights(resnet50_Weights_path) + model = Model(inputs, x).load_weights(resnet50_Weights_path) num_patches = x.shape[1]*x.shape[2] patches = Patches(patch_size)(x) @@ -468,6 +468,71 @@ def vit_resnet50_unet(n_classes,patch_size, num_patches, input_height=224,input_ o = (BatchNormalization(axis=bn_axis))(o) o = (Activation('softmax'))(o) - model = keras.Model(inputs=inputs, outputs=o) + model = Model(inputs=inputs, outputs=o) + return model + +def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): + include_top=True + assert input_height%32 == 0 + assert input_width%32 == 0 + + + img_input = Input(shape=(input_height,input_width , 3 )) + + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + + x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) + x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) + f1 = x + + x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) + x = Activation('relu')(x) + x = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x) + + + x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) + x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') + x = identity_block(x, 3, [64, 64, 
256], stage=2, block='c') + f2 = one_side_pad(x ) + + + x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') + f3 = x + + x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') + f4 = x + + x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') + f5 = x + + if pretraining: + Model(img_input, x).load_weights(resnet50_Weights_path) + + x = AveragePooling2D((7, 7), name='avg_pool')(x) + x = Flatten()(x) + + ## + x = Dense(256, activation='relu', name='fc512')(x) + x=Dropout(0.2)(x) + ## + x = Dense(n_classes, activation='softmax', name='fc1000')(x) + model = Model(img_input, x) + + + + return model diff --git a/requirements.txt b/requirements.txt index 20b6a32..3e56438 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tqdm imutils numpy scipy +scikit-learn diff --git a/train.py b/train.py index 6e6a172..efcd3ac 100644 --- a/train.py +++ b/train.py @@ -11,6 +11,7 @@ from metrics import * from tensorflow.keras.models import load_model from tqdm import tqdm import json +from sklearn.metrics import f1_score def configuration(): @@ -73,6 +74,8 @@ def config_params(): is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". + task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. + f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. 
@ex.automain @@ -86,162 +89,239 @@ def run(_config, n_classes, n_epochs, input_height, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, continue_training, transformer_patchsize, num_patches_xy, model_name, flip_index, dir_eval, dir_output, - pretraining, learning_rate): + pretraining, learning_rate, task, f1_threshold_classification): - num_patches = num_patches_xy[0]*num_patches_xy[1] - if data_is_provided: - dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') + if task == "segmentation": + + num_patches = num_patches_xy[0]*num_patches_xy[1] + if data_is_provided: + dir_train_flowing = os.path.join(dir_output, 'train') + dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') + dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') + dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') + dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') + dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') - configuration() + configuration() - else: - dir_img, dir_seg = get_dirs_or_files(dir_train) - dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) + else: + dir_img, dir_seg = get_dirs_or_files(dir_train) + dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) - # make first a directory in output for both training and evaluations in order to flow data from these directories. - dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') + # make first a directory in output for both training and evaluations in order to flow data from these directories. 
+ dir_train_flowing = os.path.join(dir_output, 'train') + dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images/') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels/') + dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images/') + dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels/') - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images/') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels/') + dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images/') + dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels/') - if os.path.isdir(dir_train_flowing): - os.system('rm -rf ' + dir_train_flowing) - os.makedirs(dir_train_flowing) - else: - os.makedirs(dir_train_flowing) + if os.path.isdir(dir_train_flowing): + os.system('rm -rf ' + dir_train_flowing) + os.makedirs(dir_train_flowing) + else: + os.makedirs(dir_train_flowing) - if os.path.isdir(dir_eval_flowing): - os.system('rm -rf ' + dir_eval_flowing) - os.makedirs(dir_eval_flowing) - else: - os.makedirs(dir_eval_flowing) + if os.path.isdir(dir_eval_flowing): + os.system('rm -rf ' + dir_eval_flowing) + os.makedirs(dir_eval_flowing) + else: + os.makedirs(dir_eval_flowing) - os.mkdir(dir_flow_train_imgs) - os.mkdir(dir_flow_train_labels) + os.mkdir(dir_flow_train_imgs) + os.mkdir(dir_flow_train_labels) - os.mkdir(dir_flow_eval_imgs) - os.mkdir(dir_flow_eval_labels) + os.mkdir(dir_flow_eval_imgs) + os.mkdir(dir_flow_eval_labels) - # set the gpu configuration - configuration() + # set the gpu configuration + configuration() + + imgs_list=np.array(os.listdir(dir_img)) + segs_list=np.array(os.listdir(dir_seg)) + + imgs_list_test=np.array(os.listdir(dir_img_val)) + segs_list_test=np.array(os.listdir(dir_seg_val)) + + # writing patches into a sub-folder in order to be flowed from directory. 
+ provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, + dir_flow_train_labels, input_height, input_width, blur_k, + blur_aug, padding_white, padding_black, flip_aug, binarization, + scaling, degrading, brightening, scales, degrade_scales, brightness, + flip_index, scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, scaling_flip, augmentation=augmentation, + patches=patches) + + provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, + dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, + blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, + scaling, degrading, brightening, scales, degrade_scales, brightness, + flip_index, scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, scaling_flip, augmentation=False, patches=patches) + + if weighted_loss: + weights = np.zeros(n_classes) + if data_is_provided: + for obj in os.listdir(dir_flow_train_labels): + try: + label_obj = cv2.imread(dir_flow_train_labels + '/' + obj) + label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) + weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) + except: + pass + else: + + for obj in os.listdir(dir_seg): + try: + label_obj = cv2.imread(dir_seg + '/' + obj) + label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) + weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) + except: + pass + + weights = 1.00 / weights + + weights = weights / float(np.sum(weights)) + weights = weights / float(np.min(weights)) + weights = weights / float(np.sum(weights)) + + if continue_training: + if model_name=='resnet50_unet': + if is_loss_soft_dice: + model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) + if weighted_loss: + model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + if not is_loss_soft_dice and not weighted_loss: + model = load_model(dir_of_start_model , compile=True) + elif model_name=='hybrid_transformer_cnn': + if is_loss_soft_dice: + model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) + if weighted_loss: + model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + if not is_loss_soft_dice and not weighted_loss: + model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) + else: + index_start = 0 + if model_name=='resnet50_unet': + model = resnet50_unet(n_classes, input_height, input_width,weight_decay,pretraining) + elif model_name=='hybrid_transformer_cnn': + model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width,weight_decay,pretraining) + + #if you want to see the model structure just uncomment model summary. 
+ #model.summary() - imgs_list=np.array(os.listdir(dir_img)) - segs_list=np.array(os.listdir(dir_seg)) + + if not is_loss_soft_dice and not weighted_loss: + model.compile(loss='categorical_crossentropy', + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + if is_loss_soft_dice: + model.compile(loss=soft_dice_loss, + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + if weighted_loss: + model.compile(loss=weighted_categorical_crossentropy(weights), + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - imgs_list_test=np.array(os.listdir(dir_img_val)) - segs_list_test=np.array(os.listdir(dir_seg_val)) - - # writing patches into a sub-folder in order to be flowed from directory. - provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, - scaling, degrading, brightening, scales, degrade_scales, brightness, - flip_index, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, augmentation=augmentation, - patches=patches) + # generating train and evaluation data + train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, + input_height=input_height, input_width=input_width, n_classes=n_classes) + val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, + input_height=input_height, input_width=input_width, n_classes=n_classes) - provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, - dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, - scaling, degrading, brightening, scales, degrade_scales, brightness, - flip_index, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, augmentation=False, patches=patches) - - if weighted_loss: - weights = np.zeros(n_classes) - if data_is_provided: - for obj in os.listdir(dir_flow_train_labels): - try: - label_obj = cv2.imread(dir_flow_train_labels + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass - else: + ##img_validation_patches = os.listdir(dir_flow_eval_imgs) + ##score_best=[] + ##score_best.append(0) + for i in tqdm(range(index_start, n_epochs + index_start)): + model.fit_generator( + train_gen, + steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, + validation_data=val_gen, + validation_steps=1, + epochs=1) + model.save(dir_output+'/'+'model_'+str(i)) + + with open(dir_output+'/'+'model_'+str(i)+'/'+"config.json", "w") as fp: + json.dump(_config, fp) # encode dict into JSON - for obj in os.listdir(dir_seg): - try: - label_obj = cv2.imread(dir_seg + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass - - weights = 1.00 / weights - - weights = weights / float(np.sum(weights)) - weights = weights / float(np.min(weights)) - weights = weights / float(np.sum(weights)) - - if continue_training: - if model_name=='resnet50_unet': - if is_loss_soft_dice: - model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss: - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': 
weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: - model = load_model(dir_of_start_model , compile=True) - elif model_name=='hybrid_transformer_cnn': - if is_loss_soft_dice: - model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) - if weighted_loss: - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: - model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) - else: - index_start = 0 - if model_name=='resnet50_unet': - model = resnet50_unet(n_classes, input_height, input_width,weight_decay,pretraining) - elif model_name=='hybrid_transformer_cnn': - model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width,weight_decay,pretraining) - - #if you want to see the model structure just uncomment model summary. - #model.summary() - + #os.system('rm -rf '+dir_train_flowing) + #os.system('rm -rf '+dir_eval_flowing) + + #model.save(dir_output+'/'+'model'+'.h5') + elif task=='classification': + configuration() + model = resnet50_classifier(n_classes, input_height, input_width,weight_decay,pretraining) - if not is_loss_soft_dice and not weighted_loss: + opt_adam = Adam(learning_rate=0.001) model.compile(loss='categorical_crossentropy', - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if is_loss_soft_dice: - model.compile(loss=soft_dice_loss, - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if weighted_loss: - model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - - # generating train and evaluation data - train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes) - val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes) - - ##img_validation_patches = os.listdir(dir_flow_eval_imgs) - ##score_best=[] - ##score_best.append(0) - for i in tqdm(range(index_start, n_epochs + index_start)): - model.fit_generator( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1) - model.save(dir_output+'/'+'model_'+str(i)) - - with open(dir_output+'/'+'model_'+str(i)+'/'+"config.json", "w") as fp: - json.dump(_config, fp) # encode dict into JSON + optimizer = opt_adam,metrics=['accuracy']) + + + testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes) + + #print(testY.shape, testY) + + y_tot=np.zeros((testX.shape[0],n_classes)) + indexer=0 - #os.system('rm -rf '+dir_train_flowing) - #os.system('rm -rf '+dir_eval_flowing) + score_best=[] + score_best.append(0) - #model.save(dir_output+'/'+'model'+'.h5') + num_rows = return_number_of_total_training_data(dir_train) + + weights=[] + + for i in range(n_epochs): + #history = model.fit(trainX, trainY, epochs=1, batch_size=n_batch, validation_data=(testX, testY), verbose=2)#,class_weight=weights) + history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes), steps_per_epoch=num_rows / n_batch, verbose=0)#,class_weight=weights) + + 
y_pr_class = [] + for jj in range(testY.shape[0]): + y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) + y_pr_ind= np.argmax(y_pr,axis=1) + #print(y_pr_ind, 'y_pr_ind') + y_pr_class.append(y_pr_ind) + + + y_pr_class = np.array(y_pr_class) + #model.save('./models_save/model_'+str(i)+'.h5') + #y_pr_class=np.argmax(y_pr,axis=1) + f1score=f1_score(np.argmax(testY,axis=1), y_pr_class, average='macro') + + print(i,f1score) + + if f1score>score_best[0]: + score_best[0]=f1score + model.save(os.path.join(dir_output,'model_best')) + + + ##best_model=keras.models.clone_model(model) + ##best_model.build() + ##best_model.set_weights(model.get_weights()) + if f1score > f1_threshold_classification: + weights.append(model.get_weights() ) + y_tot=y_tot+y_pr + + indexer+=1 + y_tot=y_tot/float(indexer) + + + new_weights=list() + + for weights_list_tuple in zip(*weights): + new_weights.append( [np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)] ) + + new_weights = [np.array(x) for x in new_weights] + + model_weight_averaged=tf.keras.models.clone_model(model) + + model_weight_averaged.set_weights(new_weights) + + #y_tot_end=np.argmax(y_tot,axis=1) + #print(f1_score(np.argmax(testY,axis=1), y_tot_end, average='macro')) + + ##best_model.save('model_taza.h5') + model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) + diff --git a/utils.py b/utils.py index c2786ec..af3c5f8 100644 --- a/utils.py +++ b/utils.py @@ -8,6 +8,119 @@ import random from tqdm import tqdm import imutils import math +from tensorflow.keras.utils import to_categorical + + +def return_number_of_total_training_data(path_classes): + sub_classes = os.listdir(path_classes) + n_tot = 0 + for sub_c in sub_classes: + sub_files = os.listdir(os.path.join(path_classes,sub_c)) + n_tot = n_tot + len(sub_files) + return n_tot + + + +def generate_data_from_folder_evaluation(path_classes, height, width, n_classes): + sub_classes = os.listdir(path_classes) + #n_classes = len(sub_classes) + all_imgs = [] + labels = [] + dicts =dict() + indexer= 0 + for sub_c in sub_classes: + sub_files = os.listdir(os.path.join(path_classes,sub_c )) + sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] + #print( os.listdir(os.path.join(path_classes,sub_c )) ) + all_imgs = all_imgs + sub_files + sub_labels = list( np.zeros( len(sub_files) ) +indexer ) + + #print( len(sub_labels) ) + labels = labels + sub_labels + dicts[sub_c] = indexer + indexer +=1 + + + categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] + ret_x= np.zeros((len(labels), height,width, 3)).astype(np.int16) + ret_y= np.zeros((len(labels), n_classes)).astype(np.int16) + + #print(all_imgs) + for i in range(len(all_imgs)): + row = all_imgs[i] + #####img = cv2.imread(row, 0) + #####img= resize_image (img, height, width) + #####img = img.astype(np.uint16) + #####ret_x[i, :,:,0] = img[:,:] + #####ret_x[i, :,:,1] = img[:,:] + #####ret_x[i, :,:,2] = img[:,:] + + img = cv2.imread(row) + img= resize_image (img, height, width) + img = img.astype(np.uint16) + ret_x[i, :,:] = img[:,:,:] + + ret_y[i, :] = categories[ int( labels[i] ) ][:] + + return ret_x/255., ret_y + +def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes): + sub_classes = os.listdir(path_classes) + n_classes = len(sub_classes) + + all_imgs = [] + labels = [] + dicts 
=dict() + indexer= 0 + for sub_c in sub_classes: + sub_files = os.listdir(os.path.join(path_classes,sub_c )) + sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] + #print( os.listdir(os.path.join(path_classes,sub_c )) ) + all_imgs = all_imgs + sub_files + sub_labels = list( np.zeros( len(sub_files) ) +indexer ) + + #print( len(sub_labels) ) + labels = labels + sub_labels + dicts[sub_c] = indexer + indexer +=1 + + ids = np.array(range(len(labels))) + random.shuffle(ids) + + shuffled_labels = np.array(labels)[ids] + shuffled_files = np.array(all_imgs)[ids] + categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] + ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) + ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + batchcount = 0 + while True: + for i in range(len(shuffled_files)): + row = shuffled_files[i] + #print(row) + ###img = cv2.imread(row, 0) + ###img= resize_image (img, height, width) + ###img = img.astype(np.uint16) + ###ret_x[batchcount, :,:,0] = img[:,:] + ###ret_x[batchcount, :,:,1] = img[:,:] + ###ret_x[batchcount, :,:,2] = img[:,:] + + img = cv2.imread(row) + img= resize_image (img, height, width) + img = img.astype(np.uint16) + ret_x[batchcount, :,:,:] = img[:,:,:] + + #print(int(shuffled_labels[i]) ) + #print( categories[int(shuffled_labels[i])] ) + ret_y[batchcount, :] = categories[ int( shuffled_labels[i] ) ][:] + + batchcount+=1 + + if batchcount>=batchsize: + ret_x = ret_x/255. + yield (ret_x, ret_y) + ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) + ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + batchcount = 0 def do_brightening(img_in_dir, factor): im = Image.open(img_in_dir) From 38db3e9289a1d5b2f17e26f7a857ee4030f3901e Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 6 May 2024 18:31:48 +0200 Subject: [PATCH 03/62] adding enhancement training --- config_params.json | 20 +++++------ gt_for_enhancement_creator.py | 31 ++++++++++++++++++ models.py | 27 ++++++++++----- train.py | 47 ++++++++++++++------------ utils.py | 62 +++++++++++++++++++---------------- 5 files changed, 119 insertions(+), 68 deletions(-) create mode 100644 gt_for_enhancement_creator.py diff --git a/config_params.json b/config_params.json index 43ad1bc..1c7a940 100644 --- a/config_params.json +++ b/config_params.json @@ -1,15 +1,15 @@ { "model_name" : "resnet50_unet", - "task": "classification", - "n_classes" : 2, - "n_epochs" : 7, - "input_height" : 224, - "input_width" : 224, + "task": "enhancement", + "n_classes" : 3, + "n_epochs" : 3, + "input_height" : 448, + "input_width" : 448, "weight_decay" : 1e-6, - "n_batch" : 6, + "n_batch" : 3, "learning_rate": 1e-4, "f1_threshold_classification": 0.8, - "patches" : false, + "patches" : true, "pretraining" : true, "augmentation" : false, "flip_aug" : false, @@ -35,7 +35,7 @@ "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "/home/vahid/Downloads/image_classification_data/train", - "dir_eval": "/home/vahid/Downloads/image_classification_data/eval", - "dir_output": "/home/vahid/Downloads/image_classification_data/output" + "dir_train": "./training_data_sample_enhancement", + "dir_eval": "./eval", + "dir_output": "./out" } diff --git a/gt_for_enhancement_creator.py b/gt_for_enhancement_creator.py new file mode 100644 index 0000000..9a4274f --- /dev/null +++ 
b/gt_for_enhancement_creator.py @@ -0,0 +1,31 @@ +import cv2 +import os + +def resize_image(seg_in, input_height, input_width): + return cv2.resize(seg_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) + + +dir_imgs = './training_data_sample_enhancement/images' +dir_out_imgs = './training_data_sample_enhancement/images_gt' +dir_out_labs = './training_data_sample_enhancement/labels_gt' + +ls_imgs = os.listdir(dir_imgs) + + +ls_scales = [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] + + +for img in ls_imgs: + img_name = img.split('.')[0] + img_type = img.split('.')[1] + image = cv2.imread(os.path.join(dir_imgs, img)) + for i, scale in enumerate(ls_scales): + height_sc = int(image.shape[0]*scale) + width_sc = int(image.shape[1]*scale) + + image_down_scaled = resize_image(image, height_sc, width_sc) + image_back_to_org_scale = resize_image(image_down_scaled, image.shape[0], image.shape[1]) + + cv2.imwrite(os.path.join(dir_out_imgs, img_name+'_'+str(i)+'.'+img_type), image_back_to_org_scale) + cv2.imwrite(os.path.join(dir_out_labs, img_name+'_'+str(i)+'.'+img_type), image) + diff --git a/models.py b/models.py index a6de1ef..4cceacd 100644 --- a/models.py +++ b/models.py @@ -168,7 +168,7 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)) return x -def resnet50_unet_light(n_classes, input_height=224, input_width=224, weight_decay=1e-6, pretraining=False): +def resnet50_unet_light(n_classes, input_height=224, input_width=224, taks="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 assert input_width % 32 == 0 @@ -259,14 +259,17 @@ def resnet50_unet_light(n_classes, input_height=224, input_width=224, weight_dec o = Activation('relu')(o) o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) + if task == "segmentation": + o = (BatchNormalization(axis=bn_axis))(o) + o = (Activation('softmax'))(o) + else: + o = (Activation('sigmoid'))(o) model = Model(img_input, o) return model -def resnet50_unet(n_classes, input_height=224, input_width=224, weight_decay=1e-6, pretraining=False): +def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 assert input_width % 32 == 0 @@ -354,15 +357,18 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, weight_decay=1e- o = Activation('relu')(o) o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) + if task == "segmentation": + o = (BatchNormalization(axis=bn_axis))(o) + o = (Activation('softmax'))(o) + else: + o = (Activation('sigmoid'))(o) model = Model(img_input, o) return model -def vit_resnet50_unet(n_classes,patch_size, num_patches, input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): +def vit_resnet50_unet(n_classes, patch_size, num_patches, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): inputs = layers.Input(shape=(input_height, input_width, 3)) IMAGE_ORDERING = 'channels_last' bn_axis=3 @@ -465,8 +471,11 @@ def vit_resnet50_unet(n_classes,patch_size, num_patches, input_height=224,input_ o = Activation('relu')(o) o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) - o = 
(BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) + if task == "segmentation": + o = (BatchNormalization(axis=bn_axis))(o) + o = (Activation('softmax'))(o) + else: + o = (Activation('sigmoid'))(o) model = Model(inputs=inputs, outputs=o) diff --git a/train.py b/train.py index efcd3ac..595debe 100644 --- a/train.py +++ b/train.py @@ -1,5 +1,6 @@ import os import sys +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' import tensorflow as tf from tensorflow.compat.v1.keras.backend import set_session import warnings @@ -91,7 +92,7 @@ def run(_config, n_classes, n_epochs, input_height, num_patches_xy, model_name, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification): - if task == "segmentation": + if task == "segmentation" or "enhancement": num_patches = num_patches_xy[0]*num_patches_xy[1] if data_is_provided: @@ -153,7 +154,7 @@ def run(_config, n_classes, n_epochs, input_height, blur_aug, padding_white, padding_black, flip_aug, binarization, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, augmentation=augmentation, + rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, patches=patches) provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, @@ -161,7 +162,7 @@ def run(_config, n_classes, n_epochs, input_height, blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, augmentation=False, patches=patches) + rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches) if weighted_loss: weights = np.zeros(n_classes) @@ -191,45 +192,49 @@ def run(_config, n_classes, n_epochs, input_height, if continue_training: if model_name=='resnet50_unet': - if is_loss_soft_dice: + if is_loss_soft_dice and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss: + if weighted_loss and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) if not is_loss_soft_dice and not weighted_loss: model = load_model(dir_of_start_model , compile=True) elif model_name=='hybrid_transformer_cnn': - if is_loss_soft_dice: + if is_loss_soft_dice and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) - if weighted_loss: + if weighted_loss and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) if not is_loss_soft_dice and not weighted_loss: model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: index_start = 0 if model_name=='resnet50_unet': - model = resnet50_unet(n_classes, input_height, input_width,weight_decay,pretraining) + model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) elif model_name=='hybrid_transformer_cnn': - model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width,weight_decay,pretraining) + 
model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width, task, weight_decay, pretraining) #if you want to see the model structure just uncomment model summary. #model.summary() - - if not is_loss_soft_dice and not weighted_loss: - model.compile(loss='categorical_crossentropy', - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if is_loss_soft_dice: - model.compile(loss=soft_dice_loss, - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) - if weighted_loss: - model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + if task == "segmentation": + if not is_loss_soft_dice and not weighted_loss: + model.compile(loss='categorical_crossentropy', + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + if is_loss_soft_dice: + model.compile(loss=soft_dice_loss, + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + if weighted_loss: + model.compile(loss=weighted_categorical_crossentropy(weights), + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + elif task == "enhancement": + model.compile(loss='mean_squared_error', + optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + # generating train and evaluation data train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes) + input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes) + input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] diff --git a/utils.py b/utils.py index af3c5f8..0c5a458 100644 --- a/utils.py +++ b/utils.py @@ -268,7 +268,7 @@ def IoU(Yi, y_predi): return mIoU -def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_classes): +def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_classes, task='segmentation'): c = 0 n = [f for f in os.listdir(img_folder) if not f.startswith('.')] # os.listdir(img_folder) #List of training images random.shuffle(n) @@ -277,8 +277,6 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c mask = np.zeros((batch_size, input_height, input_width, n_classes)).astype('float') for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. - # print(img_folder+'/'+n[i]) - try: filename = n[i].split('.')[0] @@ -287,11 +285,14 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize img[i - c] = train_img # add to array - img[0], img[1], and so on. - train_mask = cv2.imread(mask_folder + '/' + filename + '.png') - # print(mask_folder+'/'+filename+'.png') - # print(train_mask.shape) - train_mask = get_one_hot(resize_image(train_mask, input_height, input_width), input_height, input_width, - n_classes) + if task == "segmentation": + train_mask = cv2.imread(mask_folder + '/' + filename + '.png') + train_mask = get_one_hot(resize_image(train_mask, input_height, input_width), input_height, input_width, + n_classes) + elif task == "enhancement": + train_mask = cv2.imread(mask_folder + '/' + filename + '.png')/255. 
+ train_mask = resize_image(train_mask, input_height, input_width) + # train_mask = train_mask.reshape(224, 224, 1) # Add extra dimension for parity with train_img size [512 * 512 * 3] mask[i - c] = train_mask @@ -539,14 +540,19 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow padding_white, padding_black, flip_aug, binarization, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, scaling_bluring, scaling_brightness, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, augmentation=False, patches=False): + rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False): indexer = 0 for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): img_name = im.split('.')[0] + if task == "segmentation": + dir_of_label_file = os.path.join(dir_seg, img_name + '.png') + elif task=="enhancement": + dir_of_label_file = os.path.join(dir_seg, im) + if not patches: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_img + '/' + im), input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 if augmentation: @@ -556,7 +562,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow resize_image(cv2.flip(cv2.imread(dir_img+'/'+im),f_i),input_height,input_width) ) cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), input_height, input_width)) + resize_image(cv2.flip(cv2.imread(dir_of_label_file), f_i), input_height, input_width)) indexer += 1 if blur_aug: @@ -565,7 +571,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow (resize_image(bluring(cv2.imread(dir_img + '/' + im), blur_i), input_height, input_width))) cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 if binarization: @@ -573,26 +579,26 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_seg + '/' + img_name + '.png'), input_height, input_width)) + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 if patches: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im), cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_img + '/' + im), cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer) if augmentation: if rotation: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, rotation_90(cv2.imread(dir_img + '/' + im)), - rotation_90(cv2.imread(dir_seg + '/' + img_name + '.png')), + rotation_90(cv2.imread(dir_of_label_file)), input_height, input_width, indexer=indexer) if rotation_not_90: for thetha_i in thetha: img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_seg + '/'+img_name + '.png'), thetha_i) + 
cv2.imread(dir_of_label_file), thetha_i) indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, img_max_rotated, label_max_rotated, @@ -601,24 +607,24 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow for f_i in flip_index: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, cv2.flip(cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), + cv2.flip(cv2.imread(dir_of_label_file), f_i), input_height, input_width, indexer=indexer) if blur_aug: for blur_i in blur_k: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer) if padding_black: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, do_padding_black(cv2.imread(dir_img + '/' + im)), - do_padding_label(cv2.imread(dir_seg + '/' + img_name + '.png')), + do_padding_label(cv2.imread(dir_of_label_file)), input_height, input_width, indexer=indexer) if padding_white: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, do_padding_white(cv2.imread(dir_img + '/'+im)), - do_padding_label(cv2.imread(dir_seg + '/' + img_name + '.png')), + do_padding_label(cv2.imread(dir_of_label_file)), input_height, input_width, indexer=indexer) if brightening: @@ -626,7 +632,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow try: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, do_brightening(dir_img + '/' +im, factor), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer) except: pass @@ -634,20 +640,20 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow for sc_ind in scales: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, cv2.imread(dir_img + '/' + im) , - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer, scaler=sc_ind) if degrading: for degrade_scale_ind in degrade_scales: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer) if binarization: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer) if scaling_brightness: @@ -657,7 +663,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, do_brightening(dir_img + '/' + im, factor) - ,cv2.imread(dir_seg + '/' + img_name + '.png') + ,cv2.imread(dir_of_label_file) ,input_height, input_width, indexer=indexer, scaler=sc_ind) except: pass @@ -667,14 +673,14 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow for blur_i in blur_k: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer, scaler=sc_ind) if scaling_binarization: for sc_ind in 
scales: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_seg + '/' + img_name + '.png'), + cv2.imread(dir_of_label_file), input_height, input_width, indexer=indexer, scaler=sc_ind) if scaling_flip: @@ -682,5 +688,5 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow for f_i in flip_index: indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, cv2.flip( cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_seg + '/' + img_name + '.png'), f_i), + cv2.flip(cv2.imread(dir_of_label_file), f_i), input_height, input_width, indexer=indexer, scaler=sc_ind) From 8d1050ec308109f9e1021060e1d616a65f32f964 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 7 May 2024 13:34:03 +0200 Subject: [PATCH 04/62] inference script is added --- config_params.json | 17 +- inference.py | 490 +++++++++++++++++++++++++++++++++++++++++++++ train.py | 42 ++-- utils.py | 30 +-- 4 files changed, 537 insertions(+), 42 deletions(-) create mode 100644 inference.py diff --git a/config_params.json b/config_params.json index 1c7a940..8a56de5 100644 --- a/config_params.json +++ b/config_params.json @@ -1,12 +1,12 @@ { - "model_name" : "resnet50_unet", - "task": "enhancement", - "n_classes" : 3, - "n_epochs" : 3, + "backbone_type" : "nontransformer", + "task": "classification", + "n_classes" : 2, + "n_epochs" : 20, "input_height" : 448, "input_width" : 448, "weight_decay" : 1e-6, - "n_batch" : 3, + "n_batch" : 6, "learning_rate": 1e-4, "f1_threshold_classification": 0.8, "patches" : true, @@ -21,7 +21,7 @@ "scaling_flip" : false, "rotation": false, "rotation_not_90": false, - "num_patches_xy": [28, 28], + "transformer_num_patches_xy": [28, 28], "transformer_patchsize": 1, "blur_k" : ["blur","guass","median"], "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], @@ -29,13 +29,14 @@ "degrade_scales" : [0.2, 0.4], "flip_index" : [0, 1, -1], "thetha" : [10, -10], + "classification_classes_name" : {"0":"apple", "1":"orange"}, "continue_training": false, "index_start" : 0, "dir_of_start_model" : " ", "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "./training_data_sample_enhancement", + "dir_train": "./train", "dir_eval": "./eval", - "dir_output": "./out" + "dir_output": "./output" } diff --git a/inference.py b/inference.py new file mode 100644 index 0000000..6911bea --- /dev/null +++ b/inference.py @@ -0,0 +1,490 @@ +#! /usr/bin/env python3 + +__version__= '1.0' + +import argparse +import sys +import os +import numpy as np +import warnings +import xml.etree.ElementTree as et +import pandas as pd +from tqdm import tqdm +import csv +import cv2 +import seaborn as sns +import matplotlib.pyplot as plt +from tensorflow.keras.models import load_model +import tensorflow as tf +from tensorflow.keras import backend as K +from tensorflow.keras import layers +import tensorflow.keras.losses +from tensorflow.keras.layers import * +import click +import json +from tensorflow.python.keras import backend as tensorflow_backend + + + + + + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + +__doc__=\ +""" +Tool to load model and predict for given image. 
+""" + +projection_dim = 64 +patch_size = 1 +num_patches =28*28 +class Patches(layers.Layer): + def __init__(self, **kwargs): + super(Patches, self).__init__() + self.patch_size = patch_size + + def call(self, images): + print(tf.shape(images)[1],'images') + print(self.patch_size,'self.patch_size') + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size, self.patch_size, 1], + strides=[1, self.patch_size, self.patch_size, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + patch_dims = patches.shape[-1] + print(patches.shape,patch_dims,'patch_dims') + patches = tf.reshape(patches, [batch_size, -1, patch_dims]) + return patches + def get_config(self): + + config = super().get_config().copy() + config.update({ + 'patch_size': self.patch_size, + }) + return config + + +class PatchEncoder(layers.Layer): + def __init__(self, **kwargs): + super(PatchEncoder, self).__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + def call(self, patch): + positions = tf.range(start=0, limit=self.num_patches, delta=1) + encoded = self.projection(patch) + self.position_embedding(positions) + return encoded + def get_config(self): + + config = super().get_config().copy() + config.update({ + 'num_patches': self.num_patches, + 'projection': self.projection, + 'position_embedding': self.position_embedding, + }) + return config + + +class sbb_predict: + def __init__(self,image, model, task, config_params_model, patches='false',save='false', ground_truth=None,weights_dir=None ): + self.image=image + self.patches=patches + self.save=save + self.model_dir=model + self.ground_truth=ground_truth + self.weights_dir=weights_dir + self.task=task + self.config_params_model=config_params_model + + def resize_image(self,img_in,input_height,input_width): + return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) + + + def color_images(self,seg): + ann_u=range(self.n_classes) + if len(np.shape(seg))==3: + seg=seg[:,:,0] + + seg_img=np.zeros((np.shape(seg)[0],np.shape(seg)[1],3)).astype(np.uint8) + colors=sns.color_palette("hls", self.n_classes) + + for c in ann_u: + c=int(c) + segl=(seg==c) + seg_img[:,:,0][seg==c]=c + seg_img[:,:,1][seg==c]=c + seg_img[:,:,2][seg==c]=c + return seg_img + + def otsu_copy_binary(self,img): + img_r=np.zeros((img.shape[0],img.shape[1],3)) + img1=img[:,:,0] + + #print(img.min()) + #print(img[:,:,0].min()) + #blur = cv2.GaussianBlur(img,(5,5)) + #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + + + + img_r[:,:,0]=threshold1 + img_r[:,:,1]=threshold1 + img_r[:,:,2]=threshold1 + #img_r=img_r/float(np.max(img_r))*255 + return img_r + + def otsu_copy(self,img): + img_r=np.zeros((img.shape[0],img.shape[1],3)) + #img1=img[:,:,0] + + #print(img.min()) + #print(img[:,:,0].min()) + #blur = cv2.GaussianBlur(img,(5,5)) + #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, threshold1 = cv2.threshold(img[:,:,0], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, threshold2 = cv2.threshold(img[:,:,1], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, threshold3 = cv2.threshold(img[:,:,2], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + + + + img_r[:,:,0]=threshold1 + img_r[:,:,1]=threshold2 + img_r[:,:,2]=threshold3 + 
###img_r=img_r/float(np.max(img_r))*255 + return img_r + + def soft_dice_loss(self,y_true, y_pred, epsilon=1e-6): + + axes = tuple(range(1, len(y_pred.shape)-1)) + + numerator = 2. * K.sum(y_pred * y_true, axes) + + denominator = K.sum(K.square(y_pred) + K.square(y_true), axes) + return 1.00 - K.mean(numerator / (denominator + epsilon)) # average over classes and batch + + def weighted_categorical_crossentropy(self,weights=None): + + def loss(y_true, y_pred): + labels_floats = tf.cast(y_true, tf.float32) + per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_floats,logits=y_pred) + + if weights is not None: + weight_mask = tf.maximum(tf.reduce_max(tf.constant( + np.array(weights, dtype=np.float32)[None, None, None]) + * labels_floats, axis=-1), 1.0) + per_pixel_loss = per_pixel_loss * weight_mask[:, :, :, None] + return tf.reduce_mean(per_pixel_loss) + return self.loss + + + def IoU(self,Yi,y_predi): + ## mean Intersection over Union + ## Mean IoU = TP/(FN + TP + FP) + + IoUs = [] + Nclass = np.unique(Yi) + for c in Nclass: + TP = np.sum( (Yi == c)&(y_predi==c) ) + FP = np.sum( (Yi != c)&(y_predi==c) ) + FN = np.sum( (Yi == c)&(y_predi != c)) + IoU = TP/float(TP + FP + FN) + if self.n_classes>2: + print("class {:02.0f}: #TP={:6.0f}, #FP={:6.0f}, #FN={:5.0f}, IoU={:4.3f}".format(c,TP,FP,FN,IoU)) + IoUs.append(IoU) + if self.n_classes>2: + mIoU = np.mean(IoUs) + print("_________________") + print("Mean IoU: {:4.3f}".format(mIoU)) + return mIoU + elif self.n_classes==2: + mIoU = IoUs[1] + print("_________________") + print("IoU: {:4.3f}".format(mIoU)) + return mIoU + + def start_new_session_and_model(self): + + config = tf.compat.v1.ConfigProto() + config.gpu_options.allow_growth = True + + session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() + tensorflow_backend.set_session(session) + #tensorflow.keras.layers.custom_layer = PatchEncoder + #tensorflow.keras.layers.custom_layer = Patches + self.model = load_model(self.model_dir , compile=False,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) + #config = tf.ConfigProto() + #config.gpu_options.allow_growth=True + + #self.session = tf.InteractiveSession() + #keras.losses.custom_loss = self.weighted_categorical_crossentropy + #self.model = load_model(self.model_dir , compile=False) + + + ##if self.weights_dir!=None: + ##self.model.load_weights(self.weights_dir) + + if self.task != 'classification': + self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1] + self.img_width=self.model.layers[len(self.model.layers)-1].output_shape[2] + self.n_classes=self.model.layers[len(self.model.layers)-1].output_shape[3] + + def visualize_model_output(self, prediction, img, task): + if task == "binarization": + prediction = prediction * -1 + prediction = prediction + 1 + added_image = prediction * 255 + else: + unique_classes = np.unique(prediction[:,:,0]) + rgb_colors = {'0' : [255, 255, 255], + '1' : [255, 0, 0], + '2' : [255, 125, 0], + '3' : [255, 0, 125], + '4' : [125, 125, 125], + '5' : [125, 125, 0], + '6' : [0, 125, 255], + '7' : [0, 125, 0], + '8' : [125, 125, 125], + '9' : [0, 125, 255], + '10' : [125, 0, 125], + '11' : [0, 255, 0], + '12' : [0, 0, 255], + '13' : [0, 255, 255], + '14' : [255, 125, 125], + '15' : [255, 0, 255]} + + output = np.zeros(prediction.shape) + + for unq_class in unique_classes: + rgb_class_unique = rgb_colors[str(int(unq_class))] + output[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] + output[:,:,1][prediction[:,:,0]==unq_class] = 
rgb_class_unique[1] + output[:,:,2][prediction[:,:,0]==unq_class] = rgb_class_unique[2] + + + + img = self.resize_image(img, output.shape[0], output.shape[1]) + + output = output.astype(np.int32) + img = img.astype(np.int32) + + + + added_image = cv2.addWeighted(img,0.5,output,0.1,0) + + return added_image + + def predict(self): + self.start_new_session_and_model() + if self.task == 'classification': + classes_names = self.config_params_model['classification_classes_name'] + img_1ch = img=cv2.imread(self.image, 0) + + img_1ch = img_1ch / 255.0 + img_1ch = cv2.resize(img_1ch, (self.config_params_model['input_height'], self.config_params_model['input_width']), interpolation=cv2.INTER_NEAREST) + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = self.model.predict(img_in, verbose=0) + index_class = np.argmax(label_p_pred[0]) + + print("Predicted Class: {}".format(classes_names[str(int(index_class))])) + else: + if self.patches: + #def textline_contours(img,input_width,input_height,n_classes,model): + + img=cv2.imread(self.image) + self.img_org = np.copy(img) + + if img.shape[0] < self.img_height: + img = cv2.resize(img, (img.shape[1], self.img_width), interpolation=cv2.INTER_NEAREST) + + if img.shape[1] < self.img_width: + img = cv2.resize(img, (self.img_height, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0 * self.img_width) + width_mid = self.img_width - 2 * margin + height_mid = self.img_height - 2 * margin + img = img / float(255.0) + + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + self.img_width + else: + index_x_d = i * width_mid + index_x_u = index_x_d + self.img_width + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + self.img_height + else: + index_y_d = j * height_mid + index_y_u = index_y_d + self.img_height + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - self.img_width + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - self.img_height + + img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = self.model.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2]), + verbose=0) + + if self.task == 'enhancement': + seg = label_p_pred[0, :, :, :] + seg = seg * 255 + elif self.task == 'segmentation' or self.task == 'binarization': + seg = np.argmax(label_p_pred, axis=3)[0] + seg = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + + + if i == 0 and j == 0: + seg = seg[0 : seg.shape[0] - margin, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j == nyf - 1: + seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - 0] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg + elif i == 0 and j == nyf - 1: + seg = seg[margin : seg.shape[0] - 0, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j == 0: + seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] 
- 0] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg + elif i == 0 and j != 0 and j != nyf - 1: + seg = seg[margin : seg.shape[0] - margin, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j != 0 and j != nyf - 1: + seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - 0] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg + elif i != 0 and i != nxf - 1 and j == 0: + seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - margin] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg + elif i != 0 and i != nxf - 1 and j == nyf - 1: + seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg + else: + seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg + prediction_true = prediction_true.astype(int) + prediction_true = cv2.resize(prediction_true, (self.img_org.shape[1], self.img_org.shape[0]), interpolation=cv2.INTER_NEAREST) + return prediction_true + + else: + + img=cv2.imread(self.image) + self.img_org = np.copy(img) + + width=self.img_width + height=self.img_height + + img=img/255.0 + img=self.resize_image(img,self.img_height,self.img_width) + + + label_p_pred=self.model.predict( + img.reshape(1,img.shape[0],img.shape[1],img.shape[2])) + + if self.task == 'enhancement': + seg = label_p_pred[0, :, :, :] + seg = seg * 255 + elif self.task == 'segmentation' or self.task == 'binarization': + seg = np.argmax(label_p_pred, axis=3)[0] + seg = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + + prediction_true = seg.astype(int) + + prediction_true = cv2.resize(prediction_true, (self.img_org.shape[1], self.img_org.shape[0]), interpolation=cv2.INTER_NEAREST) + return prediction_true + + + + def run(self): + res=self.predict() + if self.task == 'classification': + pass + else: + img_seg_overlayed = self.visualize_model_output(res, self.img_org, self.task) + cv2.imwrite('./test.png',img_seg_overlayed) + ##if self.save!=None: + ##img=np.repeat(res[:, :, np.newaxis]*255, 3, axis=2) + ##cv2.imwrite(self.save,img) + + ###if self.ground_truth!=None: + ###gt_img=cv2.imread(self.ground_truth) + ###self.IoU(gt_img[:,:,0],res) + ##plt.imshow(res) + ##plt.show() + +@click.command() +@click.option( + "--image", + "-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--patches/--no-patches", + "-p/-nop", + is_flag=True, + help="if this parameter set to true, this tool will try to do inference in patches.", +) +@click.option( + "--save", + "-s", + help="save prediction as a png file in current folder.", +) +@click.option( + "--model", + "-m", + help="directory of models", + type=click.Path(exists=True, file_okay=False), + required=True, +) +@click.option( + "--ground_truth/--no-ground_truth", + "-gt/-nogt", + is_flag=True, + help="ground truth directory if you want to see the iou of prediction.", +) +@click.option( + "--model_weights/--no-model_weights", + "-mw/-nomw", + is_flag=True, + help="previous model weights which are saved.", +) +def main(image, model, patches, save, ground_truth, model_weights): + + with open(os.path.join(model,'config.json')) as f: 
+ config_params_model = json.load(f) + task = 'classification' + x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, model_weights) + x.run() + +if __name__=="__main__": + main() + + + + diff --git a/train.py b/train.py index 595debe..28363d2 100644 --- a/train.py +++ b/train.py @@ -69,7 +69,7 @@ def config_params(): flip_index = None # Flip image for augmentation. continue_training = False # Set to true if you would like to continue training an already trained a model. transformer_patchsize = None # Patch size of vision transformer patches. - num_patches_xy = None # Number of patches for vision transformer. + transformer_num_patches_xy = None # Number of patches for vision transformer. index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. @@ -77,6 +77,8 @@ def config_params(): data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. + classification_classes_name = None # Dictionary of classification classes names. + backbone_type = None # As backbone we have 2 types of backbones. 
A vision transformer alongside a CNN and we call it "transformer" and only CNN called "nontransformer" @ex.automain @@ -89,12 +91,12 @@ def run(_config, n_classes, n_epochs, input_height, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, continue_training, transformer_patchsize, - num_patches_xy, model_name, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification): + transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, + pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): - if task == "segmentation" or "enhancement": + if task == "segmentation" or task == "enhancement": - num_patches = num_patches_xy[0]*num_patches_xy[1] + num_patches = transformer_num_patches_xy[0]*transformer_num_patches_xy[1] if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') @@ -191,14 +193,14 @@ def run(_config, n_classes, n_epochs, input_height, weights = weights / float(np.sum(weights)) if continue_training: - if model_name=='resnet50_unet': + if backbone_type=='nontransformer': if is_loss_soft_dice and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) if weighted_loss and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) if not is_loss_soft_dice and not weighted_loss: model = load_model(dir_of_start_model , compile=True) - elif model_name=='hybrid_transformer_cnn': + elif backbone_type=='transformer': if is_loss_soft_dice and task == "segmentation": model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) if weighted_loss and task == "segmentation": @@ -207,9 +209,9 @@ def run(_config, n_classes, n_epochs, input_height, model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: index_start = 0 - if model_name=='resnet50_unet': + if backbone_type=='nontransformer': model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) - elif model_name=='hybrid_transformer_cnn': + elif backbone_type=='nontransformer': model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width, task, weight_decay, pretraining) #if you want to see the model structure just uncomment model summary. 
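Note that in the hunk above both branches of the model construction test backbone_type=='nontransformer', so the transformer path is unreachable. Given the config comment introducing the two backbone types "nontransformer" and "transformer", the second branch is presumably meant to test the latter; a sketch of the intended selection, using the model builders from models.py in this patch set:

    if backbone_type == 'nontransformer':
        model = resnet50_unet(n_classes, input_height, input_width, task,
                              weight_decay, pretraining)
    elif backbone_type == 'transformer':
        model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches,
                                  input_height, input_width, task,
                                  weight_decay, pretraining)
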
@@ -246,9 +248,9 @@ def run(_config, n_classes, n_epochs, input_height, validation_data=val_gen, validation_steps=1, epochs=1) - model.save(dir_output+'/'+'model_'+str(i)) + model.save(os.path.join(dir_output,'model_'+str(i))) - with open(dir_output+'/'+'model_'+str(i)+'/'+"config.json", "w") as fp: + with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON #os.system('rm -rf '+dir_train_flowing) @@ -257,14 +259,15 @@ def run(_config, n_classes, n_epochs, input_height, #model.save(dir_output+'/'+'model'+'.h5') elif task=='classification': configuration() - model = resnet50_classifier(n_classes, input_height, input_width,weight_decay,pretraining) + model = resnet50_classifier(n_classes, input_height, input_width, weight_decay, pretraining) opt_adam = Adam(learning_rate=0.001) model.compile(loss='categorical_crossentropy', optimizer = opt_adam,metrics=['accuracy']) - - testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes) + + list_classes = list(classification_classes_name.values()) + testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes, list_classes) #print(testY.shape, testY) @@ -280,7 +283,7 @@ def run(_config, n_classes, n_epochs, input_height, for i in range(n_epochs): #history = model.fit(trainX, trainY, epochs=1, batch_size=n_batch, validation_data=(testX, testY), verbose=2)#,class_weight=weights) - history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes), steps_per_epoch=num_rows / n_batch, verbose=0)#,class_weight=weights) + history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes, list_classes), steps_per_epoch=num_rows / n_batch, verbose=0)#,class_weight=weights) y_pr_class = [] for jj in range(testY.shape[0]): @@ -301,10 +304,6 @@ def run(_config, n_classes, n_epochs, input_height, score_best[0]=f1score model.save(os.path.join(dir_output,'model_best')) - - ##best_model=keras.models.clone_model(model) - ##best_model.build() - ##best_model.set_weights(model.get_weights()) if f1score > f1_threshold_classification: weights.append(model.get_weights() ) y_tot=y_tot+y_pr @@ -329,4 +328,9 @@ def run(_config, n_classes, n_epochs, input_height, ##best_model.save('model_taza.h5') model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) + with open(os.path.join( os.path.join(dir_output,'model_ens_avg'), "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + + with open(os.path.join( os.path.join(dir_output,'model_best'), "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON diff --git a/utils.py b/utils.py index 0c5a458..3a0375a 100644 --- a/utils.py +++ b/utils.py @@ -21,14 +21,14 @@ def return_number_of_total_training_data(path_classes): -def generate_data_from_folder_evaluation(path_classes, height, width, n_classes): - sub_classes = os.listdir(path_classes) +def generate_data_from_folder_evaluation(path_classes, height, width, n_classes, list_classes): + #sub_classes = os.listdir(path_classes) #n_classes = len(sub_classes) all_imgs = [] labels = [] - dicts =dict() - indexer= 0 - for sub_c in sub_classes: + #dicts =dict() + #indexer= 0 + for indexer, sub_c in enumerate(list_classes): sub_files = os.listdir(os.path.join(path_classes,sub_c )) sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] #print( 
os.listdir(os.path.join(path_classes,sub_c )) ) @@ -37,8 +37,8 @@ def generate_data_from_folder_evaluation(path_classes, height, width, n_classes) #print( len(sub_labels) ) labels = labels + sub_labels - dicts[sub_c] = indexer - indexer +=1 + #dicts[sub_c] = indexer + #indexer +=1 categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] @@ -64,15 +64,15 @@ def generate_data_from_folder_evaluation(path_classes, height, width, n_classes) return ret_x/255., ret_y -def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes): - sub_classes = os.listdir(path_classes) - n_classes = len(sub_classes) +def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes, list_classes): + #sub_classes = os.listdir(path_classes) + #n_classes = len(sub_classes) all_imgs = [] labels = [] - dicts =dict() - indexer= 0 - for sub_c in sub_classes: + #dicts =dict() + #indexer= 0 + for indexer, sub_c in enumerate(list_classes): sub_files = os.listdir(os.path.join(path_classes,sub_c )) sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] #print( os.listdir(os.path.join(path_classes,sub_c )) ) @@ -81,8 +81,8 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n #print( len(sub_labels) ) labels = labels + sub_labels - dicts[sub_c] = indexer - indexer +=1 + #dicts[sub_c] = indexer + #indexer +=1 ids = np.array(range(len(labels))) random.shuffle(ids) From ce1108aca07a8e43f6d437825d3d5aca119f3dcd Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 7 May 2024 16:24:12 +0200 Subject: [PATCH 05/62] modifications --- inference.py | 108 ++++++++------------------------------------------- 1 file changed, 17 insertions(+), 91 deletions(-) diff --git a/inference.py b/inference.py index 6911bea..94e318d 100644 --- a/inference.py +++ b/inference.py @@ -1,25 +1,16 @@ -#! /usr/bin/env python3 - -__version__= '1.0' - -import argparse import sys import os import numpy as np import warnings -import xml.etree.ElementTree as et -import pandas as pd -from tqdm import tqdm -import csv import cv2 import seaborn as sns -import matplotlib.pyplot as plt from tensorflow.keras.models import load_model import tensorflow as tf from tensorflow.keras import backend as K from tensorflow.keras import layers import tensorflow.keras.losses from tensorflow.keras.layers import * +from models import * import click import json from tensorflow.python.keras import backend as tensorflow_backend @@ -37,70 +28,13 @@ __doc__=\ Tool to load model and predict for given image. 
""" -projection_dim = 64 -patch_size = 1 -num_patches =28*28 -class Patches(layers.Layer): - def __init__(self, **kwargs): - super(Patches, self).__init__() - self.patch_size = patch_size - - def call(self, images): - print(tf.shape(images)[1],'images') - print(self.patch_size,'self.patch_size') - batch_size = tf.shape(images)[0] - patches = tf.image.extract_patches( - images=images, - sizes=[1, self.patch_size, self.patch_size, 1], - strides=[1, self.patch_size, self.patch_size, 1], - rates=[1, 1, 1, 1], - padding="VALID", - ) - patch_dims = patches.shape[-1] - print(patches.shape,patch_dims,'patch_dims') - patches = tf.reshape(patches, [batch_size, -1, patch_dims]) - return patches - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'patch_size': self.patch_size, - }) - return config - - -class PatchEncoder(layers.Layer): - def __init__(self, **kwargs): - super(PatchEncoder, self).__init__() - self.num_patches = num_patches - self.projection = layers.Dense(units=projection_dim) - self.position_embedding = layers.Embedding( - input_dim=num_patches, output_dim=projection_dim - ) - - def call(self, patch): - positions = tf.range(start=0, limit=self.num_patches, delta=1) - encoded = self.projection(patch) + self.position_embedding(positions) - return encoded - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'num_patches': self.num_patches, - 'projection': self.projection, - 'position_embedding': self.position_embedding, - }) - return config - - class sbb_predict: - def __init__(self,image, model, task, config_params_model, patches='false',save='false', ground_truth=None,weights_dir=None ): + def __init__(self,image, model, task, config_params_model, patches, save, ground_truth): self.image=image self.patches=patches self.save=save self.model_dir=model self.ground_truth=ground_truth - self.weights_dir=weights_dir self.task=task self.config_params_model=config_params_model @@ -426,16 +360,12 @@ class sbb_predict: pass else: img_seg_overlayed = self.visualize_model_output(res, self.img_org, self.task) - cv2.imwrite('./test.png',img_seg_overlayed) - ##if self.save!=None: - ##img=np.repeat(res[:, :, np.newaxis]*255, 3, axis=2) - ##cv2.imwrite(self.save,img) - - ###if self.ground_truth!=None: - ###gt_img=cv2.imread(self.ground_truth) - ###self.IoU(gt_img[:,:,0],res) - ##plt.imshow(res) - ##plt.show() + if self.save: + cv2.imwrite(self.save,img_seg_overlayed) + + if self.ground_truth: + gt_img=cv2.imread(self.ground_truth) + self.IoU(gt_img[:,:,0],res[:,:,0]) @click.command() @click.option( @@ -463,23 +393,19 @@ class sbb_predict: required=True, ) @click.option( - "--ground_truth/--no-ground_truth", - "-gt/-nogt", - is_flag=True, + "--ground_truth", + "-gt", help="ground truth directory if you want to see the iou of prediction.", ) -@click.option( - "--model_weights/--no-model_weights", - "-mw/-nomw", - is_flag=True, - help="previous model weights which are saved.", -) -def main(image, model, patches, save, ground_truth, model_weights): - +def main(image, model, patches, save, ground_truth): with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) - task = 'classification' - x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, model_weights) + task = config_params_model['task'] + if task != 'classification': + if not save: + print("Error: You used one of segmentation or binarization task but not set -s, you need a filename to save visualized output with -s") + sys.exit(1) 
+ x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth) x.run() if __name__=="__main__": From a7e1f255f3468d113ed748d42c338c0a1c7e3a1f Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 8 May 2024 14:47:16 +0200 Subject: [PATCH 06/62] Update train.py avoid ensembling if no model weights met the threshold f1 score in the case of classification --- train.py | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/train.py b/train.py index 28363d2..78974d3 100644 --- a/train.py +++ b/train.py @@ -268,36 +268,26 @@ def run(_config, n_classes, n_epochs, input_height, list_classes = list(classification_classes_name.values()) testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes, list_classes) - - #print(testY.shape, testY) y_tot=np.zeros((testX.shape[0],n_classes)) - indexer=0 score_best=[] score_best.append(0) num_rows = return_number_of_total_training_data(dir_train) - weights=[] for i in range(n_epochs): - #history = model.fit(trainX, trainY, epochs=1, batch_size=n_batch, validation_data=(testX, testY), verbose=2)#,class_weight=weights) - history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes, list_classes), steps_per_epoch=num_rows / n_batch, verbose=0)#,class_weight=weights) + history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes, list_classes), steps_per_epoch=num_rows / n_batch, verbose=1)#,class_weight=weights) y_pr_class = [] for jj in range(testY.shape[0]): y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) y_pr_ind= np.argmax(y_pr,axis=1) - #print(y_pr_ind, 'y_pr_ind') y_pr_class.append(y_pr_ind) - y_pr_class = np.array(y_pr_class) - #model.save('./models_save/model_'+str(i)+'.h5') - #y_pr_class=np.argmax(y_pr,axis=1) f1score=f1_score(np.argmax(testY,axis=1), y_pr_class, average='macro') - print(i,f1score) if f1score>score_best[0]: @@ -306,30 +296,20 @@ def run(_config, n_classes, n_epochs, input_height, if f1score > f1_threshold_classification: weights.append(model.get_weights() ) - y_tot=y_tot+y_pr - indexer+=1 - y_tot=y_tot/float(indexer) - - new_weights=list() - - for weights_list_tuple in zip(*weights): - new_weights.append( [np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)] ) - - new_weights = [np.array(x) for x in new_weights] - - model_weight_averaged=tf.keras.models.clone_model(model) - - model_weight_averaged.set_weights(new_weights) - - #y_tot_end=np.argmax(y_tot,axis=1) - #print(f1_score(np.argmax(testY,axis=1), y_tot_end, average='macro')) - - ##best_model.save('model_taza.h5') - model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) - with open(os.path.join( os.path.join(dir_output,'model_ens_avg'), "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON + if len(weights) >= 1: + new_weights=list() + for weights_list_tuple in zip(*weights): + new_weights.append( [np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)] ) + + new_weights = [np.array(x) for x in new_weights] + model_weight_averaged=tf.keras.models.clone_model(model) + model_weight_averaged.set_weights(new_weights) + + model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) + with open(os.path.join( os.path.join(dir_output,'model_ens_avg'), "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON with 
open(os.path.join( os.path.join(dir_output,'model_best'), "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON From 45aba3235107b628af7ecdaebc22b3a3b94cf970 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sun, 12 May 2024 08:32:28 +0200 Subject: [PATCH 07/62] Update utils.py --- utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils.py b/utils.py index 3a0375a..271d977 100644 --- a/utils.py +++ b/utils.py @@ -9,6 +9,7 @@ from tqdm import tqdm import imutils import math from tensorflow.keras.utils import to_categorical +from PIL import Image, ImageEnhance def return_number_of_total_training_data(path_classes): From 6ef86585c0a56efc5ed931b3ad4585f36cf70066 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 16 May 2024 15:03:23 +0200 Subject: [PATCH 08/62] adding page xml to label generator --- pagexml2label.py | 1009 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 1010 insertions(+) create mode 100644 pagexml2label.py diff --git a/pagexml2label.py b/pagexml2label.py new file mode 100644 index 0000000..715f99f --- /dev/null +++ b/pagexml2label.py @@ -0,0 +1,1009 @@ +import click +import sys +import os +import numpy as np +import warnings +import xml.etree.ElementTree as ET +from tqdm import tqdm +import cv2 +from shapely import geometry + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + +__doc__=\ +""" +tool to extract 2d or 3d RGB images from page xml data. In former case output will be 1 +2D image array which each class has filled with a pixel value. In the case of 3D RGB image +each class will be defined with a RGB value and beside images a text file of classes also will be produced. +This classes.txt file is required for dhsegment tool. +""" +KERNEL = np.ones((5, 5), np.uint8) + +class pagexml2word: + def __init__(self,dir_in, out_dir,output_type,experiment): + self.dir=dir_in + self.output_dir=out_dir + self.output_type=output_type + self.experiment=experiment + + def get_content_of_dir(self): + """ + Listing all ground truth page xml files. All files are needed to have xml format. 
+ """ + + gt_all=os.listdir(self.dir) + self.gt_list=[file for file in gt_all if file.split('.')[ len(file.split('.'))-1 ]=='xml' ] + + def return_parent_contours(self,contours, hierarchy): + contours_parent = [contours[i] for i in range(len(contours)) if hierarchy[0][i][3] == -1] + return contours_parent + def filter_contours_area_of_image_tables(self,image, contours, hierarchy, max_area, min_area): + found_polygons_early = list() + + jv = 0 + for c in contours: + if len(c) < 3: # A polygon cannot have less than 3 points + continue + + polygon = geometry.Polygon([point[0] for point in c]) + # area = cv2.contourArea(c) + area = polygon.area + ##print(np.prod(thresh.shape[:2])) + # Check that polygon has area greater than minimal area + # print(hierarchy[0][jv][3],hierarchy ) + if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : + # print(c[0][0][1]) + found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.int32)) + jv += 1 + return found_polygons_early + + def return_contours_of_interested_region(self,region_pre_p, pixel, min_area=0.0002): + + # pixels of images are identified by 5 + if len(region_pre_p.shape) == 3: + cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + else: + cnts_images = (region_pre_p[:, :] == pixel) * 1 + cnts_images = cnts_images.astype(np.uint8) + cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) + imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + contours_imgs = self.return_parent_contours(contours_imgs, hierarchy) + contours_imgs = self.filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) + + return contours_imgs + + def get_images_of_ground_truth(self): + """ + Reading the page xml files and write the ground truth images into given output directory. 
+ """ + for index in tqdm(range(len(self.gt_list))): + #try: + tree1 = ET.parse(self.dir+'/'+self.gt_list[index]) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + if self.experiment=='word': + region_tags=np.unique([x for x in alltags if x.endswith('Word')]) + co_word=[] + + for tag in region_tags: + if tag.endswith('}Word') or tag.endswith('}word'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_word.append(np.array(c_t_in)) + + img = np.zeros( (y_len,x_len, 3) ) + if self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_word, color=(1,1,1)) + elif self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_word, color=(255,0,0)) + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + + elif self.experiment=='glyph': + region_tags=np.unique([x for x in alltags if x.endswith('Glyph')]) + co_glyph=[] + + for tag in region_tags: + if tag.endswith('}Glyph') or tag.endswith('}glyph'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_glyph.append(np.array(c_t_in)) + + img = np.zeros( (y_len,x_len, 3) ) + if self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_glyph, color=(1,1,1)) + elif self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_glyph, color=(255,0,0)) + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + elif self.experiment=='textline': + region_tags=np.unique([x for x in alltags if x.endswith('TextLine')]) + co_line=[] + + for tag in region_tags: + if tag.endswith('}TextLine') or tag.endswith('}textline'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_line.append(np.array(c_t_in)) + + img = np.zeros( (y_len,x_len, 3) ) + 
if self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_line, color=(1,1,1)) + elif self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_line, color=(255,0,0)) + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + elif self.experiment=='layout_for_main_regions': + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + #print(region_tags) + co_text=[] + co_sep=[] + co_img=[] + #co_graphic=[] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_text.append(np.array(c_t_in)) + + elif tag.endswith('}ImageRegion') or tag.endswith('}GraphicRegion') or tag.endswith('}imageregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + img = np.zeros( (y_len,x_len,3) ) + + if self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_text, color=(255,0,0)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) + ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) + elif self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_text, color=(1,1,1)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(3,3,3)) + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + elif self.experiment=='textregion': + region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')]) + co_textregion=[] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + #print('sth') + for nn 
in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_textregion.append(np.array(c_t_in)) + + img = np.zeros( (y_len,x_len,3) ) + if self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_textregion, color=(255,0,0)) + elif self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_textregion, color=(1,1,1)) + + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + elif self.experiment=='layout': + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_noise=[] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + + + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + #if nn.attrib['type']=='paragraph': + + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + elif "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + else: + + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + #if nn.attrib['type']=='paragraph': + + c_t_in_drop.append([ 
int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + elif "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + + c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + else: + c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + #c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + + + elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + #c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + #if nn.attrib['type']=='paragraph': + + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + + break + else: + pass + + + if vv.tag==link+'Point': + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + #if nn.attrib['type']=='paragraph': + + c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + + c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , 
int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + else: + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in)>0: + co_graphic.append(np.array(c_t_in)) + + + + elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + + elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + + + img = np.zeros( (y_len,x_len,3) ) + + if self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(255,0,0)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(255,125,0)) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(255,0,125)) + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(125,255,125)) + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(125,125,0)) + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(0,125,255)) + img_poly=cv2.fillPoly(img, pts 
=co_text_page_number, color=(0,125,0)) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(125,125,125)) + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(0,125,255)) + + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(125,0,125)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) + img_poly=cv2.fillPoly(img, pts =co_table, color=(0,255,255)) + img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) + img_poly=cv2.fillPoly(img, pts =co_noise, color=(255,0,255)) + elif self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(3,3,3)) + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(4,4,4)) + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(5,5,5)) + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(6,6,6)) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(7,7,7)) + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(8,8,8)) + + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(9,9,9)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(10,10,10)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(11,11,11)) + img_poly=cv2.fillPoly(img, pts =co_table, color=(12,12,12)) + img_poly=cv2.fillPoly(img, pts =co_graphic, color=(13,13,14)) + img_poly=cv2.fillPoly(img, pts =co_noise, color=(15,15,15)) + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + + elif self.experiment=='layout_for_main_regions_new_concept': + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + #print(region_tags) + co_text=[] + co_sep=[] + co_img=[] + co_drop = [] + co_graphic=[] + co_table = [] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_drop = [] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + else: + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + if len(c_t_in)>0: + co_text.append(np.array(c_t_in)) + if len(c_t_in_drop)>0: + co_drop.append(np.array(c_t_in_drop)) + + elif tag.endswith('}ImageRegion') or tag.endswith('}GraphicRegion') or tag.endswith('}imageregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , 
int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + + img_boundary = np.zeros( (y_len,x_len) ) + + + co_text_eroded = [] + for con in co_text: + #try: + img_boundary_in = np.zeros( (y_len,x_len) ) + img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + #print('bidiahhhhaaa') + + + + #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica + img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=2) + + pixel = 1 + min_size = 0 + con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) + + try: + co_text_eroded.append(con_eroded[0]) + except: + co_text_eroded.append(con) + + img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=4) + #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) + + boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + img_boundary[:,:][boundary[:,:]==1] =1 + + + ###co_table_eroded = [] + ###for con in co_table: + ####try: + ###img_boundary_in = np.zeros( (y_len,x_len) ) + ###img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + ####print('bidiahhhhaaa') + + + + #####img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica + ###img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=2) + + ###pixel = 1 + ###min_size = 0 + ###con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) + + ###try: + ###co_table_eroded.append(con_eroded[0]) + ###except: + ###co_table_eroded.append(con) + + ###img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=4) + + ###boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + ###img_boundary[:,:][boundary[:,:]==1] =1 + #except: + #pass + + #for con in co_img: + #img_boundary_in = np.zeros( (y_len,x_len) ) + #img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, 
iterations=3) + + #boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + #img_boundary[:,:][boundary[:,:]==1] =1 + + + #for con in co_sep: + + #img_boundary_in = np.zeros( (y_len,x_len) ) + #img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=3) + + #boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + img_boundary[:,:][boundary[:,:]==1] =1 + for con in co_drop: + img_boundary_in = np.zeros( (y_len,x_len) ) + img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=3) + + boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + img_boundary[:,:][boundary[:,:]==1] =1 + + + img = np.zeros( (y_len,x_len,3) ) + + if self.output_type == '2d': + img_poly=cv2.fillPoly(img, pts =co_img, color=(2,2,2)) + + img_poly=cv2.fillPoly(img, pts =co_text_eroded, color=(1,1,1)) + ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(4,4,4)) + ###img_poly=cv2.fillPoly(img, pts =co_table, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_drop, color=(1,1,1)) + img_poly[:,:][img_boundary[:,:]==1] = 4 + img_poly=cv2.fillPoly(img, pts =co_sep, color=(3,3,3)) + elif self.output_type == '3d': + img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) + img_poly=cv2.fillPoly(img, pts =co_text_eroded, color=(255,0,0)) + img_poly=cv2.fillPoly(img, pts =co_drop, color=(0,125,255)) + + img_poly[:,:,0][img_boundary[:,:]==1]=255 + img_poly[:,:,1][img_boundary[:,:]==1]=125 + img_poly[:,:,2][img_boundary[:,:]==1]=125 + + img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) + ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) + + #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png') + try: + #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png') + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + + + #except: + #pass + def run(self): + self.get_content_of_dir() + self.get_images_of_ground_truth() + + +@click.command() +@click.option( + "--dir_xml", + "-dx", + help="directory of GT page-xml files", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--dir_out", + "-do", + help="directory where ground truth images would be written", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--type_output", + "-to", + help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.", +) +@click.option( + "--experiment", + "-exp", + help="experiment of ineterst. 
Word , textline , glyph and textregion are desired options.", +) + +def main(dir_xml,dir_out,type_output,experiment): + x=pagexml2word(dir_xml,dir_out,type_output,experiment) + x.run() +if __name__=="__main__": + main() + + + diff --git a/requirements.txt b/requirements.txt index 3e56438..efee9df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ imutils numpy scipy scikit-learn +shapely From 2623113b0cb58a54beaafd3a8ee9aea3b317f18b Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 17 May 2024 09:10:13 +0200 Subject: [PATCH 09/62] page to label enable textline new concept --- pagexml2label.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/pagexml2label.py b/pagexml2label.py index 715f99f..b094e9b 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -217,6 +217,79 @@ class pagexml2word: except: cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + elif self.experiment == 'textline_new_concept': + region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) + co_line = [] + + for tag in region_tags: + if tag.endswith('}TextLine') or tag.endswith('}textline'): + # print('sth') + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) + sumi += 1 + # print(vv.tag,'in') + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_line.append(np.array(c_t_in)) + + img_boundary = np.zeros((y_len, x_len)) + co_textline_eroded = [] + for con in co_line: + # try: + img_boundary_in = np.zeros((y_len, x_len)) + img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + # print('bidiahhhhaaa') + + # img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica + img_boundary_in = cv2.erode(img_boundary_in[:, :], KERNEL, iterations=1) + + pixel = 1 + min_size = 0 + con_eroded = self.return_contours_of_interested_region(img_boundary_in, pixel, min_size) + + try: + co_textline_eroded.append(con_eroded[0]) + except: + co_textline_eroded.append(con) + + img_boundary_in_dilated = cv2.dilate(img_boundary_in[:, :], KERNEL, iterations=3) + # img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) + + boundary = img_boundary_in_dilated[:, :] - img_boundary_in[:, :] + + img_boundary[:, :][boundary[:, :] == 1] = 1 + + img = np.zeros((y_len, x_len, 3)) + if self.output_type == '2d': + img_poly = cv2.fillPoly(img, pts=co_textline_eroded, color=(1, 1, 1)) + img_poly[:, :][img_boundary[:, :] == 1] = 2 + elif self.output_type == '3d': + img_poly = cv2.fillPoly(img, pts=co_textline_eroded, color=(255, 0, 0)) + img_poly[:, :, 0][img_boundary[:, :] == 1] = 255 + img_poly[:, :, 1][img_boundary[:, :] == 1] = 125 + img_poly[:, :, 2][img_boundary[:, :] == 1] = 125 + + try: + cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('-')[1].split('.')[0] + '.png', + img_poly) + except: + cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('.')[0] + '.png', img_poly) + elif self.experiment=='layout_for_main_regions': region_tags=np.unique([x for x in alltags if x.endswith('Region')]) #print(region_tags) From 5f06a02441e3e06b1123d5c7b9baaf2788a9080c 
Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 17 May 2024 09:08:25 +0200 Subject: [PATCH 10/62] update requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index efee9df..d8f9003 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ numpy scipy scikit-learn shapely +click From f7dda078d22f7d4d86c411aa816e50104e39cbe6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 22 May 2024 12:38:24 +0200 Subject: [PATCH 11/62] page2label with a dynamic layout --- custom_config_page2label.json | 6 + pagexml2label.py | 490 ++++++++++++++++++++++++++++++++-- 2 files changed, 479 insertions(+), 17 deletions(-) create mode 100644 custom_config_page2label.json diff --git a/custom_config_page2label.json b/custom_config_page2label.json new file mode 100644 index 0000000..75c4b96 --- /dev/null +++ b/custom_config_page2label.json @@ -0,0 +1,6 @@ +{ +"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginal":4 }, +"imageregion":5, +"separatorregion":6, +"graphicregions" :{"handwritten-annotation":7, "decoration": 8, "signature": 9, "stamp": 10} +} diff --git a/pagexml2label.py b/pagexml2label.py index b094e9b..6907e84 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -7,6 +7,7 @@ import xml.etree.ElementTree as ET from tqdm import tqdm import cv2 from shapely import geometry +import json with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -21,11 +22,12 @@ This classes.txt file is required for dhsegment tool. KERNEL = np.ones((5, 5), np.uint8) class pagexml2word: - def __init__(self,dir_in, out_dir,output_type,experiment): + def __init__(self,dir_in, out_dir,output_type,experiment,layout_config): self.dir=dir_in self.output_dir=out_dir self.output_type=output_type self.experiment=experiment + self.layout_config=layout_config def get_content_of_dir(self): """ @@ -77,7 +79,7 @@ class pagexml2word: return contours_imgs - def get_images_of_ground_truth(self): + def get_images_of_ground_truth(self, config_params): """ Reading the page xml files and write the ground truth images into given output directory. 
""" @@ -93,6 +95,445 @@ class pagexml2word: for jj in root1.iter(link+'Page'): y_len=int(jj.attrib['imageHeight']) x_len=int(jj.attrib['imageWidth']) + + if self.layout_config: + keys = list(config_params.keys()) + #values = config_params.values() + + if 'textregions' in keys: + types_text_dict = config_params['textregions'] + types_text = list(types_text_dict.keys()) + types_text_label = list(types_text_dict.values()) + if 'graphicregions' in keys: + types_graphic_dict = config_params['graphicregions'] + types_graphic = list(types_graphic_dict.keys()) + types_graphic_label = list(types_graphic_dict.values()) + + + types_text_label_rgb = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (0,125,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,255), (0,255,125)] + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic_signature=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_graphic_stamp=[] + co_noise=[] + + for tag in region_tags: + if 'textregions' in keys: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + if "drop-capital" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "heading" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "signature-mark" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='signature-mark': + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "header" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "catch-word" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "page-number" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='page-number': + c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "marginalia" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='marginalia': + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "paragraph" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='paragraph': + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + break + else: + pass + + + if vv.tag==link+'Point': + if "drop-capital" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + 
c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "heading" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "signature-mark" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='signature-mark': + c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "header" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "catch-word" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "page-number" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='page-number': + c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "marginalia" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='marginalia': + c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "paragraph" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='paragraph': + c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + + + if 'graphicregions' in keys: + if tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in_stamp=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + c_t_in_signature=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + if "handwritten-annotation" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "decoration" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "stamp" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='stamp': + c_t_in_stamp.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "signature" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='signature': + c_t_in_signature.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + + break + else: + pass + + + if vv.tag==link+'Point': + if "handwritten-annotation" in 
types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "decoration" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "stamp" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='stamp': + c_t_in_stamp.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "signature" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='signature': + c_t_in_signature.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in_stamp)>0: + co_graphic_stamp.append(np.array(c_t_in_stamp)) + if len(c_t_in_signature)>0: + co_graphic_signature.append(np.array(c_t_in_signature)) + + if 'imageregion' in keys: + if tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + + + if 'separatorregion' in keys: + if tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + if 'tableregion' in keys: + if tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + + if 'noiseregion' in keys: + if tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + 
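# Fallback for PAGE files where an outline is encoded as individual <Point>
# elements instead of a Coords "points" attribute: coordinates are collected
# one by one until a non-Point child is reached (sumi >= 1).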
c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + + img = np.zeros( (y_len,x_len,3) ) + + if self.output_type == '3d': + + if 'graphicregions' in keys: + if "handwritten-annotation" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=types_text_label_rgb[ config_params['graphicregions']['handwritten-annotation']]) + if "signature" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=types_text_label_rgb[ config_params['graphicregions']['signature']]) + if "decoration" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=types_text_label_rgb[ config_params['graphicregions']['decoration']]) + if "stamp" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=types_text_label_rgb[ config_params['graphicregions']['stamp']]) + + if 'imageregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_img, color=types_text_label_rgb[ config_params['imageregion']]) + if 'separatorregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_sep, color=types_text_label_rgb[ config_params['separatorregion']]) + if 'tableregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_table, color=types_text_label_rgb[ config_params['tableregion']]) + if 'noiseregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_noise, color=types_text_label_rgb[ config_params['noiseregion']]) + + if 'textregions' in keys: + if "paragraph" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=types_text_label_rgb[ config_params['textregions']['paragraph']]) + if "heading" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=types_text_label_rgb[ config_params['textregions']['heading']]) + if "header" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_header, color=types_text_label_rgb[ config_params['textregions']['header']]) + if "catch-word" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=types_text_label_rgb[ config_params['textregions']['catch-word']]) + if "signature-mark" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=types_text_label_rgb[ config_params['textregions']['signature-mark']]) + if "page-number" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=types_text_label_rgb[ config_params['textregions']['page-number']]) + if "marginalia" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=types_text_label_rgb[ config_params['textregions']['marginalia']]) + if "drop-capital" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=types_text_label_rgb[ config_params['textregions']['drop-capital']]) + + elif self.output_type == '2d': + if 'graphicregions' in keys: + if "handwritten-annotation" in types_graphic: + color_label = config_params['graphicregions']['handwritten-annotation'] + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(color_label,color_label,color_label)) + if "signature" in types_graphic: + color_label = config_params['graphicregions']['signature'] + img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=(color_label,color_label,color_label)) + if "decoration" in types_graphic: + color_label = config_params['graphicregions']['decoration'] + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(color_label,color_label,color_label)) + if "stamp" in types_graphic: + color_label = 
config_params['graphicregions']['stamp'] + img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=(color_label,color_label,color_label)) + + if 'imageregion' in keys: + color_label = config_params['imageregion'] + img_poly=cv2.fillPoly(img, pts =co_img, color=(color_label,color_label,color_label)) + if 'separatorregion' in keys: + color_label = config_params['separatorregion'] + img_poly=cv2.fillPoly(img, pts =co_sep, color=(color_label,color_label,color_label)) + if 'tableregion' in keys: + color_label = config_params['tableregion'] + img_poly=cv2.fillPoly(img, pts =co_table, color=(color_label,color_label,color_label)) + if 'noiseregion' in keys: + color_label = config_params['noiseregion'] + img_poly=cv2.fillPoly(img, pts =co_noise, color=(color_label,color_label,color_label)) + + if 'textregions' in keys: + if "paragraph" in types_text: + color_label = config_params['textregions']['paragraph'] + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) + if "heading" in types_text: + color_label = config_params['textregions']['heading'] + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label)) + if "header" in types_text: + color_label = config_params['textregions']['header'] + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(color_label,color_label,color_label)) + if "catch-word" in types_text: + color_label = config_params['textregions']['catch-word'] + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(color_label,color_label,color_label)) + if "signature-mark" in types_text: + color_label = config_params['textregions']['signature-mark'] + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(color_label,color_label,color_label)) + if "page-number" in types_text: + color_label = config_params['textregions']['page-number'] + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(color_label,color_label,color_label)) + if "marginalia" in types_text: + color_label = config_params['textregions']['marginalia'] + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(color_label,color_label,color_label)) + if "drop-capital" in types_text: + color_label = config_params['textregions']['drop-capital'] + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(color_label,color_label,color_label)) + + + + + try: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) + + + #print(values[0]) if self.experiment=='word': region_tags=np.unique([x for x in alltags if x.endswith('Word')]) co_word=[] @@ -302,6 +743,7 @@ class pagexml2word: if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): #print('sth') for nn in root1.iter(tag): + print(nn.attrib['type']) c_t_in=[] sumi=0 for vv in nn.iter(): @@ -373,20 +815,19 @@ class pagexml2word: elif vv.tag!=link+'Point' and sumi>=1: break co_sep.append(np.array(c_t_in)) - - - img = np.zeros( (y_len,x_len,3) ) + img_poly = np.zeros( (y_len,x_len,3) ) + if self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_text, color=(255,0,0)) - img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) - img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) + img_poly=cv2.fillPoly(img_poly, pts =co_text, color=(255,0,0)) + img_poly=cv2.fillPoly(img_poly, pts =co_img, color=(0,255,0)) + img_poly=cv2.fillPoly(img_poly, pts =co_sep, color=(0,0,255)) ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) 
elif self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_text, color=(1,1,1)) - img_poly=cv2.fillPoly(img, pts =co_img, color=(2,2,2)) - img_poly=cv2.fillPoly(img, pts =co_sep, color=(3,3,3)) + img_poly=cv2.fillPoly(img_poly, pts =co_text, color=(1,1,1)) + img_poly=cv2.fillPoly(img_poly, pts =co_img, color=(2,2,2)) + img_poly=cv2.fillPoly(img_poly, pts =co_sep, color=(3,3,3)) try: cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) @@ -752,7 +1193,7 @@ class pagexml2word: img = np.zeros( (y_len,x_len,3) ) - + if self.output_type == '3d': img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(255,0,0)) @@ -1043,9 +1484,9 @@ class pagexml2word: #except: #pass - def run(self): + def run(self,config_params): self.get_content_of_dir() - self.get_images_of_ground_truth() + self.get_images_of_ground_truth(config_params) @click.command() @@ -1061,6 +1502,14 @@ class pagexml2word: help="directory where ground truth images would be written", type=click.Path(exists=True, file_okay=False), ) + +@click.option( + "--layout_config", + "-lc", + help="experiment of ineterst. Word , textline , glyph and textregion are desired options.", + type=click.Path(exists=True, dir_okay=False), +) + @click.option( "--type_output", "-to", @@ -1072,9 +1521,16 @@ class pagexml2word: help="experiment of ineterst. Word , textline , glyph and textregion are desired options.", ) -def main(dir_xml,dir_out,type_output,experiment): - x=pagexml2word(dir_xml,dir_out,type_output,experiment) - x.run() + +def main(dir_xml,dir_out,type_output,experiment,layout_config): + if layout_config: + with open(layout_config) as f: + config_params = json.load(f) + else: + print("passed") + config_params = None + x=pagexml2word(dir_xml,dir_out,type_output,experiment, layout_config) + x.run(config_params) if __name__=="__main__": main() From d687f5328f3f8dd61f4094240c2989966dbbf9f8 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 23 May 2024 11:14:14 +0200 Subject: [PATCH 12/62] dynamic layout decorated with artificial class on text elements boundry --- custom_config_page2label.json | 6 +- pagexml2label.py | 117 ++++++++++++++++++++++++++++------ 2 files changed, 103 insertions(+), 20 deletions(-) diff --git a/custom_config_page2label.json b/custom_config_page2label.json index 75c4b96..85b5d7e 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,6 +1,8 @@ { -"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginal":4 }, +"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginalia":4 ,"page-number":1 , "catch-word":1 }, "imageregion":5, "separatorregion":6, -"graphicregions" :{"handwritten-annotation":7, "decoration": 8, "signature": 9, "stamp": 10} +"graphicregions" :{"handwritten-annotation":7, "decoration": 8, "signature": 9, "stamp": 10}, +"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital"], +"artificial_class_label":11 } diff --git a/pagexml2label.py b/pagexml2label.py index 6907e84..5311c24 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -78,7 +78,37 @@ class pagexml2word: contours_imgs = self.filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) return contours_imgs + def update_region_contours(self, co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len): + co_text_eroded = [] + for con in co_text: + #try: + img_boundary_in = np.zeros( (y_len,x_len) ) + 
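# Each region is rasterised and optionally eroded (erosion_rate > 0); the ring
# between its dilated and eroded masks is accumulated in img_boundary and later
# painted as the artificial boundary class on top of the region labels.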
img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + #print('bidiahhhhaaa') + + + + #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica + if erosion_rate > 0: + img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=erosion_rate) + + pixel = 1 + min_size = 0 + con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) + + try: + co_text_eroded.append(con_eroded[0]) + except: + co_text_eroded.append(con) + + img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=dilation_rate) + #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) + + boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + img_boundary[:,:][boundary[:,:]==1] =1 + return co_text_eroded, img_boundary def get_images_of_ground_truth(self, config_params): """ Reading the page xml files and write the ground truth images into given output directory. @@ -98,6 +128,10 @@ class pagexml2word: if self.layout_config: keys = list(config_params.keys()) + if "artificial_class_on_boundry" in keys: + elements_with_artificial_class = list(config_params['artificial_class_on_boundry']) + artificial_class_rgb_color = (255,255,0) + artificial_class_label = config_params['artificial_class_label'] #values = config_params.values() if 'textregions' in keys: @@ -110,7 +144,7 @@ class pagexml2word: types_graphic_label = list(types_graphic_dict.values()) - types_text_label_rgb = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (0,125,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,255), (0,255,125)] + labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125)] region_tags=np.unique([x for x in alltags if x.endswith('Region')]) @@ -429,46 +463,90 @@ class pagexml2word: break co_noise.append(np.array(c_t_in)) + if "artificial_class_on_boundry" in keys: + img_boundary = np.zeros( (y_len,x_len) ) + if "paragraph" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_paragraph, img_boundary = self.update_region_contours(co_text_paragraph, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "drop-capital" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_drop, img_boundary = self.update_region_contours(co_text_drop, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "catch-word" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_catch, img_boundary = self.update_region_contours(co_text_catch, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "page-number" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_page_number, img_boundary = self.update_region_contours(co_text_page_number, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "header" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_header, img_boundary = self.update_region_contours(co_text_header, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "heading" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_heading, img_boundary = self.update_region_contours(co_text_heading, img_boundary, erosion_rate, dilation_rate, y_len, 
x_len ) + if "signature-mark" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_signature_mark, img_boundary = self.update_region_contours(co_text_signature_mark, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "marginalia" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_marginalia, img_boundary = self.update_region_contours(co_text_marginalia, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + + img = np.zeros( (y_len,x_len,3) ) if self.output_type == '3d': if 'graphicregions' in keys: if "handwritten-annotation" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=types_text_label_rgb[ config_params['graphicregions']['handwritten-annotation']]) + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=labels_rgb_color[ config_params['graphicregions']['handwritten-annotation']]) if "signature" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=types_text_label_rgb[ config_params['graphicregions']['signature']]) + img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=labels_rgb_color[ config_params['graphicregions']['signature']]) if "decoration" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=types_text_label_rgb[ config_params['graphicregions']['decoration']]) + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=labels_rgb_color[ config_params['graphicregions']['decoration']]) if "stamp" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=types_text_label_rgb[ config_params['graphicregions']['stamp']]) + img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=labels_rgb_color[ config_params['graphicregions']['stamp']]) if 'imageregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_img, color=types_text_label_rgb[ config_params['imageregion']]) + img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']]) if 'separatorregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_sep, color=types_text_label_rgb[ config_params['separatorregion']]) + img_poly=cv2.fillPoly(img, pts =co_sep, color=labels_rgb_color[ config_params['separatorregion']]) if 'tableregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_table, color=types_text_label_rgb[ config_params['tableregion']]) + img_poly=cv2.fillPoly(img, pts =co_table, color=labels_rgb_color[ config_params['tableregion']]) if 'noiseregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_noise, color=types_text_label_rgb[ config_params['noiseregion']]) + img_poly=cv2.fillPoly(img, pts =co_noise, color=labels_rgb_color[ config_params['noiseregion']]) if 'textregions' in keys: if "paragraph" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=types_text_label_rgb[ config_params['textregions']['paragraph']]) + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) if "heading" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=types_text_label_rgb[ config_params['textregions']['heading']]) + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) if "header" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_header, color=types_text_label_rgb[ config_params['textregions']['header']]) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=labels_rgb_color[ config_params['textregions']['header']]) if "catch-word" in 
types_text: - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=types_text_label_rgb[ config_params['textregions']['catch-word']]) + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=labels_rgb_color[ config_params['textregions']['catch-word']]) if "signature-mark" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=types_text_label_rgb[ config_params['textregions']['signature-mark']]) + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=labels_rgb_color[ config_params['textregions']['signature-mark']]) if "page-number" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=types_text_label_rgb[ config_params['textregions']['page-number']]) + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=labels_rgb_color[ config_params['textregions']['page-number']]) if "marginalia" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=types_text_label_rgb[ config_params['textregions']['marginalia']]) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=labels_rgb_color[ config_params['textregions']['marginalia']]) if "drop-capital" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=types_text_label_rgb[ config_params['textregions']['drop-capital']]) + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=labels_rgb_color[ config_params['textregions']['drop-capital']]) + + if "artificial_class_on_boundry" in keys: + img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] + img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] + img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + + + elif self.output_type == '2d': if 'graphicregions' in keys: @@ -523,6 +601,9 @@ class pagexml2word: if "drop-capital" in types_text: color_label = config_params['textregions']['drop-capital'] img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(color_label,color_label,color_label)) + + if "artificial_class_on_boundry" in keys: + img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label @@ -1506,7 +1587,7 @@ class pagexml2word: @click.option( "--layout_config", "-lc", - help="experiment of ineterst. 
Word , textline , glyph and textregion are desired options.", + help="config file of prefered layout.", type=click.Path(exists=True, dir_okay=False), ) From 947a0e06f8cdaaeeec2400b0d2afe420cc8d05d5 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 23 May 2024 15:43:31 +0200 Subject: [PATCH 13/62] missing text types are added --- custom_config_page2label.json | 12 ++++----- pagexml2label.py | 48 ++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/custom_config_page2label.json b/custom_config_page2label.json index 85b5d7e..254f4df 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,8 +1,8 @@ { -"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginalia":4 ,"page-number":1 , "catch-word":1 }, -"imageregion":5, -"separatorregion":6, -"graphicregions" :{"handwritten-annotation":7, "decoration": 8, "signature": 9, "stamp": 10}, -"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital"], -"artificial_class_label":11 +"textregions":{"paragraph":1, "heading": 1, "header":1,"drop-capital": 1, "marginalia":1 ,"page-number":1 , "catch-word":1 ,"footnote": 1, "footnote-continued": 1}, +"imageregion":2, +"separatorregion":3, +"graphicregions" :{"handwritten-annotation":2, "decoration": 2, "signature": 2, "stamp": 2}, +"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital","footnote", "footnote-continued"], +"artificial_class_label":4 } diff --git a/pagexml2label.py b/pagexml2label.py index 5311c24..63b7acf 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -113,6 +113,7 @@ class pagexml2word: """ Reading the page xml files and write the ground truth images into given output directory. 
""" + ## to do: add footnote to text regions for index in tqdm(range(len(self.gt_list))): #try: tree1 = ET.parse(self.dir+'/'+self.gt_list[index]) @@ -144,11 +145,13 @@ class pagexml2word: types_graphic_label = list(types_graphic_dict.values()) - labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125)] + labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)] region_tags=np.unique([x for x in alltags if x.endswith('Region')]) co_text_paragraph=[] + co_text_footnote=[] + co_text_footnote_con=[] co_text_drop=[] co_text_heading=[] co_text_header=[] @@ -177,6 +180,8 @@ class pagexml2word: c_t_in_signature_mark=[] c_t_in_catch=[] c_t_in_marginalia=[] + c_t_in_footnote=[] + c_t_in_footnote_con=[] sumi=0 for vv in nn.iter(): # check the format of coords @@ -190,6 +195,14 @@ class pagexml2word: if "drop-capital" in types_text: if "type" in nn.attrib and nn.attrib['type']=='drop-capital': c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) if "heading" in types_text: if "type" in nn.attrib and nn.attrib['type']=='heading': @@ -231,6 +244,16 @@ class pagexml2word: c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) sumi+=1 + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + if "heading" in types_text: if "type" in nn.attrib and nn.attrib['type']=='heading': c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) @@ -272,6 +295,10 @@ class pagexml2word: if len(c_t_in_drop)>0: co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_footnote_con)>0: + co_text_footnote_con.append(np.array(c_t_in_footnote_con)) + if len(c_t_in_footnote)>0: + co_text_footnote.append(np.array(c_t_in_footnote)) if len(c_t_in_paragraph)>0: co_text_paragraph.append(np.array(c_t_in_paragraph)) if len(c_t_in_heading)>0: @@ -497,6 +524,15 @@ class pagexml2word: erosion_rate = 2 dilation_rate = 4 co_text_marginalia, img_boundary = self.update_region_contours(co_text_marginalia, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_footnote, img_boundary = self.update_region_contours(co_text_footnote, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote-continued" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + 
co_text_footnote_con, img_boundary = self.update_region_contours(co_text_footnote_con, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + img = np.zeros( (y_len,x_len,3) ) @@ -525,6 +561,10 @@ class pagexml2word: if 'textregions' in keys: if "paragraph" in types_text: img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) + if "footnote" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=labels_rgb_color[ config_params['textregions']['footnote']]) + if "footnote-continued" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=labels_rgb_color[ config_params['textregions']['footnote-continued']]) if "heading" in types_text: img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) if "header" in types_text: @@ -580,6 +620,12 @@ class pagexml2word: if "paragraph" in types_text: color_label = config_params['textregions']['paragraph'] img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) + if "footnote" in types_text: + color_label = config_params['textregions']['footnote'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=(color_label,color_label,color_label)) + if "footnote-continued" in types_text: + color_label = config_params['textregions']['footnote-continued'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=(color_label,color_label,color_label)) if "heading" in types_text: color_label = config_params['textregions']['heading'] img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label)) From 4b7f7da07c96d410d0decb4b6879d5586132fdab Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 23 May 2024 17:14:31 +0200 Subject: [PATCH 14/62] use cases like textline, word and glyph are added --- custom_config_page2label.json | 11 +- pagexml2label.py | 1055 +++------------------------------ 2 files changed, 93 insertions(+), 973 deletions(-) diff --git a/custom_config_page2label.json b/custom_config_page2label.json index 254f4df..d6320fa 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,8 +1,9 @@ { -"textregions":{"paragraph":1, "heading": 1, "header":1,"drop-capital": 1, "marginalia":1 ,"page-number":1 , "catch-word":1 ,"footnote": 1, "footnote-continued": 1}, -"imageregion":2, -"separatorregion":3, -"graphicregions" :{"handwritten-annotation":2, "decoration": 2, "signature": 2, "stamp": 2}, +"use_case": "layout", +"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginalia":4 ,"page-number":1 , "catch-word":1 ,"footnote": 1, "footnote-continued": 1}, +"imageregion":5, +"separatorregion":6, +"graphicregions" :{"handwritten-annotation":5, "decoration": 5, "signature": 5, "stamp": 5}, "artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital","footnote", "footnote-continued"], -"artificial_class_label":4 +"artificial_class_label":7 } diff --git a/pagexml2label.py b/pagexml2label.py index 63b7acf..16cda8b 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -21,13 +21,12 @@ This classes.txt file is required for dhsegment tool. 
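A minimal illustrative config for the textline/word/glyph use cases, e.g.
    {"use_case": "textline", "artificial_class_label": 2}
writes textlines as class 1 and, since "artificial_class_label" is given, their
eroded/dilated boundaries as class 2 (the values here are examples, not defaults).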
""" KERNEL = np.ones((5, 5), np.uint8) -class pagexml2word: - def __init__(self,dir_in, out_dir,output_type,experiment,layout_config): +class pagexml2label: + def __init__(self,dir_in, out_dir,output_type,config): self.dir=dir_in self.output_dir=out_dir self.output_type=output_type - self.experiment=experiment - self.layout_config=layout_config + self.config=config def get_content_of_dir(self): """ @@ -127,7 +126,82 @@ class pagexml2word: y_len=int(jj.attrib['imageHeight']) x_len=int(jj.attrib['imageWidth']) - if self.layout_config: + if self.config and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph'): + keys = list(config_params.keys()) + if "artificial_class_label" in keys: + artificial_class_rgb_color = (255,255,0) + artificial_class_label = config_params['artificial_class_label'] + + textline_rgb_color = (255, 0, 0) + + if config_params['use_case']=='textline': + region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) + elif config_params['use_case']=='word': + region_tags = np.unique([x for x in alltags if x.endswith('Word')]) + elif config_params['use_case']=='glyph': + region_tags = np.unique([x for x in alltags if x.endswith('Glyph')]) + co_use_case = [] + + for tag in region_tags: + if config_params['use_case']=='textline': + tag_endings = ['}TextLine','}textline'] + elif config_params['use_case']=='word': + tag_endings = ['}Word','}word'] + elif config_params['use_case']=='glyph': + tag_endings = ['}Glyph','}glyph'] + + if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_use_case.append(np.array(c_t_in)) + + + + if "artificial_class_label" in keys: + img_boundary = np.zeros((y_len, x_len)) + erosion_rate = 1 + dilation_rate = 3 + co_use_case, img_boundary = self.update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + + + img = np.zeros((y_len, x_len, 3)) + if self.output_type == '2d': + img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) + if "artificial_class_label" in keys: + img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + elif self.output_type == '3d': + img_poly = cv2.fillPoly(img, pts=co_use_case, color=textline_rgb_color) + if "artificial_class_label" in keys: + img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] + img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] + img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + + try: + cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('-')[1].split('.')[0] + '.png', + img_poly) + except: + cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('.')[0] + '.png', img_poly) + + + if self.config and config_params['use_case']=='layout': keys = list(config_params.keys()) if "artificial_class_on_boundry" in keys: elements_with_artificial_class = list(config_params['artificial_class_on_boundry']) @@ -139,6 +213,7 @@ class pagexml2word: types_text_dict = config_params['textregions'] types_text = 
list(types_text_dict.keys()) types_text_label = list(types_text_dict.values()) + print(types_text) if 'graphicregions' in keys: types_graphic_dict = config_params['graphicregions'] types_graphic = list(types_graphic_dict.keys()) @@ -660,957 +735,6 @@ class pagexml2word: cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - #print(values[0]) - if self.experiment=='word': - region_tags=np.unique([x for x in alltags if x.endswith('Word')]) - co_word=[] - - for tag in region_tags: - if tag.endswith('}Word') or tag.endswith('}word'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_word.append(np.array(c_t_in)) - - img = np.zeros( (y_len,x_len, 3) ) - if self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_word, color=(1,1,1)) - elif self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_word, color=(255,0,0)) - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - - elif self.experiment=='glyph': - region_tags=np.unique([x for x in alltags if x.endswith('Glyph')]) - co_glyph=[] - - for tag in region_tags: - if tag.endswith('}Glyph') or tag.endswith('}glyph'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_glyph.append(np.array(c_t_in)) - - img = np.zeros( (y_len,x_len, 3) ) - if self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_glyph, color=(1,1,1)) - elif self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_glyph, color=(255,0,0)) - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - elif self.experiment=='textline': - region_tags=np.unique([x for x in alltags if x.endswith('TextLine')]) - co_line=[] - - for tag in region_tags: - if tag.endswith('}TextLine') or tag.endswith('}textline'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_line.append(np.array(c_t_in)) - - 
img = np.zeros( (y_len,x_len, 3) ) - if self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_line, color=(1,1,1)) - elif self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_line, color=(255,0,0)) - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - elif self.experiment == 'textline_new_concept': - region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) - co_line = [] - - for tag in region_tags: - if tag.endswith('}TextLine') or tag.endswith('}textline'): - # print('sth') - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) - sumi += 1 - # print(vv.tag,'in') - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_line.append(np.array(c_t_in)) - - img_boundary = np.zeros((y_len, x_len)) - co_textline_eroded = [] - for con in co_line: - # try: - img_boundary_in = np.zeros((y_len, x_len)) - img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - # print('bidiahhhhaaa') - - # img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica - img_boundary_in = cv2.erode(img_boundary_in[:, :], KERNEL, iterations=1) - - pixel = 1 - min_size = 0 - con_eroded = self.return_contours_of_interested_region(img_boundary_in, pixel, min_size) - - try: - co_textline_eroded.append(con_eroded[0]) - except: - co_textline_eroded.append(con) - - img_boundary_in_dilated = cv2.dilate(img_boundary_in[:, :], KERNEL, iterations=3) - # img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) - - boundary = img_boundary_in_dilated[:, :] - img_boundary_in[:, :] - - img_boundary[:, :][boundary[:, :] == 1] = 1 - - img = np.zeros((y_len, x_len, 3)) - if self.output_type == '2d': - img_poly = cv2.fillPoly(img, pts=co_textline_eroded, color=(1, 1, 1)) - img_poly[:, :][img_boundary[:, :] == 1] = 2 - elif self.output_type == '3d': - img_poly = cv2.fillPoly(img, pts=co_textline_eroded, color=(255, 0, 0)) - img_poly[:, :, 0][img_boundary[:, :] == 1] = 255 - img_poly[:, :, 1][img_boundary[:, :] == 1] = 125 - img_poly[:, :, 2][img_boundary[:, :] == 1] = 125 - - try: - cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('-')[1].split('.')[0] + '.png', - img_poly) - except: - cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('.')[0] + '.png', img_poly) - - elif self.experiment=='layout_for_main_regions': - region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - #print(region_tags) - co_text=[] - co_sep=[] - co_img=[] - #co_graphic=[] - - for tag in region_tags: - if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): - #print('sth') - for nn in root1.iter(tag): - print(nn.attrib['type']) - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ 
int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_text.append(np.array(c_t_in)) - - elif tag.endswith('}ImageRegion') or tag.endswith('}GraphicRegion') or tag.endswith('}imageregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_img.append(np.array(c_t_in)) - - elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_sep.append(np.array(c_t_in)) - - img_poly = np.zeros( (y_len,x_len,3) ) - - - if self.output_type == '3d': - img_poly=cv2.fillPoly(img_poly, pts =co_text, color=(255,0,0)) - img_poly=cv2.fillPoly(img_poly, pts =co_img, color=(0,255,0)) - img_poly=cv2.fillPoly(img_poly, pts =co_sep, color=(0,0,255)) - ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) - elif self.output_type == '2d': - img_poly=cv2.fillPoly(img_poly, pts =co_text, color=(1,1,1)) - img_poly=cv2.fillPoly(img_poly, pts =co_img, color=(2,2,2)) - img_poly=cv2.fillPoly(img_poly, pts =co_sep, color=(3,3,3)) - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - elif self.experiment=='textregion': - region_tags=np.unique([x for x in alltags if x.endswith('TextRegion')]) - co_textregion=[] - - for tag in region_tags: - if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_textregion.append(np.array(c_t_in)) - - img = np.zeros( (y_len,x_len,3) ) - if self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_textregion, color=(255,0,0)) - elif self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_textregion, color=(1,1,1)) - - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - elif self.experiment=='layout': - 
region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - - co_text_paragraph=[] - co_text_drop=[] - co_text_heading=[] - co_text_header=[] - co_text_marginalia=[] - co_text_catch=[] - co_text_page_number=[] - co_text_signature_mark=[] - co_sep=[] - co_img=[] - co_table=[] - co_graphic=[] - co_graphic_text_annotation=[] - co_graphic_decoration=[] - co_noise=[] - - for tag in region_tags: - if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): - for nn in root1.iter(tag): - c_t_in_drop=[] - c_t_in_paragraph=[] - c_t_in_heading=[] - c_t_in_header=[] - c_t_in_page_number=[] - c_t_in_signature_mark=[] - c_t_in_catch=[] - c_t_in_marginalia=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - - coords=bool(vv.attrib) - if coords: - #print('birda1') - p_h=vv.attrib['points'].split(' ') - - - - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - #if nn.attrib['type']=='paragraph': - - c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - elif "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': - - c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) - elif "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - elif "type" in nn.attrib and nn.attrib['type']=='page-number': - - c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) - - elif "type" in nn.attrib and nn.attrib['type']=='marginalia': - - c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) - else: - - c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) - - break - else: - pass - - - if vv.tag==link+'Point': - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - #if nn.attrib['type']=='paragraph': - - c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - elif "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - - elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': - - c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 - elif "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - - elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - - elif "type" in nn.attrib and nn.attrib['type']=='page-number': - - c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 - - elif "type" in nn.attrib and nn.attrib['type']=='marginalia': - - 
c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 - - else: - c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 - - #c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - - if len(c_t_in_drop)>0: - co_text_drop.append(np.array(c_t_in_drop)) - if len(c_t_in_paragraph)>0: - co_text_paragraph.append(np.array(c_t_in_paragraph)) - if len(c_t_in_heading)>0: - co_text_heading.append(np.array(c_t_in_heading)) - - if len(c_t_in_header)>0: - co_text_header.append(np.array(c_t_in_header)) - if len(c_t_in_page_number)>0: - co_text_page_number.append(np.array(c_t_in_page_number)) - if len(c_t_in_catch)>0: - co_text_catch.append(np.array(c_t_in_catch)) - - if len(c_t_in_signature_mark)>0: - co_text_signature_mark.append(np.array(c_t_in_signature_mark)) - - if len(c_t_in_marginalia)>0: - co_text_marginalia.append(np.array(c_t_in_marginalia)) - - - elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - c_t_in_text_annotation=[] - c_t_in_decoration=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - #c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - #if nn.attrib['type']=='paragraph': - - c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - elif "type" in nn.attrib and nn.attrib['type']=='decoration': - - c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) - else: - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - - break - else: - pass - - - if vv.tag==link+'Point': - - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - #if nn.attrib['type']=='paragraph': - - c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - elif "type" in nn.attrib and nn.attrib['type']=='decoration': - - c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 - else: - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if len(c_t_in_text_annotation)>0: - co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) - if len(c_t_in_decoration)>0: - co_graphic_decoration.append(np.array(c_t_in_decoration)) - if len(c_t_in)>0: - co_graphic.append(np.array(c_t_in)) - - - - elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - 
co_img.append(np.array(c_t_in)) - - elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_sep.append(np.array(c_t_in)) - - - - elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_table.append(np.array(c_t_in)) - - elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_noise.append(np.array(c_t_in)) - - - img = np.zeros( (y_len,x_len,3) ) - - if self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(255,0,0)) - - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(255,125,0)) - img_poly=cv2.fillPoly(img, pts =co_text_header, color=(255,0,125)) - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(125,255,125)) - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(125,125,0)) - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(0,125,255)) - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(0,125,0)) - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(125,125,125)) - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(0,125,255)) - - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(125,0,125)) - img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) - img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) - img_poly=cv2.fillPoly(img, pts =co_table, color=(0,255,255)) - img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) - img_poly=cv2.fillPoly(img, pts =co_noise, color=(255,0,255)) - elif self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) - - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) - img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(3,3,3)) - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(4,4,4)) - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(5,5,5)) - 
img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(6,6,6)) - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(7,7,7)) - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(8,8,8)) - - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(9,9,9)) - img_poly=cv2.fillPoly(img, pts =co_img, color=(10,10,10)) - img_poly=cv2.fillPoly(img, pts =co_sep, color=(11,11,11)) - img_poly=cv2.fillPoly(img, pts =co_table, color=(12,12,12)) - img_poly=cv2.fillPoly(img, pts =co_graphic, color=(13,13,14)) - img_poly=cv2.fillPoly(img, pts =co_noise, color=(15,15,15)) - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - - elif self.experiment=='layout_for_main_regions_new_concept': - region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - #print(region_tags) - co_text=[] - co_sep=[] - co_img=[] - co_drop = [] - co_graphic=[] - co_table = [] - - for tag in region_tags: - if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - c_t_in_drop = [] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - else: - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - else: - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - if len(c_t_in)>0: - co_text.append(np.array(c_t_in)) - if len(c_t_in_drop)>0: - co_drop.append(np.array(c_t_in_drop)) - - elif tag.endswith('}ImageRegion') or tag.endswith('}GraphicRegion') or tag.endswith('}imageregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_img.append(np.array(c_t_in)) - - elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_sep.append(np.array(c_t_in)) - - elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): - 
#print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_table.append(np.array(c_t_in)) - - img_boundary = np.zeros( (y_len,x_len) ) - - - co_text_eroded = [] - for con in co_text: - #try: - img_boundary_in = np.zeros( (y_len,x_len) ) - img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - #print('bidiahhhhaaa') - - - - #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica - img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=2) - - pixel = 1 - min_size = 0 - con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) - - try: - co_text_eroded.append(con_eroded[0]) - except: - co_text_eroded.append(con) - - img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=4) - #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) - - boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - img_boundary[:,:][boundary[:,:]==1] =1 - - - ###co_table_eroded = [] - ###for con in co_table: - ####try: - ###img_boundary_in = np.zeros( (y_len,x_len) ) - ###img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - ####print('bidiahhhhaaa') - - - - #####img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica - ###img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=2) - - ###pixel = 1 - ###min_size = 0 - ###con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) - - ###try: - ###co_table_eroded.append(con_eroded[0]) - ###except: - ###co_table_eroded.append(con) - - ###img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=4) - - ###boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - ###img_boundary[:,:][boundary[:,:]==1] =1 - #except: - #pass - - #for con in co_img: - #img_boundary_in = np.zeros( (y_len,x_len) ) - #img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=3) - - #boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - #img_boundary[:,:][boundary[:,:]==1] =1 - - - #for con in co_sep: - - #img_boundary_in = np.zeros( (y_len,x_len) ) - #img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=3) - - #boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - img_boundary[:,:][boundary[:,:]==1] =1 - for con in co_drop: - img_boundary_in = np.zeros( (y_len,x_len) ) - img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=3) - - boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - img_boundary[:,:][boundary[:,:]==1] =1 - - - img = np.zeros( (y_len,x_len,3) ) - - if self.output_type == '2d': - img_poly=cv2.fillPoly(img, pts =co_img, color=(2,2,2)) - - img_poly=cv2.fillPoly(img, pts =co_text_eroded, color=(1,1,1)) - 
##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(4,4,4)) - ###img_poly=cv2.fillPoly(img, pts =co_table, color=(1,1,1)) - - img_poly=cv2.fillPoly(img, pts =co_drop, color=(1,1,1)) - img_poly[:,:][img_boundary[:,:]==1] = 4 - img_poly=cv2.fillPoly(img, pts =co_sep, color=(3,3,3)) - elif self.output_type == '3d': - img_poly=cv2.fillPoly(img, pts =co_img, color=(0,255,0)) - img_poly=cv2.fillPoly(img, pts =co_text_eroded, color=(255,0,0)) - img_poly=cv2.fillPoly(img, pts =co_drop, color=(0,125,255)) - - img_poly[:,:,0][img_boundary[:,:]==1]=255 - img_poly[:,:,1][img_boundary[:,:]==1]=125 - img_poly[:,:,2][img_boundary[:,:]==1]=125 - - img_poly=cv2.fillPoly(img, pts =co_sep, color=(0,0,255)) - ##img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) - - #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png') - try: - #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png') - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - - - #except: - #pass def run(self,config_params): self.get_content_of_dir() self.get_images_of_ground_truth(config_params) @@ -1631,9 +755,9 @@ class pagexml2word: ) @click.option( - "--layout_config", - "-lc", - help="config file of prefered layout.", + "--config", + "-cfg", + help="config file of prefered layout or use case.", type=click.Path(exists=True, dir_okay=False), ) @@ -1642,21 +766,16 @@ class pagexml2word: "-to", help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.", ) -@click.option( - "--experiment", - "-exp", - help="experiment of ineterst. 
Word , textline , glyph and textregion are desired options.", -) -def main(dir_xml,dir_out,type_output,experiment,layout_config): - if layout_config: - with open(layout_config) as f: +def main(dir_xml,dir_out,type_output,config): + if config: + with open(config) as f: config_params = json.load(f) else: print("passed") config_params = None - x=pagexml2word(dir_xml,dir_out,type_output,experiment, layout_config) + x=pagexml2label(dir_xml,dir_out,type_output, config) x.run(config_params) if __name__=="__main__": main() From f5746011f6d04e73617a3bbd750ea39c6864e41c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 23 May 2024 17:36:23 +0200 Subject: [PATCH 15/62] use case printspace is added --- pagexml2label.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pagexml2label.py b/pagexml2label.py index 16cda8b..94596db 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -126,7 +126,7 @@ class pagexml2label: y_len=int(jj.attrib['imageHeight']) x_len=int(jj.attrib['imageWidth']) - if self.config and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph'): + if self.config and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): keys = list(config_params.keys()) if "artificial_class_label" in keys: artificial_class_rgb_color = (255,255,0) @@ -140,6 +140,9 @@ class pagexml2label: region_tags = np.unique([x for x in alltags if x.endswith('Word')]) elif config_params['use_case']=='glyph': region_tags = np.unique([x for x in alltags if x.endswith('Glyph')]) + elif config_params['use_case']=='printspace': + region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + co_use_case = [] for tag in region_tags: @@ -149,6 +152,8 @@ class pagexml2label: tag_endings = ['}Word','}word'] elif config_params['use_case']=='glyph': tag_endings = ['}Glyph','}glyph'] + elif config_params['use_case']=='printspace': + tag_endings = ['}PrintSpace','}printspace'] if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): for nn in root1.iter(tag): From bf1468391ad78e45944fb79b4be4611495d00e52 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 24 May 2024 14:42:58 +0200 Subject: [PATCH 16/62] machine based reading order training dataset generator is added --- generate_gt_for_training.py | 194 ++++++ gt_for_enhancement_creator.py | 31 - gt_gen_utils.py | 1239 +++++++++++++++++++++++++++++++++ pagexml2label.py | 789 --------------------- 4 files changed, 1433 insertions(+), 820 deletions(-) create mode 100644 generate_gt_for_training.py delete mode 100644 gt_for_enhancement_creator.py create mode 100644 gt_gen_utils.py delete mode 100644 pagexml2label.py diff --git a/generate_gt_for_training.py b/generate_gt_for_training.py new file mode 100644 index 0000000..e296029 --- /dev/null +++ b/generate_gt_for_training.py @@ -0,0 +1,194 @@ +import click +import json +from gt_gen_utils import * +from tqdm import tqdm + +@click.group() +def main(): + pass + +@main.command() +@click.option( + "--dir_xml", + "-dx", + help="directory of GT page-xml files", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--dir_out", + "-do", + help="directory where ground truth images would be written", + type=click.Path(exists=True, file_okay=False), +) + +@click.option( + "--config", + "-cfg", + help="config file of prefered layout or use case.", + type=click.Path(exists=True, dir_okay=False), +) + 
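+# The command below writes, for every ordered pair (i, j) of text regions on a
+# page, a 3-channel image and a 0/1 class: channel 0 is the mask of region i
+# (layout label 5 re-marked as 2, a 12-pixel strip under each header as 3),
+# channel 1 is the page layout from read_xml, channel 2 is the mask of region j,
+# each channel resized to input_height x input_width. The saved class is 1 when
+# region i precedes region j in reading order, otherwise 0.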
+@click.option( + "--type_output", + "-to", + help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.", +) + +def pagexml2label(dir_xml,dir_out,type_output,config): + if config: + with open(config) as f: + config_params = json.load(f) + else: + print("passed") + config_params = None + gt_list = get_content_of_dir(dir_xml) + get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params) + +@main.command() +@click.option( + "--dir_imgs", + "-dis", + help="directory of images with high resolution.", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--dir_out_images", + "-dois", + help="directory where degraded images will be written.", + type=click.Path(exists=True, file_okay=False), +) + +@click.option( + "--dir_out_labels", + "-dols", + help="directory where original images will be written as labels.", + type=click.Path(exists=True, file_okay=False), +) +def image_enhancement(dir_imgs, dir_out_images, dir_out_labels): + #dir_imgs = './training_data_sample_enhancement/images' + #dir_out_images = './training_data_sample_enhancement/images_gt' + #dir_out_labels = './training_data_sample_enhancement/labels_gt' + + ls_imgs = os.listdir(dir_imgs) + ls_scales = [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] + + for img in tqdm(ls_imgs): + img_name = img.split('.')[0] + img_type = img.split('.')[1] + image = cv2.imread(os.path.join(dir_imgs, img)) + for i, scale in enumerate(ls_scales): + height_sc = int(image.shape[0]*scale) + width_sc = int(image.shape[1]*scale) + + image_down_scaled = resize_image(image, height_sc, width_sc) + image_back_to_org_scale = resize_image(image_down_scaled, image.shape[0], image.shape[1]) + + cv2.imwrite(os.path.join(dir_out_images, img_name+'_'+str(i)+'.'+img_type), image_back_to_org_scale) + cv2.imwrite(os.path.join(dir_out_labels, img_name+'_'+str(i)+'.'+img_type), image) + + +@main.command() +@click.option( + "--dir_xml", + "-dx", + help="directory of GT page-xml files", + type=click.Path(exists=True, file_okay=False), +) + +@click.option( + "--dir_out_modal_image", + "-domi", + help="directory where ground truth images would be written", + type=click.Path(exists=True, file_okay=False), +) + +@click.option( + "--dir_out_classes", + "-docl", + help="directory where ground truth classes would be written", + type=click.Path(exists=True, file_okay=False), +) + +@click.option( + "--input_height", + "-ih", + help="input_height", +) +@click.option( + "--input_width", + "-iw", + help="input_width", +) + +def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width): + xml_files_ind = os.listdir(dir_xml) + input_height = int(input_height) + input_width = int(input_width) + + indexer_start= 0#55166 + max_area = 1 + min_area = 0.0001 + + for ind_xml in tqdm(xml_files_ind): + indexer = 0 + #print(ind_xml) + #print('########################') + xml_file = os.path.join(dir_xml,ind_xml ) + f_name = ind_xml.split('.')[0] + file_name, id_paragraph, id_header,co_text_paragraph,\ + co_text_header,tot_region_ref,x_len, y_len,index_tot_regions,img_poly = read_xml(xml_file) + + id_all_text = id_paragraph + id_header + co_text_all = co_text_paragraph + co_text_header + + + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = 
find_new_features_of_contours(co_text_header) + + img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + + for j in range(len(cy_main)): + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 + + + texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] + texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] + + + co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area) + + arg_array = np.array(range(len(texts_corr_order_index_int))) + + labels_con = np.zeros((y_len,x_len,len(arg_array)),dtype='uint8') + for i in range(len(co_text_all)): + img_label = np.zeros((y_len,x_len,3),dtype='uint8') + img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1)) + + img_label[:,:,0][img_poly[:,:,0]==5] = 2 + img_label[:,:,0][img_header_and_sep[:,:]==1] = 3 + + labels_con[:,:,i] = img_label[:,:,0] + + for i in range(len(texts_corr_order_index_int)): + for j in range(len(texts_corr_order_index_int)): + if i!=j: + input_matrix = np.zeros((input_height,input_width,3)).astype(np.int8) + final_f_name = f_name+'_'+str(indexer+indexer_start) + order_class_condition = texts_corr_order_index_int[i]-texts_corr_order_index_int[j] + if order_class_condition<0: + class_type = 1 + else: + class_type = 0 + + input_matrix[:,:,0] = resize_image(labels_con[:,:,i], input_height, input_width) + input_matrix[:,:,1] = resize_image(img_poly[:,:,0], input_height, input_width) + input_matrix[:,:,2] = resize_image(labels_con[:,:,j], input_height, input_width) + + np.save(os.path.join(dir_out_classes,final_f_name+'.npy' ), class_type) + + cv2.imwrite(os.path.join(dir_out_modal_image,final_f_name+'.png' ), input_matrix) + indexer = indexer+1 + + + +if __name__ == "__main__": + main() diff --git a/gt_for_enhancement_creator.py b/gt_for_enhancement_creator.py deleted file mode 100644 index 9a4274f..0000000 --- a/gt_for_enhancement_creator.py +++ /dev/null @@ -1,31 +0,0 @@ -import cv2 -import os - -def resize_image(seg_in, input_height, input_width): - return cv2.resize(seg_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) - - -dir_imgs = './training_data_sample_enhancement/images' -dir_out_imgs = './training_data_sample_enhancement/images_gt' -dir_out_labs = './training_data_sample_enhancement/labels_gt' - -ls_imgs = os.listdir(dir_imgs) - - -ls_scales = [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] - - -for img in ls_imgs: - img_name = img.split('.')[0] - img_type = img.split('.')[1] - image = cv2.imread(os.path.join(dir_imgs, img)) - for i, scale in enumerate(ls_scales): - height_sc = int(image.shape[0]*scale) - width_sc = int(image.shape[1]*scale) - - image_down_scaled = resize_image(image, height_sc, width_sc) - image_back_to_org_scale = resize_image(image_down_scaled, image.shape[0], image.shape[1]) - - cv2.imwrite(os.path.join(dir_out_imgs, img_name+'_'+str(i)+'.'+img_type), image_back_to_org_scale) - cv2.imwrite(os.path.join(dir_out_labs, img_name+'_'+str(i)+'.'+img_type), image) - diff --git a/gt_gen_utils.py b/gt_gen_utils.py new file mode 100644 index 0000000..9862e29 --- /dev/null +++ b/gt_gen_utils.py @@ -0,0 +1,1239 @@ +import click +import sys +import os +import numpy as np +import warnings +import xml.etree.ElementTree as ET +from tqdm import tqdm +import cv2 +from shapely import geometry +from pathlib import Path + + +KERNEL = np.ones((5, 5), np.uint8) + +with warnings.catch_warnings(): + 
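+    # Note: catch_warnings() restores the previous filter state when this block
+    # exits, so the "ignore" filter below is only active inside this block.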
warnings.simplefilter("ignore") + +def get_content_of_dir(dir_in): + """ + Listing all ground truth page xml files. All files are needed to have xml format. + """ + + gt_all=os.listdir(dir_in) + gt_list=[file for file in gt_all if file.split('.')[ len(file.split('.'))-1 ]=='xml' ] + return gt_list + +def return_parent_contours(contours, hierarchy): + contours_parent = [contours[i] for i in range(len(contours)) if hierarchy[0][i][3] == -1] + return contours_parent +def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, min_area): + found_polygons_early = list() + + jv = 0 + for c in contours: + if len(c) < 3: # A polygon cannot have less than 3 points + continue + + polygon = geometry.Polygon([point[0] for point in c]) + # area = cv2.contourArea(c) + area = polygon.area + ##print(np.prod(thresh.shape[:2])) + # Check that polygon has area greater than minimal area + # print(hierarchy[0][jv][3],hierarchy ) + if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : + # print(c[0][0][1]) + found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.int32)) + jv += 1 + return found_polygons_early + +def filter_contours_area_of_image(image, contours, order_index, max_area, min_area): + found_polygons_early = list() + order_index_filtered = list() + #jv = 0 + for jv, c in enumerate(contours): + #print(len(c[0])) + c = c[0] + if len(c) < 3: # A polygon cannot have less than 3 points + continue + c_e = [point for point in c] + #print(c_e) + polygon = geometry.Polygon(c_e) + area = polygon.area + #print(area,'area') + if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : + found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.uint)) + order_index_filtered.append(order_index[jv]) + #jv += 1 + return found_polygons_early, order_index_filtered + +def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002): + + # pixels of images are identified by 5 + if len(region_pre_p.shape) == 3: + cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + else: + cnts_images = (region_pre_p[:, :] == pixel) * 1 + cnts_images = cnts_images.astype(np.uint8) + cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) + imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + contours_imgs = return_parent_contours(contours_imgs, hierarchy) + contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) + + return contours_imgs +def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len): + co_text_eroded = [] + for con in co_text: + #try: + img_boundary_in = np.zeros( (y_len,x_len) ) + img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + #print('bidiahhhhaaa') + + + + #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica + if erosion_rate > 0: + img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=erosion_rate) + + pixel = 1 + min_size = 0 + con_eroded = return_contours_of_interested_region(img_boundary_in,pixel, min_size ) + + try: + co_text_eroded.append(con_eroded[0]) + except: + co_text_eroded.append(con) + + + img_boundary_in_dilated = 
cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=dilation_rate) + #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) + + boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] + + img_boundary[:,:][boundary[:,:]==1] =1 + return co_text_eroded, img_boundary +def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params): + """ + Reading the page xml files and write the ground truth images into given output directory. + """ + ## to do: add footnote to text regions + for index in tqdm(range(len(gt_list))): + #try: + tree1 = ET.parse(dir_in+'/'+gt_list[index]) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + + if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): + keys = list(config_params.keys()) + if "artificial_class_label" in keys: + artificial_class_rgb_color = (255,255,0) + artificial_class_label = config_params['artificial_class_label'] + + textline_rgb_color = (255, 0, 0) + + if config_params['use_case']=='textline': + region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) + elif config_params['use_case']=='word': + region_tags = np.unique([x for x in alltags if x.endswith('Word')]) + elif config_params['use_case']=='glyph': + region_tags = np.unique([x for x in alltags if x.endswith('Glyph')]) + elif config_params['use_case']=='printspace': + region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + + co_use_case = [] + + for tag in region_tags: + if config_params['use_case']=='textline': + tag_endings = ['}TextLine','}textline'] + elif config_params['use_case']=='word': + tag_endings = ['}Word','}word'] + elif config_params['use_case']=='glyph': + tag_endings = ['}Glyph','}glyph'] + elif config_params['use_case']=='printspace': + tag_endings = ['}PrintSpace','}printspace'] + + if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_use_case.append(np.array(c_t_in)) + + + + if "artificial_class_label" in keys: + img_boundary = np.zeros((y_len, x_len)) + erosion_rate = 1 + dilation_rate = 3 + co_use_case, img_boundary = update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + + + img = np.zeros((y_len, x_len, 3)) + if output_type == '2d': + img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) + if "artificial_class_label" in keys: + img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + elif output_type == '3d': + img_poly = cv2.fillPoly(img, pts=co_use_case, color=textline_rgb_color) + if "artificial_class_label" in keys: + img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] + img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] + img_poly[:,:,2][img_boundary[:,:]==1] = 
artificial_class_rgb_color[2] + + try: + cv2.imwrite(output_dir + '/' + gt_list[index].split('-')[1].split('.')[0] + '.png', + img_poly) + except: + cv2.imwrite(output_dir + '/' + gt_list[index].split('.')[0] + '.png', img_poly) + + + if config_file and config_params['use_case']=='layout': + keys = list(config_params.keys()) + if "artificial_class_on_boundry" in keys: + elements_with_artificial_class = list(config_params['artificial_class_on_boundry']) + artificial_class_rgb_color = (255,255,0) + artificial_class_label = config_params['artificial_class_label'] + #values = config_params.values() + + if 'textregions' in keys: + types_text_dict = config_params['textregions'] + types_text = list(types_text_dict.keys()) + types_text_label = list(types_text_dict.values()) + print(types_text) + if 'graphicregions' in keys: + types_graphic_dict = config_params['graphicregions'] + types_graphic = list(types_graphic_dict.keys()) + types_graphic_label = list(types_graphic_dict.values()) + + + labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)] + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + + co_text_paragraph=[] + co_text_footnote=[] + co_text_footnote_con=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic_signature=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_graphic_stamp=[] + co_noise=[] + + for tag in region_tags: + if 'textregions' in keys: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + c_t_in_footnote=[] + c_t_in_footnote_con=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + if "drop-capital" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "heading" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "signature-mark" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='signature-mark': + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "header" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "catch-word" in types_text: + if "type" in nn.attrib and 
nn.attrib['type']=='catch-word': + c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "page-number" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='page-number': + c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "marginalia" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='marginalia': + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "paragraph" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='paragraph': + c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + break + else: + pass + + + if vv.tag==link+'Point': + if "drop-capital" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "heading" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='heading': + c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "signature-mark" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='signature-mark': + c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "header" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='header': + c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "catch-word" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "page-number" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='page-number': + c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "marginalia" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='marginalia': + c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "paragraph" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='paragraph': + c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_footnote_con)>0: + co_text_footnote_con.append(np.array(c_t_in_footnote_con)) + if len(c_t_in_footnote)>0: + co_text_footnote.append(np.array(c_t_in_footnote)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + 
co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + + + if 'graphicregions' in keys: + if tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in_stamp=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + c_t_in_signature=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + if "handwritten-annotation" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "decoration" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "stamp" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='stamp': + c_t_in_stamp.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "signature" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='signature': + c_t_in_signature.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + + break + else: + pass + + + if vv.tag==link+'Point': + if "handwritten-annotation" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "decoration" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='decoration': + c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "stamp" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='stamp': + c_t_in_stamp.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "signature" in types_graphic: + if "type" in nn.attrib and nn.attrib['type']=='signature': + c_t_in_signature.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in_stamp)>0: + co_graphic_stamp.append(np.array(c_t_in_stamp)) + if len(c_t_in_signature)>0: + co_graphic_signature.append(np.array(c_t_in_signature)) + + if 'imageregion' in keys: + if tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + + + if 'separatorregion' in keys: + if tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + 
p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + if 'tableregion' in keys: + if tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + + if 'noiseregion' in keys: + if tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + + if "artificial_class_on_boundry" in keys: + img_boundary = np.zeros( (y_len,x_len) ) + if "paragraph" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_paragraph, img_boundary = update_region_contours(co_text_paragraph, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "drop-capital" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_drop, img_boundary = update_region_contours(co_text_drop, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "catch-word" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_catch, img_boundary = update_region_contours(co_text_catch, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "page-number" in elements_with_artificial_class: + erosion_rate = 0 + dilation_rate = 4 + co_text_page_number, img_boundary = update_region_contours(co_text_page_number, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "header" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_header, img_boundary = update_region_contours(co_text_header, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "heading" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_heading, img_boundary = update_region_contours(co_text_heading, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "signature-mark" in elements_with_artificial_class: + erosion_rate = 1 + dilation_rate = 4 + co_text_signature_mark, img_boundary = update_region_contours(co_text_signature_mark, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "marginalia" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_marginalia, img_boundary = update_region_contours(co_text_marginalia, 
img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_footnote, img_boundary = update_region_contours(co_text_footnote, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote-continued" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_footnote_con, img_boundary = update_region_contours(co_text_footnote_con, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + + + + img = np.zeros( (y_len,x_len,3) ) + + if output_type == '3d': + + if 'graphicregions' in keys: + if "handwritten-annotation" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=labels_rgb_color[ config_params['graphicregions']['handwritten-annotation']]) + if "signature" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=labels_rgb_color[ config_params['graphicregions']['signature']]) + if "decoration" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=labels_rgb_color[ config_params['graphicregions']['decoration']]) + if "stamp" in types_graphic: + img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=labels_rgb_color[ config_params['graphicregions']['stamp']]) + + if 'imageregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']]) + if 'separatorregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_sep, color=labels_rgb_color[ config_params['separatorregion']]) + if 'tableregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_table, color=labels_rgb_color[ config_params['tableregion']]) + if 'noiseregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_noise, color=labels_rgb_color[ config_params['noiseregion']]) + + if 'textregions' in keys: + if "paragraph" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) + if "footnote" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=labels_rgb_color[ config_params['textregions']['footnote']]) + if "footnote-continued" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=labels_rgb_color[ config_params['textregions']['footnote-continued']]) + if "heading" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) + if "header" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_header, color=labels_rgb_color[ config_params['textregions']['header']]) + if "catch-word" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=labels_rgb_color[ config_params['textregions']['catch-word']]) + if "signature-mark" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=labels_rgb_color[ config_params['textregions']['signature-mark']]) + if "page-number" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=labels_rgb_color[ config_params['textregions']['page-number']]) + if "marginalia" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=labels_rgb_color[ config_params['textregions']['marginalia']]) + if "drop-capital" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_drop, color=labels_rgb_color[ config_params['textregions']['drop-capital']]) + + if "artificial_class_on_boundry" in keys: + img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] + 
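
The img_boundary mask painted in these channel assignments is produced by update_region_contours from the per-type erosion and dilation rates chosen above. A minimal standalone sketch of that step, assuming a single toy contour and the same 5x5 kernel (the helper name boundary_for_region is made up for illustration and is not part of the patch):

import numpy as np
import cv2

def boundary_for_region(contour, shape, erosion_iters, dilation_iters,
                        kernel=np.ones((5, 5), np.uint8)):
    # Fill the region, optionally shrink it, then grow it again; the
    # difference between the grown mask and the (shrunk) mask is the
    # "artificial" boundary class.
    filled = np.zeros(shape, dtype=np.uint8)
    cv2.fillPoly(filled, pts=[contour], color=1)
    if erosion_iters > 0:
        filled = cv2.erode(filled, kernel, iterations=erosion_iters)
    dilated = cv2.dilate(filled, kernel, iterations=dilation_iters)
    return filled, dilated - filled

# toy example: one rectangular paragraph on a 60x60 page
contour = np.array([[15, 15], [45, 15], [45, 45], [15, 45]], dtype=np.int32)
region, boundary = boundary_for_region(contour, (60, 60), erosion_iters=2, dilation_iters=4)
label = np.zeros((60, 60), dtype=np.uint8)
label[region == 1] = 1      # e.g. paragraph class
label[boundary == 1] = 2    # artificial boundary class
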
img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] + img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + + + + + elif output_type == '2d': + if 'graphicregions' in keys: + if "handwritten-annotation" in types_graphic: + color_label = config_params['graphicregions']['handwritten-annotation'] + img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(color_label,color_label,color_label)) + if "signature" in types_graphic: + color_label = config_params['graphicregions']['signature'] + img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=(color_label,color_label,color_label)) + if "decoration" in types_graphic: + color_label = config_params['graphicregions']['decoration'] + img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(color_label,color_label,color_label)) + if "stamp" in types_graphic: + color_label = config_params['graphicregions']['stamp'] + img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=(color_label,color_label,color_label)) + + if 'imageregion' in keys: + color_label = config_params['imageregion'] + img_poly=cv2.fillPoly(img, pts =co_img, color=(color_label,color_label,color_label)) + if 'separatorregion' in keys: + color_label = config_params['separatorregion'] + img_poly=cv2.fillPoly(img, pts =co_sep, color=(color_label,color_label,color_label)) + if 'tableregion' in keys: + color_label = config_params['tableregion'] + img_poly=cv2.fillPoly(img, pts =co_table, color=(color_label,color_label,color_label)) + if 'noiseregion' in keys: + color_label = config_params['noiseregion'] + img_poly=cv2.fillPoly(img, pts =co_noise, color=(color_label,color_label,color_label)) + + if 'textregions' in keys: + if "paragraph" in types_text: + color_label = config_params['textregions']['paragraph'] + img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) + if "footnote" in types_text: + color_label = config_params['textregions']['footnote'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=(color_label,color_label,color_label)) + if "footnote-continued" in types_text: + color_label = config_params['textregions']['footnote-continued'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=(color_label,color_label,color_label)) + if "heading" in types_text: + color_label = config_params['textregions']['heading'] + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label)) + if "header" in types_text: + color_label = config_params['textregions']['header'] + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(color_label,color_label,color_label)) + if "catch-word" in types_text: + color_label = config_params['textregions']['catch-word'] + img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(color_label,color_label,color_label)) + if "signature-mark" in types_text: + color_label = config_params['textregions']['signature-mark'] + img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(color_label,color_label,color_label)) + if "page-number" in types_text: + color_label = config_params['textregions']['page-number'] + img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(color_label,color_label,color_label)) + if "marginalia" in types_text: + color_label = config_params['textregions']['marginalia'] + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(color_label,color_label,color_label)) + if "drop-capital" in types_text: + color_label = config_params['textregions']['drop-capital'] + img_poly=cv2.fillPoly(img, pts 
=co_text_drop, color=(color_label,color_label,color_label)) + + if "artificial_class_on_boundry" in keys: + img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + + + + + try: + cv2.imwrite(output_dir+'/'+gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + except: + cv2.imwrite(output_dir+'/'+gt_list[index].split('.')[0]+'.png',img_poly ) + + + +def find_new_features_of_contours(contours_main): + + #print(contours_main[0][0][:, 0]) + + areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) + M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] + cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + try: + x_min_main = np.array([np.min(contours_main[j][0][:, 0]) for j in range(len(contours_main))]) + + argmin_x_main = np.array([np.argmin(contours_main[j][0][:, 0]) for j in range(len(contours_main))]) + + x_min_from_argmin = np.array([contours_main[j][0][argmin_x_main[j], 0] for j in range(len(contours_main))]) + y_corr_x_min_from_argmin = np.array([contours_main[j][0][argmin_x_main[j], 1] for j in range(len(contours_main))]) + + x_max_main = np.array([np.max(contours_main[j][0][:, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][0][:, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][0][:, 1]) for j in range(len(contours_main))]) + except: + x_min_main = np.array([np.min(contours_main[j][:, 0]) for j in range(len(contours_main))]) + + argmin_x_main = np.array([np.argmin(contours_main[j][:, 0]) for j in range(len(contours_main))]) + + x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 0] for j in range(len(contours_main))]) + y_corr_x_min_from_argmin = np.array([contours_main[j][argmin_x_main[j], 1] for j in range(len(contours_main))]) + + x_max_main = np.array([np.max(contours_main[j][:, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][:, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][:, 1]) for j in range(len(contours_main))]) + + # dis_x=np.abs(x_max_main-x_min_main) + + return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin +def read_xml(xml_file): + file_name = Path(xml_file).stem + tree1 = ET.parse(xml_file) + root1=tree1.getroot() + alltags=[elem.tag for elem in root1.iter()] + link=alltags[0].split('}')[0]+'}' + + index_tot_regions = [] + tot_region_ref = [] + + for jj in root1.iter(link+'Page'): + y_len=int(jj.attrib['imageHeight']) + x_len=int(jj.attrib['imageWidth']) + + + for jj in root1.iter(link+'RegionRefIndexed'): + index_tot_regions.append(jj.attrib['index']) + tot_region_ref.append(jj.attrib['regionRef']) + + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) + #print(region_tags) + co_text_paragraph=[] + co_text_drop=[] + co_text_heading=[] + co_text_header=[] + co_text_marginalia=[] + co_text_catch=[] + co_text_page_number=[] + co_text_signature_mark=[] + co_sep=[] + co_img=[] + co_table=[] + co_graphic=[] + co_graphic_text_annotation=[] + co_graphic_decoration=[] + co_noise=[] + + + co_text_paragraph_text=[] + co_text_drop_text=[] + co_text_heading_text=[] + co_text_header_text=[] + co_text_marginalia_text=[] + co_text_catch_text=[] + co_text_page_number_text=[] + co_text_signature_mark_text=[] + co_sep_text=[] + co_img_text=[] + 
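
find_new_features_of_contours, defined above, reduces each contour to its moment-based centroid plus its bounding extremes; machine_based_reading_order relies on these values, e.g. to paint a 12-pixel separator strip below every header region. A self-contained sketch of the same computation for a single contour (the toy coordinates and page size are illustrative only):

import numpy as np
import cv2

contour = np.array([[[5, 8]], [[25, 8]], [[25, 20]], [[5, 20]]], dtype=np.int32)

m = cv2.moments(contour)
cx = m["m10"] / (m["m00"] + 1e-32)          # centroid x
cy = m["m01"] / (m["m00"] + 1e-32)          # centroid y
x_min, x_max = contour[:, 0, 0].min(), contour[:, 0, 0].max()
y_min, y_max = contour[:, 0, 1].min(), contour[:, 0, 1].max()

# e.g. mark a separator strip just below the region, as done for headers:
page = np.zeros((100, 100), dtype='uint8')
page[int(y_max):int(y_max) + 12, int(x_min):int(x_max)] = 1
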
co_table_text=[] + co_graphic_text=[] + co_graphic_text_annotation_text=[] + co_graphic_decoration_text=[] + co_noise_text=[] + + + id_paragraph = [] + id_header = [] + id_heading = [] + id_marginalia = [] + + for tag in region_tags: + if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): + for nn in root1.iter(tag): + for child2 in nn: + tag2 = child2.tag + #print(child2.tag) + if tag2.endswith('}TextEquiv') or tag2.endswith('}TextEquiv'): + #children2 = childtext.getchildren() + #rank = child2.find('Unicode').text + for childtext2 in child2: + #rank = childtext2.find('Unicode').text + #if childtext2.tag.endswith('}PlainText') or childtext2.tag.endswith('}PlainText'): + #print(childtext2.text) + if childtext2.tag.endswith('}Unicode') or childtext2.tag.endswith('}Unicode'): + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + co_text_drop_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='heading': + co_text_heading_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + co_text_signature_mark_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='header': + co_text_header_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + co_text_catch_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='page-number': + co_text_page_number_text.append(childtext2.text) + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + co_text_marginalia_text.append(childtext2.text) + else: + co_text_paragraph_text.append(childtext2.text) + c_t_in_drop=[] + c_t_in_paragraph=[] + c_t_in_heading=[] + c_t_in_header=[] + c_t_in_page_number=[] + c_t_in_signature_mark=[] + c_t_in_catch=[] + c_t_in_marginalia=[] + + + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + + coords=bool(vv.attrib) + if coords: + #print('birda1') + p_h=vv.attrib['points'].split(' ') + + + + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + #if nn.attrib['type']=='paragraph': + + c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + id_heading.append(nn.attrib['id']) + c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + elif "type" in nn.attrib and nn.attrib['type']=='header': + id_header.append(nn.attrib['id']) + c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + + elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + else: + #print(nn.attrib['id']) + + id_paragraph.append(nn.attrib['id']) + + c_t_in_paragraph.append( np.array( [ [ 
int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + + break + else: + pass + + + if vv.tag==link+'Point': + if "type" in nn.attrib and nn.attrib['type']=='drop-capital': + #if nn.attrib['type']=='paragraph': + + c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='heading': + id_heading.append(nn.attrib['id']) + c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': + + c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + elif "type" in nn.attrib and nn.attrib['type']=='header': + id_header.append(nn.attrib['id']) + c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + + elif "type" in nn.attrib and nn.attrib['type']=='page-number': + + c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='marginalia': + id_marginalia.append(nn.attrib['id']) + + c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + else: + id_paragraph.append(nn.attrib['id']) + c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + + #c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + + if len(c_t_in_drop)>0: + co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_paragraph)>0: + co_text_paragraph.append(np.array(c_t_in_paragraph)) + if len(c_t_in_heading)>0: + co_text_heading.append(np.array(c_t_in_heading)) + + if len(c_t_in_header)>0: + co_text_header.append(np.array(c_t_in_header)) + if len(c_t_in_page_number)>0: + co_text_page_number.append(np.array(c_t_in_page_number)) + if len(c_t_in_catch)>0: + co_text_catch.append(np.array(c_t_in_catch)) + + if len(c_t_in_signature_mark)>0: + co_text_signature_mark.append(np.array(c_t_in_signature_mark)) + + if len(c_t_in_marginalia)>0: + co_text_marginalia.append(np.array(c_t_in_marginalia)) + + + elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + c_t_in_text_annotation=[] + c_t_in_decoration=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + #c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + #if nn.attrib['type']=='paragraph': + + c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + + c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + #print(c_t_in_paragraph) + else: + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + 
+ + break + else: + pass + + + if vv.tag==link+'Point': + + if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': + #if nn.attrib['type']=='paragraph': + + c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + elif "type" in nn.attrib and nn.attrib['type']=='decoration': + + c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #print(c_t_in_paragraph) + sumi+=1 + else: + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if len(c_t_in_text_annotation)>0: + co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) + if len(c_t_in_decoration)>0: + co_graphic_decoration.append(np.array(c_t_in_decoration)) + if len(c_t_in)>0: + co_graphic.append(np.array(c_t_in)) + + + + elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_img.append(np.array(c_t_in)) + co_img_text.append(' ') + + + elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_sep.append(np.array(c_t_in)) + + + + elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_table.append(np.array(c_t_in)) + co_table_text.append(' ') + + elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): + #print('sth') + for nn in root1.iter(tag): + c_t_in=[] + sumi=0 + for vv in nn.iter(): + # check the format of coords + if vv.tag==link+'Coords': + coords=bool(vv.attrib) + if coords: + p_h=vv.attrib['points'].split(' ') + c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + break + else: + pass + + + if vv.tag==link+'Point': + c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: + break + co_noise.append(np.array(c_t_in)) + co_noise_text.append(' ') + + + img = np.zeros( (y_len,x_len,3) ) + + 
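
The region loops above all follow the same pattern: the points attribute of a Coords element is split into integer (x, y) pairs, collected into one contour per region, and the contours are finally rasterized with cv2.fillPoly as in the lines that follow. A standalone sketch of that pattern (the points string and class value are made up for illustration):

import numpy as np
import cv2

points_attr = "10,10 80,10 80,40 10,40"      # content of <Coords points="..."/>
contour = np.array([[int(p.split(',')[0]), int(p.split(',')[1])]
                    for p in points_attr.split(' ')], dtype=np.int32)

label_img = np.zeros((60, 100, 3))
cv2.fillPoly(label_img, pts=[contour], color=(1, 1, 1))   # e.g. paragraph -> label 1
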
img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) + + img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) + img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) + #img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(125,255,125)) + #img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(125,125,0)) + #img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(1,125,255)) + #img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(1,125,0)) + img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(3,3,3)) + #img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(1,125,255)) + + #img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(125,0,125)) + img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) + img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) + #img_poly=cv2.fillPoly(img, pts =co_table, color=(1,255,255)) + #img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) + #img_poly=cv2.fillPoly(img, pts =co_noise, color=(255,0,255)) + + #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg') + ###try: + ####print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg') + ###cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.jpg',img_poly ) + ###except: + ###cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg',img_poly ) + return file_name, id_paragraph, id_header,co_text_paragraph, co_text_header,\ +tot_region_ref,x_len, y_len,index_tot_regions, img_poly + + + + +def bounding_box(cnt,color, corr_order_index ): + x, y, w, h = cv2.boundingRect(cnt) + x = int(x*scale_w) + y = int(y*scale_h) + + w = int(w*scale_w) + h = int(h*scale_h) + + return [x,y,w,h,int(color), int(corr_order_index)+1] + +def resize_image(seg_in,input_height,input_width): + return cv2.resize(seg_in,(input_width,input_height),interpolation=cv2.INTER_NEAREST) + +def make_image_from_bb(width_l, height_l, bb_all): + bb_all =np.array(bb_all) + img_remade = np.zeros((height_l,width_l )) + + for i in range(bb_all.shape[0]): + img_remade[bb_all[i,1]:bb_all[i,1]+bb_all[i,3],bb_all[i,0]:bb_all[i,0]+bb_all[i,2] ] = 1 + return img_remade diff --git a/pagexml2label.py b/pagexml2label.py deleted file mode 100644 index 94596db..0000000 --- a/pagexml2label.py +++ /dev/null @@ -1,789 +0,0 @@ -import click -import sys -import os -import numpy as np -import warnings -import xml.etree.ElementTree as ET -from tqdm import tqdm -import cv2 -from shapely import geometry -import json - -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - -__doc__=\ -""" -tool to extract 2d or 3d RGB images from page xml data. In former case output will be 1 -2D image array which each class has filled with a pixel value. In the case of 3D RGB image -each class will be defined with a RGB value and beside images a text file of classes also will be produced. -This classes.txt file is required for dhsegment tool. -""" -KERNEL = np.ones((5, 5), np.uint8) - -class pagexml2label: - def __init__(self,dir_in, out_dir,output_type,config): - self.dir=dir_in - self.output_dir=out_dir - self.output_type=output_type - self.config=config - - def get_content_of_dir(self): - """ - Listing all ground truth page xml files. All files are needed to have xml format. 
- """ - - gt_all=os.listdir(self.dir) - self.gt_list=[file for file in gt_all if file.split('.')[ len(file.split('.'))-1 ]=='xml' ] - - def return_parent_contours(self,contours, hierarchy): - contours_parent = [contours[i] for i in range(len(contours)) if hierarchy[0][i][3] == -1] - return contours_parent - def filter_contours_area_of_image_tables(self,image, contours, hierarchy, max_area, min_area): - found_polygons_early = list() - - jv = 0 - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - # area = cv2.contourArea(c) - area = polygon.area - ##print(np.prod(thresh.shape[:2])) - # Check that polygon has area greater than minimal area - # print(hierarchy[0][jv][3],hierarchy ) - if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : - # print(c[0][0][1]) - found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.int32)) - jv += 1 - return found_polygons_early - - def return_contours_of_interested_region(self,region_pre_p, pixel, min_area=0.0002): - - # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 - else: - cnts_images = (region_pre_p[:, :] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = self.return_parent_contours(contours_imgs, hierarchy) - contours_imgs = self.filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) - - return contours_imgs - def update_region_contours(self, co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len): - co_text_eroded = [] - for con in co_text: - #try: - img_boundary_in = np.zeros( (y_len,x_len) ) - img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - #print('bidiahhhhaaa') - - - - #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica - if erosion_rate > 0: - img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=erosion_rate) - - pixel = 1 - min_size = 0 - con_eroded = self.return_contours_of_interested_region(img_boundary_in,pixel, min_size ) - - try: - co_text_eroded.append(con_eroded[0]) - except: - co_text_eroded.append(con) - - - img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=dilation_rate) - #img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=5) - - boundary = img_boundary_in_dilated[:,:] - img_boundary_in[:,:] - - img_boundary[:,:][boundary[:,:]==1] =1 - return co_text_eroded, img_boundary - def get_images_of_ground_truth(self, config_params): - """ - Reading the page xml files and write the ground truth images into given output directory. 
- """ - ## to do: add footnote to text regions - for index in tqdm(range(len(self.gt_list))): - #try: - tree1 = ET.parse(self.dir+'/'+self.gt_list[index]) - root1=tree1.getroot() - alltags=[elem.tag for elem in root1.iter()] - link=alltags[0].split('}')[0]+'}' - - - - for jj in root1.iter(link+'Page'): - y_len=int(jj.attrib['imageHeight']) - x_len=int(jj.attrib['imageWidth']) - - if self.config and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): - keys = list(config_params.keys()) - if "artificial_class_label" in keys: - artificial_class_rgb_color = (255,255,0) - artificial_class_label = config_params['artificial_class_label'] - - textline_rgb_color = (255, 0, 0) - - if config_params['use_case']=='textline': - region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) - elif config_params['use_case']=='word': - region_tags = np.unique([x for x in alltags if x.endswith('Word')]) - elif config_params['use_case']=='glyph': - region_tags = np.unique([x for x in alltags if x.endswith('Glyph')]) - elif config_params['use_case']=='printspace': - region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace')]) - - co_use_case = [] - - for tag in region_tags: - if config_params['use_case']=='textline': - tag_endings = ['}TextLine','}textline'] - elif config_params['use_case']=='word': - tag_endings = ['}Word','}word'] - elif config_params['use_case']=='glyph': - tag_endings = ['}Glyph','}glyph'] - elif config_params['use_case']=='printspace': - tag_endings = ['}PrintSpace','}printspace'] - - if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) - sumi += 1 - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_use_case.append(np.array(c_t_in)) - - - - if "artificial_class_label" in keys: - img_boundary = np.zeros((y_len, x_len)) - erosion_rate = 1 - dilation_rate = 3 - co_use_case, img_boundary = self.update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - - - img = np.zeros((y_len, x_len, 3)) - if self.output_type == '2d': - img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) - if "artificial_class_label" in keys: - img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label - elif self.output_type == '3d': - img_poly = cv2.fillPoly(img, pts=co_use_case, color=textline_rgb_color) - if "artificial_class_label" in keys: - img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] - img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] - img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] - - try: - cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('-')[1].split('.')[0] + '.png', - img_poly) - except: - cv2.imwrite(self.output_dir + '/' + self.gt_list[index].split('.')[0] + '.png', img_poly) - - - if self.config and config_params['use_case']=='layout': - keys = list(config_params.keys()) - if "artificial_class_on_boundry" in keys: - elements_with_artificial_class = 
list(config_params['artificial_class_on_boundry']) - artificial_class_rgb_color = (255,255,0) - artificial_class_label = config_params['artificial_class_label'] - #values = config_params.values() - - if 'textregions' in keys: - types_text_dict = config_params['textregions'] - types_text = list(types_text_dict.keys()) - types_text_label = list(types_text_dict.values()) - print(types_text) - if 'graphicregions' in keys: - types_graphic_dict = config_params['graphicregions'] - types_graphic = list(types_graphic_dict.keys()) - types_graphic_label = list(types_graphic_dict.values()) - - - labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)] - - region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - - co_text_paragraph=[] - co_text_footnote=[] - co_text_footnote_con=[] - co_text_drop=[] - co_text_heading=[] - co_text_header=[] - co_text_marginalia=[] - co_text_catch=[] - co_text_page_number=[] - co_text_signature_mark=[] - co_sep=[] - co_img=[] - co_table=[] - co_graphic_signature=[] - co_graphic_text_annotation=[] - co_graphic_decoration=[] - co_graphic_stamp=[] - co_noise=[] - - for tag in region_tags: - if 'textregions' in keys: - if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): - for nn in root1.iter(tag): - c_t_in_drop=[] - c_t_in_paragraph=[] - c_t_in_heading=[] - c_t_in_header=[] - c_t_in_page_number=[] - c_t_in_signature_mark=[] - c_t_in_catch=[] - c_t_in_marginalia=[] - c_t_in_footnote=[] - c_t_in_footnote_con=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - - coords=bool(vv.attrib) - if coords: - #print('birda1') - p_h=vv.attrib['points'].split(' ') - - if "drop-capital" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "footnote" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote': - c_t_in_footnote.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "footnote-continued" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': - c_t_in_footnote_con.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "heading" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "signature-mark" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='signature-mark': - c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "header" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "catch-word" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "page-number" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "marginalia" in types_text: - if "type" in nn.attrib and 
nn.attrib['type']=='marginalia': - c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "paragraph" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='paragraph': - c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - break - else: - pass - - - if vv.tag==link+'Point': - if "drop-capital" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "footnote" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote': - c_t_in_footnote.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "footnote-continued" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': - c_t_in_footnote_con.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "heading" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "signature-mark" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='signature-mark': - c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "header" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "catch-word" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "page-number" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "marginalia" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='marginalia': - c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "paragraph" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='paragraph': - c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - - elif vv.tag!=link+'Point' and sumi>=1: - break - - if len(c_t_in_drop)>0: - co_text_drop.append(np.array(c_t_in_drop)) - if len(c_t_in_footnote_con)>0: - co_text_footnote_con.append(np.array(c_t_in_footnote_con)) - if len(c_t_in_footnote)>0: - co_text_footnote.append(np.array(c_t_in_footnote)) - if len(c_t_in_paragraph)>0: - co_text_paragraph.append(np.array(c_t_in_paragraph)) - if len(c_t_in_heading)>0: - co_text_heading.append(np.array(c_t_in_heading)) - - if len(c_t_in_header)>0: - co_text_header.append(np.array(c_t_in_header)) - if len(c_t_in_page_number)>0: - co_text_page_number.append(np.array(c_t_in_page_number)) - if len(c_t_in_catch)>0: - co_text_catch.append(np.array(c_t_in_catch)) - - if len(c_t_in_signature_mark)>0: - co_text_signature_mark.append(np.array(c_t_in_signature_mark)) - - if len(c_t_in_marginalia)>0: - co_text_marginalia.append(np.array(c_t_in_marginalia)) - - - if 'graphicregions' in keys: - if tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in_stamp=[] - c_t_in_text_annotation=[] - c_t_in_decoration=[] - c_t_in_signature=[] - sumi=0 - 
for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - if "handwritten-annotation" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "decoration" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "stamp" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='stamp': - c_t_in_stamp.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "signature" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='signature': - c_t_in_signature.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - - - break - else: - pass - - - if vv.tag==link+'Point': - if "handwritten-annotation" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "decoration" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "stamp" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='stamp': - c_t_in_stamp.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "signature" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='signature': - c_t_in_signature.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if len(c_t_in_text_annotation)>0: - co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) - if len(c_t_in_decoration)>0: - co_graphic_decoration.append(np.array(c_t_in_decoration)) - if len(c_t_in_stamp)>0: - co_graphic_stamp.append(np.array(c_t_in_stamp)) - if len(c_t_in_signature)>0: - co_graphic_signature.append(np.array(c_t_in_signature)) - - if 'imageregion' in keys: - if tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - elif vv.tag!=link+'Point' and sumi>=1: - break - co_img.append(np.array(c_t_in)) - - - if 'separatorregion' in keys: - if tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - elif vv.tag!=link+'Point' and sumi>=1: - break - co_sep.append(np.array(c_t_in)) - - - - if 'tableregion' in keys: - if tag.endswith('}TableRegion') or 
tag.endswith('}tableregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_table.append(np.array(c_t_in)) - - if 'noiseregion' in keys: - if tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): - #print('sth') - for nn in root1.iter(tag): - c_t_in=[] - sumi=0 - for vv in nn.iter(): - # check the format of coords - if vv.tag==link+'Coords': - coords=bool(vv.attrib) - if coords: - p_h=vv.attrib['points'].split(' ') - c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break - else: - pass - - - if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - #print(vv.tag,'in') - elif vv.tag!=link+'Point' and sumi>=1: - break - co_noise.append(np.array(c_t_in)) - - if "artificial_class_on_boundry" in keys: - img_boundary = np.zeros( (y_len,x_len) ) - if "paragraph" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 - co_text_paragraph, img_boundary = self.update_region_contours(co_text_paragraph, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "drop-capital" in elements_with_artificial_class: - erosion_rate = 0 - dilation_rate = 4 - co_text_drop, img_boundary = self.update_region_contours(co_text_drop, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "catch-word" in elements_with_artificial_class: - erosion_rate = 0 - dilation_rate = 4 - co_text_catch, img_boundary = self.update_region_contours(co_text_catch, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "page-number" in elements_with_artificial_class: - erosion_rate = 0 - dilation_rate = 4 - co_text_page_number, img_boundary = self.update_region_contours(co_text_page_number, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "header" in elements_with_artificial_class: - erosion_rate = 1 - dilation_rate = 4 - co_text_header, img_boundary = self.update_region_contours(co_text_header, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "heading" in elements_with_artificial_class: - erosion_rate = 1 - dilation_rate = 4 - co_text_heading, img_boundary = self.update_region_contours(co_text_heading, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "signature-mark" in elements_with_artificial_class: - erosion_rate = 1 - dilation_rate = 4 - co_text_signature_mark, img_boundary = self.update_region_contours(co_text_signature_mark, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "marginalia" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 - co_text_marginalia, img_boundary = self.update_region_contours(co_text_marginalia, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "footnote" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 - co_text_footnote, img_boundary = self.update_region_contours(co_text_footnote, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - if "footnote-continued" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 - 
co_text_footnote_con, img_boundary = self.update_region_contours(co_text_footnote_con, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) - - - - img = np.zeros( (y_len,x_len,3) ) - - if self.output_type == '3d': - - if 'graphicregions' in keys: - if "handwritten-annotation" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=labels_rgb_color[ config_params['graphicregions']['handwritten-annotation']]) - if "signature" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=labels_rgb_color[ config_params['graphicregions']['signature']]) - if "decoration" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=labels_rgb_color[ config_params['graphicregions']['decoration']]) - if "stamp" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=labels_rgb_color[ config_params['graphicregions']['stamp']]) - - if 'imageregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']]) - if 'separatorregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_sep, color=labels_rgb_color[ config_params['separatorregion']]) - if 'tableregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_table, color=labels_rgb_color[ config_params['tableregion']]) - if 'noiseregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_noise, color=labels_rgb_color[ config_params['noiseregion']]) - - if 'textregions' in keys: - if "paragraph" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) - if "footnote" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=labels_rgb_color[ config_params['textregions']['footnote']]) - if "footnote-continued" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=labels_rgb_color[ config_params['textregions']['footnote-continued']]) - if "heading" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) - if "header" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_header, color=labels_rgb_color[ config_params['textregions']['header']]) - if "catch-word" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=labels_rgb_color[ config_params['textregions']['catch-word']]) - if "signature-mark" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=labels_rgb_color[ config_params['textregions']['signature-mark']]) - if "page-number" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=labels_rgb_color[ config_params['textregions']['page-number']]) - if "marginalia" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=labels_rgb_color[ config_params['textregions']['marginalia']]) - if "drop-capital" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=labels_rgb_color[ config_params['textregions']['drop-capital']]) - - if "artificial_class_on_boundry" in keys: - img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] - img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] - img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] - - - - - elif self.output_type == '2d': - if 'graphicregions' in keys: - if "handwritten-annotation" in types_graphic: - color_label = config_params['graphicregions']['handwritten-annotation'] - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, 
color=(color_label,color_label,color_label)) - if "signature" in types_graphic: - color_label = config_params['graphicregions']['signature'] - img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=(color_label,color_label,color_label)) - if "decoration" in types_graphic: - color_label = config_params['graphicregions']['decoration'] - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(color_label,color_label,color_label)) - if "stamp" in types_graphic: - color_label = config_params['graphicregions']['stamp'] - img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=(color_label,color_label,color_label)) - - if 'imageregion' in keys: - color_label = config_params['imageregion'] - img_poly=cv2.fillPoly(img, pts =co_img, color=(color_label,color_label,color_label)) - if 'separatorregion' in keys: - color_label = config_params['separatorregion'] - img_poly=cv2.fillPoly(img, pts =co_sep, color=(color_label,color_label,color_label)) - if 'tableregion' in keys: - color_label = config_params['tableregion'] - img_poly=cv2.fillPoly(img, pts =co_table, color=(color_label,color_label,color_label)) - if 'noiseregion' in keys: - color_label = config_params['noiseregion'] - img_poly=cv2.fillPoly(img, pts =co_noise, color=(color_label,color_label,color_label)) - - if 'textregions' in keys: - if "paragraph" in types_text: - color_label = config_params['textregions']['paragraph'] - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) - if "footnote" in types_text: - color_label = config_params['textregions']['footnote'] - img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=(color_label,color_label,color_label)) - if "footnote-continued" in types_text: - color_label = config_params['textregions']['footnote-continued'] - img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=(color_label,color_label,color_label)) - if "heading" in types_text: - color_label = config_params['textregions']['heading'] - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label)) - if "header" in types_text: - color_label = config_params['textregions']['header'] - img_poly=cv2.fillPoly(img, pts =co_text_header, color=(color_label,color_label,color_label)) - if "catch-word" in types_text: - color_label = config_params['textregions']['catch-word'] - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(color_label,color_label,color_label)) - if "signature-mark" in types_text: - color_label = config_params['textregions']['signature-mark'] - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(color_label,color_label,color_label)) - if "page-number" in types_text: - color_label = config_params['textregions']['page-number'] - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(color_label,color_label,color_label)) - if "marginalia" in types_text: - color_label = config_params['textregions']['marginalia'] - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(color_label,color_label,color_label)) - if "drop-capital" in types_text: - color_label = config_params['textregions']['drop-capital'] - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(color_label,color_label,color_label)) - - if "artificial_class_on_boundry" in keys: - img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label - - - - - try: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) - except: - cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.png',img_poly ) - - - def 
run(self,config_params):
-        self.get_content_of_dir()
-        self.get_images_of_ground_truth(config_params)
-
-
-@click.command()
-@click.option(
-    "--dir_xml",
-    "-dx",
-    help="directory of GT page-xml files",
-    type=click.Path(exists=True, file_okay=False),
-)
-@click.option(
-    "--dir_out",
-    "-do",
-    help="directory where ground truth images would be written",
-    type=click.Path(exists=True, file_okay=False),
-)
-
-@click.option(
-    "--config",
-    "-cfg",
-    help="config file of prefered layout or use case.",
-    type=click.Path(exists=True, dir_okay=False),
-)
-
-@click.option(
-    "--type_output",
-    "-to",
-    help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.",
-)
-
-
-def main(dir_xml,dir_out,type_output,config):
-    if config:
-        with open(config) as f:
-            config_params = json.load(f)
-    else:
-        print("passed")
-        config_params = None
-    x=pagexml2label(dir_xml,dir_out,type_output, config)
-    x.run(config_params)
-if __name__=="__main__":
-    main()
-
-
-

From 4e4490d740b7063ce3324a3a51a2f50a2963f01d Mon Sep 17 00:00:00 2001
From: vahidrezanezhad
Date: Fri, 24 May 2024 16:39:48 +0200
Subject: [PATCH 17/62] machine based reading order training is integrated

---
 models.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 train.py  | 31 +++++++++++++++++++++++++++++++
 utils.py  | 23 +++++++++++++++++++++++
 3 files changed, 109 insertions(+)

diff --git a/models.py b/models.py
index 4cceacd..d852ac3 100644
--- a/models.py
+++ b/models.py
@@ -544,4 +544,59 @@ def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=
+
+
+
+    return model
+
+def machine_based_reading_order_model(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False):
+    assert input_height%32 == 0
+    assert input_width%32 == 0
+
+    img_input = Input(shape=(input_height,input_width , 3 ))
+
+    if IMAGE_ORDERING == 'channels_last':
+        bn_axis = 3
+    else:
+        bn_axis = 1
+
+    x1 = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input)
+    x1 = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x1)
+
+    x1 = BatchNormalization(axis=bn_axis, name='bn_conv1')(x1)
+    x1 = Activation('relu')(x1)
+    x1 = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x1)
+
+    x1 = conv_block(x1, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
+    x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='b')
+    x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='c')
+
+    x1 = conv_block(x1, 3, [128, 128, 512], stage=3, block='a')
+    x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='b')
+    x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='c')
+    x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='d')
+
+    x1 = conv_block(x1, 3, [256, 256, 1024], stage=4, block='a')
+    x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='b')
+    x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='c')
+    x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='d')
+    x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='e')
+    x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='f')
+
+    x1 = conv_block(x1, 3, [512, 512, 2048], stage=5, block='a')
+    x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='b')
+    x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='c')
+
+    if pretraining:
+        Model(img_input , x1).load_weights(resnet50_Weights_path)
+
+    x1 = AveragePooling2D((7, 7), name='avg_pool1')(x1)
+    flattened = Flatten()(x1)
+
+    o = Dense(256, activation='relu', name='fc512')(flattened)
+    o=Dropout(0.2)(o)
+
+    o = Dense(256, activation='relu', name='fc512a')(o)
+    o=Dropout(0.2)(o)
+
+    o = Dense(n_classes, activation='sigmoid', name='fc1000')(o)
+    model = Model(img_input , o)
+    return model
diff --git a/train.py b/train.py
index 78974d3..f338c78 100644
--- a/train.py
+++ b/train.py
@@ -313,4 +313,35 @@ def run(_config, n_classes, n_epochs, input_height,
         with open(os.path.join( os.path.join(dir_output,'model_best'), "config.json"), "w") as fp:
             json.dump(_config, fp) # encode dict into JSON


+
+    elif task=='reading_order':
+        configuration()
+        model = machine_based_reading_order_model(n_classes,input_height,input_width,weight_decay,pretraining)
+
+        dir_flow_train_imgs = os.path.join(dir_train, 'images')
+        dir_flow_train_labels = os.path.join(dir_train, 'labels')
+
+        classes = os.listdir(dir_flow_train_labels)
+        num_rows =len(classes)
+        #ls_test = os.listdir(dir_flow_train_labels)
+
+        #f1score_tot = [0]
+        indexer_start = 0
+        opt = SGD(lr=0.01, momentum=0.9)
+        opt_adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
+        model.compile(loss="binary_crossentropy",
+                      optimizer = opt_adam,metrics=['accuracy'])
+        for i in range(n_epochs):
+            history = model.fit(generate_arrays_from_folder_reading_order(dir_flow_train_labels, dir_flow_train_imgs, n_batch, input_height, input_width, n_classes), steps_per_epoch=num_rows / n_batch, verbose=1)
+            model.save( os.path.join(dir_output,'model_'+str(i+indexer_start) ))
+
+            with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp:
+                json.dump(_config, fp) # encode dict into JSON
+        '''
+        if f1score>f1score_tot[0]:
+            f1score_tot[0] = f1score
+            model_dir = os.path.join(dir_out,'model_best')
+            model.save(model_dir)
+        '''
+
diff --git a/utils.py b/utils.py
index 271d977..a2e8a9c 100644
--- a/utils.py
+++ b/utils.py
@@ -268,6 +268,29 @@ def IoU(Yi, y_predi):
     #print("Mean IoU: {:4.3f}".format(mIoU))
     return mIoU

+def generate_arrays_from_folder_reading_order(classes_file_dir, modal_dir, batchsize, height, width, n_classes):
+    all_labels_files = os.listdir(classes_file_dir)
+    ret_x= np.zeros((batchsize, height, width, 3))#.astype(np.int16)
+    ret_y= np.zeros((batchsize, n_classes)).astype(np.int16)
+    batchcount = 0
+    while True:
+        for i in all_labels_files:
+            file_name = i.split('.')[0]
+            img = cv2.imread(os.path.join(modal_dir,file_name+'.png'))
+
+            label_class = int( np.load(os.path.join(classes_file_dir,i)) )
+
+            ret_x[batchcount, :,:,0] = img[:,:,0]/3.0
+            ret_x[batchcount, :,:,2] = img[:,:,2]/3.0
+            ret_x[batchcount, :,:,1] = img[:,:,1]/5.0
+
+            ret_y[batchcount, :] = label_class
+            batchcount+=1
+            if batchcount>=batchsize:
+                yield (ret_x, ret_y)
+                ret_x= np.zeros((batchsize, height, width, 3))#.astype(np.int16)
+                ret_y= np.zeros((batchsize, n_classes)).astype(np.int16)
+                batchcount = 0

 def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_classes, task='segmentation'):
     c = 0

From 5aa6ee001042980eb55ef3a41ee46349ca85d42b Mon Sep 17 00:00:00 2001
From: vahidrezanezhad
Date: Mon, 27 May 2024 17:23:49 +0200
Subject: [PATCH 18/62] adding rest_as_paragraph and rest_as_graphic to elements

---
 custom_config_page2label.json |  10 +-
 gt_gen_utils.py               | 454 ++++++++++++----------------------
 2 files changed, 170 insertions(+), 294 deletions(-)

diff --git a/custom_config_page2label.json
b/custom_config_page2label.json index d6320fa..e4c02cb 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,9 +1,9 @@ { "use_case": "layout", -"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginalia":4 ,"page-number":1 , "catch-word":1 ,"footnote": 1, "footnote-continued": 1}, -"imageregion":5, -"separatorregion":6, -"graphicregions" :{"handwritten-annotation":5, "decoration": 5, "signature": 5, "stamp": 5}, -"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital","footnote", "footnote-continued"], +"textregions":{ "rest_as_paragraph": 1, "header":2 , "heading":2 , "marginalia":3 }, +"imageregion":4, +"separatorregion":5, +"graphicregions" :{"rest_as_decoration":6}, +"artificial_class_on_boundry": ["paragraph"], "artificial_class_label":7 } diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 9862e29..9dc8377 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -180,7 +180,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ pass if vv.tag == link + 'Point': - c_t_in.append([int(np.float(vv.attrib['x'])), int(np.float(vv.attrib['y']))]) + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) sumi += 1 elif vv.tag != link + 'Point' and sumi >= 1: break @@ -226,7 +226,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ types_text_dict = config_params['textregions'] types_text = list(types_text_dict.keys()) types_text_label = list(types_text_dict.values()) - print(types_text) if 'graphicregions' in keys: types_graphic_dict = config_params['graphicregions'] types_graphic = list(types_graphic_dict.keys()) @@ -235,41 +234,20 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)] + region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - - co_text_paragraph=[] - co_text_footnote=[] - co_text_footnote_con=[] - co_text_drop=[] - co_text_heading=[] - co_text_header=[] - co_text_marginalia=[] - co_text_catch=[] - co_text_page_number=[] - co_text_signature_mark=[] + co_text = {'drop-capital':[], "footnote":[], "footnote-continued":[], "heading":[], "signature-mark":[], "header":[], "catch-word":[], "page-number":[], "marginalia":[], "paragraph":[]} + co_graphic = {"handwritten-annotation":[], "decoration":[], "stamp":[], "signature":[]} co_sep=[] co_img=[] co_table=[] - co_graphic_signature=[] - co_graphic_text_annotation=[] - co_graphic_decoration=[] - co_graphic_stamp=[] co_noise=[] for tag in region_tags: if 'textregions' in keys: if tag.endswith('}TextRegion') or tag.endswith('}Textregion'): for nn in root1.iter(tag): - c_t_in_drop=[] - c_t_in_paragraph=[] - c_t_in_heading=[] - c_t_in_header=[] - c_t_in_page_number=[] - c_t_in_signature_mark=[] - c_t_in_catch=[] - c_t_in_marginalia=[] - c_t_in_footnote=[] - c_t_in_footnote_con=[] + c_t_in = {'drop-capital':[], "footnote":[], "footnote-continued":[], "heading":[], "signature-mark":[], "header":[], "catch-word":[], "page-number":[], "marginalia":[], "paragraph":[]} sumi=0 for vv in nn.iter(): # check the format of coords @@ -277,143 +255,63 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ 
coords=bool(vv.attrib) if coords: - #print('birda1') p_h=vv.attrib['points'].split(' ') - if "drop-capital" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "footnote" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote': - c_t_in_footnote.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "footnote-continued" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': - c_t_in_footnote_con.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "heading" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "signature-mark" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='signature-mark': - c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "header" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "catch-word" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "page-number" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "marginalia" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='marginalia': - c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "paragraph" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='paragraph': - c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - + if "rest_as_paragraph" in types_text: + types_text_without_paragraph = [element for element in types_text if element!='rest_as_paragraph' and element!='paragraph'] + if len(types_text_without_paragraph) == 0: + if "type" in nn.attrib: + c_t_in['paragraph'].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + elif len(types_text_without_paragraph) >= 1: + if "type" in nn.attrib: + if nn.attrib['type'] in types_text_without_paragraph: + c_t_in[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + c_t_in['paragraph'].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + else: + if "type" in nn.attrib: + c_t_in[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) break else: pass - + if vv.tag==link+'Point': - if "drop-capital" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "footnote" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote': - c_t_in_footnote.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "footnote-continued" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': - c_t_in_footnote_con.append([ 
int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "heading" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='heading': - c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "signature-mark" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='signature-mark': - c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "header" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='header': - c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "catch-word" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "page-number" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "marginalia" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='marginalia': - c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "paragraph" in types_text: - if "type" in nn.attrib and nn.attrib['type']=='paragraph': - c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + if "rest_as_paragraph" in types_text: + types_text_without_paragraph = [element for element in types_text if element!='rest_as_paragraph' and element!='paragraph'] + if len(types_text_without_paragraph) == 0: + if "type" in nn.attrib: + c_t_in['paragraph'].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + elif len(types_text_without_paragraph) >= 1: + if "type" in nn.attrib: + if nn.attrib['type'] in types_text_without_paragraph: + c_t_in[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + else: + c_t_in['paragraph'].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + + else: + if "type" in nn.attrib: + c_t_in[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) sumi+=1 - + elif vv.tag!=link+'Point' and sumi>=1: break - if len(c_t_in_drop)>0: - co_text_drop.append(np.array(c_t_in_drop)) - if len(c_t_in_footnote_con)>0: - co_text_footnote_con.append(np.array(c_t_in_footnote_con)) - if len(c_t_in_footnote)>0: - co_text_footnote.append(np.array(c_t_in_footnote)) - if len(c_t_in_paragraph)>0: - co_text_paragraph.append(np.array(c_t_in_paragraph)) - if len(c_t_in_heading)>0: - co_text_heading.append(np.array(c_t_in_heading)) - - if len(c_t_in_header)>0: - co_text_header.append(np.array(c_t_in_header)) - if len(c_t_in_page_number)>0: - co_text_page_number.append(np.array(c_t_in_page_number)) - if len(c_t_in_catch)>0: - co_text_catch.append(np.array(c_t_in_catch)) - - if len(c_t_in_signature_mark)>0: - co_text_signature_mark.append(np.array(c_t_in_signature_mark)) - - if len(c_t_in_marginalia)>0: - co_text_marginalia.append(np.array(c_t_in_marginalia)) - - + for element_text in list(c_t_in.keys()): + if len(c_t_in[element_text])>0: + co_text[element_text].append(np.array(c_t_in[element_text])) + if 'graphicregions' in keys: if tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): #print('sth') for nn in root1.iter(tag): - c_t_in_stamp=[] - c_t_in_text_annotation=[] - c_t_in_decoration=[] - c_t_in_signature=[] 
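Note on the hunk above: the per-type lists are folded into one dict per region family, and a "rest_as_paragraph" (or "rest_as_decoration") entry in the config acts as a catch-all for any region type that is not listed explicitly. A minimal standalone sketch of that mapping rule, assuming a config shaped like the new custom_config_page2label.json (the helper name is illustrative, not part of the repo):

def resolve_text_label(region_type, textregions_config):
    # explicitly configured types keep their own class
    if region_type in textregions_config:
        return textregions_config[region_type]
    # everything else falls back to the catch-all class, if one is configured
    if 'rest_as_paragraph' in textregions_config:
        return textregions_config['rest_as_paragraph']
    return None  # this region type is not labelled at all

# with {"rest_as_paragraph": 1, "header": 2, "heading": 2, "marginalia": 3}:
# 'header' -> 2, 'marginalia' -> 3, 'footnote' -> 1, 'paragraph' -> 1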
+ c_t_in_graphic = {"handwritten-annotation":[], "decoration":[], "stamp":[], "signature":[]} sumi=0 for vv in nn.iter(): # check the format of coords @@ -421,23 +319,22 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ coords=bool(vv.attrib) if coords: p_h=vv.attrib['points'].split(' ') - if "handwritten-annotation" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "decoration" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "stamp" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='stamp': - c_t_in_stamp.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - - if "signature" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='signature': - c_t_in_signature.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - + if "rest_as_decoration" in types_graphic: + types_graphic_without_decoration = [element for element in types_graphic if element!='rest_as_decoration' and element!='decoration'] + if len(types_graphic_without_decoration) == 0: + if "type" in nn.attrib: + c_t_in_graphic['decoration'].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + elif len(types_graphic_without_decoration) >= 1: + if "type" in nn.attrib: + if nn.attrib['type'] in types_graphic_without_decoration: + c_t_in_graphic[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + else: + c_t_in_graphic['decoration'].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + else: + if "type" in nn.attrib: + c_t_in_graphic[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) break else: @@ -445,34 +342,33 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if vv.tag==link+'Point': - if "handwritten-annotation" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + if "rest_as_decoration" in types_graphic: + types_graphic_without_decoration = [element for element in types_graphic if element!='rest_as_decoration' and element!='decoration'] + if len(types_graphic_without_decoration) == 0: + if "type" in nn.attrib: + c_t_in_graphic['decoration'].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + elif len(types_graphic_without_decoration) >= 1: + if "type" in nn.attrib: + if nn.attrib['type'] in types_graphic_without_decoration: + c_t_in_graphic[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + else: + c_t_in_graphic['decoration'].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 + + else: + if "type" in nn.attrib: + c_t_in_graphic[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) sumi+=1 - if "decoration" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "stamp" in types_graphic: 
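For orientation, the Coords points collected into these dicts are plain "x,y x,y ..." strings that are later rasterised into the label image with OpenCV. A condensed, self-contained sketch of both steps, with an illustrative helper name, image size, coordinates and class value:

import numpy as np
import cv2

def points_attr_to_polygon(points_attr):
    # PAGE-XML stores a polygon as "x1,y1 x2,y2 ...", one pair per vertex
    return np.array([[int(p.split(',')[0]), int(p.split(',')[1])] for p in points_attr.split(' ')], dtype=np.int32)

label_img = np.zeros((100, 100, 3))                    # stands in for the (y_len, x_len, 3) page canvas
poly = points_attr_to_polygon("10,10 60,10 60,40 10,40")
label_img = cv2.fillPoly(label_img, pts=[poly], color=(6, 6, 6))  # e.g. class 6 = rest_as_decoration in the 2d output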
- if "type" in nn.attrib and nn.attrib['type']=='stamp': - c_t_in_stamp.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 - - if "signature" in types_graphic: - if "type" in nn.attrib and nn.attrib['type']=='signature': - c_t_in_signature.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) - sumi+=1 + elif vv.tag!=link+'Point' and sumi>=1: + break + + for element_graphic in list(c_t_in_graphic.keys()): + if len(c_t_in_graphic[element_graphic])>0: + co_graphic[element_graphic].append(np.array(c_t_in_graphic[element_graphic])) - if len(c_t_in_text_annotation)>0: - co_graphic_text_annotation.append(np.array(c_t_in_text_annotation)) - if len(c_t_in_decoration)>0: - co_graphic_decoration.append(np.array(c_t_in_decoration)) - if len(c_t_in_stamp)>0: - co_graphic_stamp.append(np.array(c_t_in_stamp)) - if len(c_t_in_signature)>0: - co_graphic_signature.append(np.array(c_t_in_signature)) if 'imageregion' in keys: if tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): @@ -491,7 +387,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif vv.tag!=link+'Point' and sumi>=1: @@ -517,7 +413,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif vv.tag!=link+'Point' and sumi>=1: @@ -545,7 +441,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -571,7 +467,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -583,59 +479,63 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "paragraph" in elements_with_artificial_class: erosion_rate = 2 dilation_rate = 4 - co_text_paragraph, img_boundary = update_region_contours(co_text_paragraph, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text['paragraph'], img_boundary = update_region_contours(co_text['paragraph'], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "drop-capital" in elements_with_artificial_class: erosion_rate = 0 dilation_rate = 4 - co_text_drop, img_boundary = update_region_contours(co_text_drop, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["drop-capital"], img_boundary = update_region_contours(co_text["drop-capital"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "catch-word" in elements_with_artificial_class: erosion_rate = 0 dilation_rate = 4 - co_text_catch, img_boundary = update_region_contours(co_text_catch, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["catch-word"], img_boundary = update_region_contours(co_text["catch-word"], img_boundary, 
erosion_rate, dilation_rate, y_len, x_len ) if "page-number" in elements_with_artificial_class: erosion_rate = 0 dilation_rate = 4 - co_text_page_number, img_boundary = update_region_contours(co_text_page_number, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["page-number"], img_boundary = update_region_contours(co_text["page-number"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "header" in elements_with_artificial_class: erosion_rate = 1 dilation_rate = 4 - co_text_header, img_boundary = update_region_contours(co_text_header, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["header"], img_boundary = update_region_contours(co_text["header"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "heading" in elements_with_artificial_class: erosion_rate = 1 dilation_rate = 4 - co_text_heading, img_boundary = update_region_contours(co_text_heading, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["heading"], img_boundary = update_region_contours(co_text["heading"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "signature-mark" in elements_with_artificial_class: erosion_rate = 1 dilation_rate = 4 - co_text_signature_mark, img_boundary = update_region_contours(co_text_signature_mark, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["signature-mark"], img_boundary = update_region_contours(co_text["signature-mark"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "marginalia" in elements_with_artificial_class: erosion_rate = 2 dilation_rate = 4 - co_text_marginalia, img_boundary = update_region_contours(co_text_marginalia, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["marginalia"], img_boundary = update_region_contours(co_text["marginalia"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "footnote" in elements_with_artificial_class: erosion_rate = 2 dilation_rate = 4 - co_text_footnote, img_boundary = update_region_contours(co_text_footnote, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["footnote"], img_boundary = update_region_contours(co_text["footnote"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "footnote-continued" in elements_with_artificial_class: erosion_rate = 2 dilation_rate = 4 - co_text_footnote_con, img_boundary = update_region_contours(co_text_footnote_con, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + co_text["footnote-continued"], img_boundary = update_region_contours(co_text["footnote-continued"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) img = np.zeros( (y_len,x_len,3) ) if output_type == '3d': - if 'graphicregions' in keys: - if "handwritten-annotation" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=labels_rgb_color[ config_params['graphicregions']['handwritten-annotation']]) - if "signature" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=labels_rgb_color[ config_params['graphicregions']['signature']]) - if "decoration" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=labels_rgb_color[ config_params['graphicregions']['decoration']]) - if "stamp" in types_graphic: - img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=labels_rgb_color[ config_params['graphicregions']['stamp']]) + if 'rest_as_decoration' in types_graphic: + types_graphic[types_graphic=='rest_as_decoration'] = 'decoration' + for element_graphic in types_graphic: + if 
element_graphic == 'decoration': + color_label = labels_rgb_color[ config_params['graphicregions']['rest_as_decoration']] + else: + color_label = labels_rgb_color[ config_params['graphicregions'][element_graphic]] + img_poly=cv2.fillPoly(img, pts =co_graphic[element_graphic], color=color_label) + else: + for element_graphic in types_graphic: + color_label = labels_rgb_color[ config_params['graphicregions'][element_graphic]] + img_poly=cv2.fillPoly(img, pts =co_graphic[element_graphic], color=color_label) + if 'imageregion' in keys: img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']]) @@ -647,26 +547,19 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly=cv2.fillPoly(img, pts =co_noise, color=labels_rgb_color[ config_params['noiseregion']]) if 'textregions' in keys: - if "paragraph" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) - if "footnote" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=labels_rgb_color[ config_params['textregions']['footnote']]) - if "footnote-continued" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=labels_rgb_color[ config_params['textregions']['footnote-continued']]) - if "heading" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) - if "header" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_header, color=labels_rgb_color[ config_params['textregions']['header']]) - if "catch-word" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=labels_rgb_color[ config_params['textregions']['catch-word']]) - if "signature-mark" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=labels_rgb_color[ config_params['textregions']['signature-mark']]) - if "page-number" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=labels_rgb_color[ config_params['textregions']['page-number']]) - if "marginalia" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=labels_rgb_color[ config_params['textregions']['marginalia']]) - if "drop-capital" in types_text: - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=labels_rgb_color[ config_params['textregions']['drop-capital']]) + if 'rest_as_paragraph' in types_text: + types_text[types_text=='rest_as_paragraph'] = 'paragraph' + for element_text in types_text: + if element_text == 'paragraph': + color_label = labels_rgb_color[ config_params['textregions']['rest_as_paragraph']] + else: + color_label = labels_rgb_color[ config_params['textregions'][element_text]] + img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) + else: + for element_text in types_text: + color_label = labels_rgb_color[ config_params['textregions'][element_text]] + img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) + if "artificial_class_on_boundry" in keys: img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] @@ -678,18 +571,19 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ elif output_type == '2d': if 'graphicregions' in keys: - if "handwritten-annotation" in types_graphic: - color_label = config_params['graphicregions']['handwritten-annotation'] - img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(color_label,color_label,color_label)) - if "signature" in 
types_graphic: - color_label = config_params['graphicregions']['signature'] - img_poly=cv2.fillPoly(img, pts =co_graphic_signature, color=(color_label,color_label,color_label)) - if "decoration" in types_graphic: - color_label = config_params['graphicregions']['decoration'] - img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(color_label,color_label,color_label)) - if "stamp" in types_graphic: - color_label = config_params['graphicregions']['stamp'] - img_poly=cv2.fillPoly(img, pts =co_graphic_stamp, color=(color_label,color_label,color_label)) + if 'rest_as_decoration' in types_graphic: + types_graphic[types_graphic=='rest_as_decoration'] = 'decoration' + for element_graphic in types_graphic: + if element_graphic == 'decoration': + color_label = config_params['graphicregions']['rest_as_decoration'] + else: + color_label = config_params['graphicregions'][element_graphic] + img_poly=cv2.fillPoly(img, pts =co_graphic[element_graphic], color=color_label) + else: + for element_graphic in types_graphic: + color_label = config_params['graphicregions'][element_graphic] + img_poly=cv2.fillPoly(img, pts =co_graphic[element_graphic], color=color_label) + if 'imageregion' in keys: color_label = config_params['imageregion'] @@ -705,36 +599,18 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly=cv2.fillPoly(img, pts =co_noise, color=(color_label,color_label,color_label)) if 'textregions' in keys: - if "paragraph" in types_text: - color_label = config_params['textregions']['paragraph'] - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) - if "footnote" in types_text: - color_label = config_params['textregions']['footnote'] - img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=(color_label,color_label,color_label)) - if "footnote-continued" in types_text: - color_label = config_params['textregions']['footnote-continued'] - img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=(color_label,color_label,color_label)) - if "heading" in types_text: - color_label = config_params['textregions']['heading'] - img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label)) - if "header" in types_text: - color_label = config_params['textregions']['header'] - img_poly=cv2.fillPoly(img, pts =co_text_header, color=(color_label,color_label,color_label)) - if "catch-word" in types_text: - color_label = config_params['textregions']['catch-word'] - img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(color_label,color_label,color_label)) - if "signature-mark" in types_text: - color_label = config_params['textregions']['signature-mark'] - img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(color_label,color_label,color_label)) - if "page-number" in types_text: - color_label = config_params['textregions']['page-number'] - img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(color_label,color_label,color_label)) - if "marginalia" in types_text: - color_label = config_params['textregions']['marginalia'] - img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(color_label,color_label,color_label)) - if "drop-capital" in types_text: - color_label = config_params['textregions']['drop-capital'] - img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(color_label,color_label,color_label)) + if 'rest_as_paragraph' in types_text: + types_text[types_text=='rest_as_paragraph'] = 'paragraph' + for element_text in types_text: + if element_text == 'paragraph': + color_label = 
config_params['textregions']['rest_as_paragraph'] + else: + color_label = config_params['textregions'][element_text] + img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) + else: + for element_text in types_text: + color_label = config_params['textregions'][element_text] + img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) if "artificial_class_on_boundry" in keys: img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label @@ -947,51 +823,51 @@ def read_xml(xml_file): if "type" in nn.attrib and nn.attrib['type']=='drop-capital': #if nn.attrib['type']=='paragraph': - c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_drop.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='heading': id_heading.append(nn.attrib['id']) - c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_heading.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': - c_t_in_signature_mark.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) #print(c_t_in_paragraph) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='header': id_header.append(nn.attrib['id']) - c_t_in_header.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_header.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) #print(c_t_in_paragraph) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='marginalia': id_marginalia.append(nn.attrib['id']) - c_t_in_marginalia.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) #print(c_t_in_paragraph) sumi+=1 else: id_paragraph.append(nn.attrib['id']) - c_t_in_paragraph.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) #print(c_t_in_paragraph) sumi+=1 - #c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + #c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -1057,16 +933,16 @@ def read_xml(xml_file): if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': #if nn.attrib['type']=='paragraph': - c_t_in_text_annotation.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_text_annotation.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in_decoration.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) 
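A brief note on the recurring np.float -> float change in these hunks: np.float was only an alias for the Python builtin, was deprecated in NumPy 1.20 and removed in NumPy 1.24, so int(np.float(v)) raises AttributeError on current NumPy while int(float(v)) behaves the same way. A tiny illustration with made-up attribute values:

x_attr, y_attr = "12.0", "34.5"                    # PAGE-XML Point attributes arrive as strings
point = [int(float(x_attr)), int(float(y_attr))]   # -> [12, 34], no NumPy involved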
#print(c_t_in_paragraph) sumi+=1 else: - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 if len(c_t_in_text_annotation)>0: @@ -1096,7 +972,7 @@ def read_xml(xml_file): if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -1123,7 +999,7 @@ def read_xml(xml_file): if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -1150,7 +1026,7 @@ def read_xml(xml_file): if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: @@ -1176,7 +1052,7 @@ def read_xml(xml_file): if vv.tag==link+'Point': - c_t_in.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: From 29ddd4d909f819c68730e2d162664089d377ffd6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 28 May 2024 10:01:17 +0200 Subject: [PATCH 19/62] pass degrading scales for image enhancement as a json file --- generate_gt_for_training.py | 16 ++++++++++------ scales_enhancement.json | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 scales_enhancement.json diff --git a/generate_gt_for_training.py b/generate_gt_for_training.py index e296029..2a2a776 100644 --- a/generate_gt_for_training.py +++ b/generate_gt_for_training.py @@ -64,13 +64,17 @@ def pagexml2label(dir_xml,dir_out,type_output,config): help="directory where original images will be written as labels.", type=click.Path(exists=True, file_okay=False), ) -def image_enhancement(dir_imgs, dir_out_images, dir_out_labels): - #dir_imgs = './training_data_sample_enhancement/images' - #dir_out_images = './training_data_sample_enhancement/images_gt' - #dir_out_labels = './training_data_sample_enhancement/labels_gt' - +@click.option( + "--scales", + "-scs", + help="json dictionary where the scales are written.", + type=click.Path(exists=True, dir_okay=False), +) +def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): ls_imgs = os.listdir(dir_imgs) - ls_scales = [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] + with open(scales) as f: + scale_dict = json.load(f) + ls_scales = scale_dict['scales'] for img in tqdm(ls_imgs): img_name = img.split('.')[0] diff --git a/scales_enhancement.json b/scales_enhancement.json new file mode 100644 index 0000000..58034f0 --- /dev/null +++ b/scales_enhancement.json @@ -0,0 +1,3 @@ +{ + "scales" : [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] +} From 356da4cc53add49306aabbcb3177a46266bd90c1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 28 May 2024 10:14:16 +0200 Subject: [PATCH 20/62] min area size of text region passes as an argument for machine based reading order --- generate_gt_for_training.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/generate_gt_for_training.py b/generate_gt_for_training.py 
index 2a2a776..cf2b2a6 100644 --- a/generate_gt_for_training.py +++ b/generate_gt_for_training.py @@ -116,22 +116,28 @@ def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): @click.option( "--input_height", "-ih", - help="input_height", + help="input height", ) @click.option( "--input_width", "-iw", - help="input_width", + help="input width", +) +@click.option( + "--min_area_size", + "-min", + help="min area size of regions considered for reading order training.", ) -def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width): +def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size): xml_files_ind = os.listdir(dir_xml) input_height = int(input_height) input_width = int(input_width) + min_area = float(min_area_size) indexer_start= 0#55166 max_area = 1 - min_area = 0.0001 + #min_area = 0.0001 for ind_xml in tqdm(xml_files_ind): indexer = 0 From 2e7c69f2ac4d3c1aa68498fae409e8d6f25ebf8b Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 28 May 2024 16:48:51 +0200 Subject: [PATCH 21/62] inference for reading order --- gt_gen_utils.py | 136 +++++++++++---------------------- inference.py | 196 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 228 insertions(+), 104 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 9dc8377..0286ac7 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -38,11 +38,8 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m polygon = geometry.Polygon([point[0] for point in c]) # area = cv2.contourArea(c) area = polygon.area - ##print(np.prod(thresh.shape[:2])) # Check that polygon has area greater than minimal area - # print(hierarchy[0][jv][3],hierarchy ) if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : - # print(c[0][0][1]) found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.int32)) jv += 1 return found_polygons_early @@ -52,15 +49,12 @@ def filter_contours_area_of_image(image, contours, order_index, max_area, min_ar order_index_filtered = list() #jv = 0 for jv, c in enumerate(contours): - #print(len(c[0])) c = c[0] if len(c) < 3: # A polygon cannot have less than 3 points continue c_e = [point for point in c] - #print(c_e) polygon = geometry.Polygon(c_e) area = polygon.area - #print(area,'area') if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]): # and hierarchy[0][jv][3]==-1 : found_polygons_early.append(np.array([[point] for point in polygon.exterior.coords], dtype=np.uint)) order_index_filtered.append(order_index[jv]) @@ -88,12 +82,8 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002): def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len): co_text_eroded = [] for con in co_text: - #try: img_boundary_in = np.zeros( (y_len,x_len) ) img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) - #print('bidiahhhhaaa') - - #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica if erosion_rate > 0: @@ -626,8 +616,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ def find_new_features_of_contours(contours_main): - - #print(contours_main[0][0][:, 0]) areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) M_main = 
[cv2.moments(contours_main[j]) for j in range(len(contours_main))] @@ -658,8 +646,6 @@ def find_new_features_of_contours(contours_main): y_min_main = np.array([np.min(contours_main[j][:, 1]) for j in range(len(contours_main))]) y_max_main = np.array([np.max(contours_main[j][:, 1]) for j in range(len(contours_main))]) - # dis_x=np.abs(x_max_main-x_min_main) - return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin def read_xml(xml_file): file_name = Path(xml_file).stem @@ -675,13 +661,11 @@ def read_xml(xml_file): y_len=int(jj.attrib['imageHeight']) x_len=int(jj.attrib['imageWidth']) - for jj in root1.iter(link+'RegionRefIndexed'): index_tot_regions.append(jj.attrib['index']) tot_region_ref.append(jj.attrib['regionRef']) region_tags=np.unique([x for x in alltags if x.endswith('Region')]) - #print(region_tags) co_text_paragraph=[] co_text_drop=[] co_text_heading=[] @@ -698,7 +682,6 @@ def read_xml(xml_file): co_graphic_decoration=[] co_noise=[] - co_text_paragraph_text=[] co_text_drop_text=[] co_text_heading_text=[] @@ -715,7 +698,6 @@ def read_xml(xml_file): co_graphic_decoration_text=[] co_noise_text=[] - id_paragraph = [] id_header = [] id_heading = [] @@ -726,14 +708,8 @@ def read_xml(xml_file): for nn in root1.iter(tag): for child2 in nn: tag2 = child2.tag - #print(child2.tag) if tag2.endswith('}TextEquiv') or tag2.endswith('}TextEquiv'): - #children2 = childtext.getchildren() - #rank = child2.find('Unicode').text for childtext2 in child2: - #rank = childtext2.find('Unicode').text - #if childtext2.tag.endswith('}PlainText') or childtext2.tag.endswith('}PlainText'): - #print(childtext2.text) if childtext2.tag.endswith('}Unicode') or childtext2.tag.endswith('}Unicode'): if "type" in nn.attrib and nn.attrib['type']=='drop-capital': co_text_drop_text.append(childtext2.text) @@ -743,10 +719,10 @@ def read_xml(xml_file): co_text_signature_mark_text.append(childtext2.text) elif "type" in nn.attrib and nn.attrib['type']=='header': co_text_header_text.append(childtext2.text) - elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - co_text_catch_text.append(childtext2.text) - elif "type" in nn.attrib and nn.attrib['type']=='page-number': - co_text_page_number_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###co_text_catch_text.append(childtext2.text) + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': + ###co_text_page_number_text.append(childtext2.text) elif "type" in nn.attrib and nn.attrib['type']=='marginalia': co_text_marginalia_text.append(childtext2.text) else: @@ -774,7 +750,6 @@ def read_xml(xml_file): if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - #if nn.attrib['type']=='paragraph': c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) @@ -792,27 +767,22 @@ def read_xml(xml_file): c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - elif "type" in nn.attrib and nn.attrib['type']=='page-number': + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': - c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , 
int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) + ###c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) elif "type" in nn.attrib and nn.attrib['type']=='marginalia': id_marginalia.append(nn.attrib['id']) c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) else: - #print(nn.attrib['id']) - id_paragraph.append(nn.attrib['id']) c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) break else: @@ -821,7 +791,6 @@ def read_xml(xml_file): if vv.tag==link+'Point': if "type" in nn.attrib and nn.attrib['type']=='drop-capital': - #if nn.attrib['type']=='paragraph': c_t_in_drop.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 @@ -835,7 +804,6 @@ def read_xml(xml_file): elif "type" in nn.attrib and nn.attrib['type']=='signature-mark': c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='header': id_header.append(nn.attrib['id']) @@ -843,33 +811,26 @@ def read_xml(xml_file): sumi+=1 - elif "type" in nn.attrib and nn.attrib['type']=='catch-word': - c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - sumi+=1 + ###elif "type" in nn.attrib and nn.attrib['type']=='catch-word': + ###c_t_in_catch.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 + ###elif "type" in nn.attrib and nn.attrib['type']=='page-number': - elif "type" in nn.attrib and nn.attrib['type']=='page-number': - - c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) - sumi+=1 + ###c_t_in_page_number.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) + ###sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='marginalia': id_marginalia.append(nn.attrib['id']) c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) sumi+=1 else: id_paragraph.append(nn.attrib['id']) c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) sumi+=1 - #c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - - #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: break @@ -895,7 +856,6 @@ def read_xml(xml_file): elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): - #print('sth') for nn in root1.iter(tag): c_t_in=[] c_t_in_text_annotation=[] @@ -907,40 +867,31 @@ def read_xml(xml_file): coords=bool(vv.attrib) if coords: p_h=vv.attrib['points'].split(' ') - #c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - #if nn.attrib['type']=='paragraph': - c_t_in_text_annotation.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - + elif "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - #print(c_t_in_paragraph) + else: c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) - break else: pass if vv.tag==link+'Point': - if "type" in nn.attrib and nn.attrib['type']=='handwritten-annotation': - #if 
nn.attrib['type']=='paragraph': - c_t_in_text_annotation.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='decoration': - c_t_in_decoration.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) - #print(c_t_in_paragraph) sumi+=1 + else: c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 @@ -955,7 +906,6 @@ def read_xml(xml_file): elif tag.endswith('}ImageRegion') or tag.endswith('}imageregion'): - #print('sth') for nn in root1.iter(tag): c_t_in=[] sumi=0 @@ -974,7 +924,6 @@ def read_xml(xml_file): if vv.tag==link+'Point': c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 - #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: break co_img.append(np.array(c_t_in)) @@ -982,7 +931,6 @@ def read_xml(xml_file): elif tag.endswith('}SeparatorRegion') or tag.endswith('}separatorregion'): - #print('sth') for nn in root1.iter(tag): c_t_in=[] sumi=0 @@ -1001,7 +949,6 @@ def read_xml(xml_file): if vv.tag==link+'Point': c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 - #print(vv.tag,'in') elif vv.tag!=link+'Point' and sumi>=1: break co_sep.append(np.array(c_t_in)) @@ -1009,7 +956,6 @@ def read_xml(xml_file): elif tag.endswith('}TableRegion') or tag.endswith('}tableregion'): - #print('sth') for nn in root1.iter(tag): c_t_in=[] sumi=0 @@ -1028,14 +974,13 @@ def read_xml(xml_file): if vv.tag==link+'Point': c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 - #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: break co_table.append(np.array(c_t_in)) co_table_text.append(' ') elif tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'): - #print('sth') for nn in root1.iter(tag): c_t_in=[] sumi=0 @@ -1054,40 +999,22 @@ def read_xml(xml_file): if vv.tag==link+'Point': c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 - #print(vv.tag,'in') + elif vv.tag!=link+'Point' and sumi>=1: break co_noise.append(np.array(c_t_in)) co_noise_text.append(' ') - img = np.zeros( (y_len,x_len,3) ) - img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(1,1,1)) img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(2,2,2)) img_poly=cv2.fillPoly(img, pts =co_text_header, color=(2,2,2)) - #img_poly=cv2.fillPoly(img, pts =co_text_catch, color=(125,255,125)) - #img_poly=cv2.fillPoly(img, pts =co_text_signature_mark, color=(125,125,0)) - #img_poly=cv2.fillPoly(img, pts =co_graphic_decoration, color=(1,125,255)) - #img_poly=cv2.fillPoly(img, pts =co_text_page_number, color=(1,125,0)) img_poly=cv2.fillPoly(img, pts =co_text_marginalia, color=(3,3,3)) - #img_poly=cv2.fillPoly(img, pts =co_text_drop, color=(1,125,255)) - - #img_poly=cv2.fillPoly(img, pts =co_graphic_text_annotation, color=(125,0,125)) img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) - #img_poly=cv2.fillPoly(img, pts =co_table, color=(1,255,255)) - #img_poly=cv2.fillPoly(img, pts =co_graphic, color=(255,125,125)) - #img_poly=cv2.fillPoly(img, pts =co_noise, color=(255,0,255)) - - #print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg') - ###try: - ####print('yazdimmm',self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg') - ###cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('-')[1].split('.')[0]+'.jpg',img_poly ) - ###except: - 
###cv2.imwrite(self.output_dir+'/'+self.gt_list[index].split('.')[0]+'.jpg',img_poly ) - return file_name, id_paragraph, id_header,co_text_paragraph, co_text_header,\ + + return tree1, root1, file_name, id_paragraph, id_header,co_text_paragraph, co_text_header,\ tot_region_ref,x_len, y_len,index_tot_regions, img_poly @@ -1113,3 +1040,24 @@ def make_image_from_bb(width_l, height_l, bb_all): for i in range(bb_all.shape[0]): img_remade[bb_all[i,1]:bb_all[i,1]+bb_all[i,3],bb_all[i,0]:bb_all[i,0]+bb_all[i,2] ] = 1 return img_remade + +def update_list_and_return_first_with_length_bigger_than_one(index_element_to_be_updated, innner_index_pr_pos, pr_list, pos_list,list_inp): + list_inp.pop(index_element_to_be_updated) + if len(pr_list)>0: + list_inp.insert(index_element_to_be_updated, pr_list) + else: + index_element_to_be_updated = index_element_to_be_updated -1 + + list_inp.insert(index_element_to_be_updated+1, [innner_index_pr_pos]) + if len(pos_list)>0: + list_inp.insert(index_element_to_be_updated+2, pos_list) + + len_all_elements = [len(i) for i in list_inp] + list_len_bigger_1 = np.where(np.array(len_all_elements)>1) + list_len_bigger_1 = list_len_bigger_1[0] + + if len(list_len_bigger_1)>0: + early_list_bigger_than_one = list_len_bigger_1[0] + else: + early_list_bigger_than_one = -20 + return list_inp, early_list_bigger_than_one diff --git a/inference.py b/inference.py index 94e318d..73b4ed8 100644 --- a/inference.py +++ b/inference.py @@ -11,13 +11,11 @@ from tensorflow.keras import layers import tensorflow.keras.losses from tensorflow.keras.layers import * from models import * +from gt_gen_utils import * import click import json from tensorflow.python.keras import backend as tensorflow_backend - - - - +import xml.etree.ElementTree as ET with warnings.catch_warnings(): @@ -29,7 +27,7 @@ Tool to load model and predict for given image. 
""" class sbb_predict: - def __init__(self,image, model, task, config_params_model, patches, save, ground_truth): + def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file): self.image=image self.patches=patches self.save=save @@ -37,6 +35,7 @@ class sbb_predict: self.ground_truth=ground_truth self.task=task self.config_params_model=config_params_model + self.xml_file = xml_file def resize_image(self,img_in,input_height,input_width): return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) @@ -166,7 +165,7 @@ class sbb_predict: ##if self.weights_dir!=None: ##self.model.load_weights(self.weights_dir) - if self.task != 'classification': + if (self.task != 'classification' and self.task != 'reading_order'): self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1] self.img_width=self.model.layers[len(self.model.layers)-1].output_shape[2] self.n_classes=self.model.layers[len(self.model.layers)-1].output_shape[3] @@ -233,6 +232,178 @@ class sbb_predict: index_class = np.argmax(label_p_pred[0]) print("Predicted Class: {}".format(classes_names[str(int(index_class))])) + elif self.task == 'reading_order': + img_height = self.config_params_model['input_height'] + img_width = self.config_params_model['input_width'] + + tree_xml, root_xml, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = read_xml(self.xml_file) + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(co_text_header) + + img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + + for j in range(len(cy_main)): + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 + + co_text_all = co_text_paragraph + co_text_header + id_all_text = id_paragraph + id_header + + ##texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] + ##texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] + texts_corr_order_index_int = list(np.array(range(len(co_text_all)))) + + min_area = 0 + max_area = 1 + + co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area) + + labels_con = np.zeros((y_len,x_len,len(co_text_all)),dtype='uint8') + for i in range(len(co_text_all)): + img_label = np.zeros((y_len,x_len,3),dtype='uint8') + img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1)) + labels_con[:,:,i] = img_label[:,:,0] + + img3= np.copy(img_poly) + labels_con = resize_image(labels_con, img_height, img_width) + + img_header_and_sep = resize_image(img_header_and_sep, img_height, img_width) + + img3= resize_image (img3, img_height, img_width) + img3 = img3.astype(np.uint16) + + inference_bs = 1#4 + + input_1= np.zeros( (inference_bs, img_height, img_width,3)) + + + starting_list_of_regions = [] + starting_list_of_regions.append( list(range(labels_con.shape[2])) ) + + index_update = 0 + index_selected = starting_list_of_regions[0] + + scalibility_num = 0 + while index_update>=0: + ij_list = starting_list_of_regions[index_update] + i = ij_list[0] + ij_list.pop(0) + + + pr_list = [] + post_list = [] + + batch_counter = 0 + tot_counter = 1 + + tot_iteration = len(ij_list) + full_bs_ite= tot_iteration//inference_bs + last_bs = tot_iteration % inference_bs + + jbatch_indexer =[] + for j in ij_list: + img1= np.repeat(labels_con[:,:,i][:, :, np.newaxis], 3, 
axis=2) + img2 = np.repeat(labels_con[:,:,j][:, :, np.newaxis], 3, axis=2) + + + img2[:,:,0][img3[:,:,0]==5] = 2 + img2[:,:,0][img_header_and_sep[:,:]==1] = 3 + + + + img1[:,:,0][img3[:,:,0]==5] = 2 + img1[:,:,0][img_header_and_sep[:,:]==1] = 3 + + #input_1= np.zeros( (height1, width1,3)) + + + jbatch_indexer.append(j) + + input_1[batch_counter,:,:,0] = img1[:,:,0]/3. + input_1[batch_counter,:,:,2] = img2[:,:,0]/3. + input_1[batch_counter,:,:,1] = img3[:,:,0]/5. + #input_1[batch_counter,:,:,:]= np.zeros( (batch_counter, height1, width1,3)) + batch_counter = batch_counter+1 + + #input_1[:,:,0] = img1[:,:,0]/3. + #input_1[:,:,2] = img2[:,:,0]/3. + #input_1[:,:,1] = img3[:,:,0]/5. + + if batch_counter==inference_bs or ( (tot_counter//inference_bs)==full_bs_ite and tot_counter%inference_bs==last_bs): + y_pr = self.model.predict(input_1 , verbose=0) + scalibility_num = scalibility_num+1 + + if batch_counter==inference_bs: + iteration_batches = inference_bs + else: + iteration_batches = last_bs + for jb in range(iteration_batches): + if y_pr[jb][0]>=0.5: + post_list.append(jbatch_indexer[jb]) + else: + pr_list.append(jbatch_indexer[jb]) + + batch_counter = 0 + jbatch_indexer = [] + + tot_counter = tot_counter+1 + + starting_list_of_regions, index_update = update_list_and_return_first_with_length_bigger_than_one(index_update, i, pr_list, post_list,starting_list_of_regions) + + index_sort = [i[0] for i in starting_list_of_regions ] + + + alltags=[elem.tag for elem in root_xml.iter()] + + + + link=alltags[0].split('}')[0]+'}' + name_space = alltags[0].split('}')[0] + name_space = name_space.split('{')[1] + + page_element = root_xml.find(link+'Page') + + """ + ro_subelement = ET.SubElement(page_element, 'ReadingOrder') + #print(page_element, 'page_element') + + #new_element = ET.Element('ReadingOrder') + + new_element_element = ET.Element('OrderedGroup') + new_element_element.set('id', "ro357564684568544579089") + + for index, id_text in enumerate(id_all_text): + new_element_2 = ET.Element('RegionRefIndexed') + new_element_2.set('regionRef', id_all_text[index]) + new_element_2.set('index', str(index_sort[index])) + + new_element_element.append(new_element_2) + + ro_subelement.append(new_element_element) + """ + ##ro_subelement = ET.SubElement(page_element, 'ReadingOrder') + + ro_subelement = ET.Element('ReadingOrder') + + ro_subelement2 = ET.SubElement(ro_subelement, 'OrderedGroup') + ro_subelement2.set('id', "ro357564684568544579089") + + for index, id_text in enumerate(id_all_text): + new_element_2 = ET.SubElement(ro_subelement2, 'RegionRefIndexed') + new_element_2.set('regionRef', id_all_text[index]) + new_element_2.set('index', str(index_sort[index])) + + if link+'PrintSpace' in alltags: + page_element.insert(1, ro_subelement) + else: + page_element.insert(0, ro_subelement) + + #page_element[0].append(new_element) + #root_xml.append(new_element) + alltags=[elem.tag for elem in root_xml.iter()] + + ET.register_namespace("",name_space) + tree_xml.write('library2.xml',xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) + #tree_xml.write('library2.xml') + else: if self.patches: #def textline_contours(img,input_width,input_height,n_classes,model): @@ -356,7 +527,7 @@ class sbb_predict: def run(self): res=self.predict() - if self.task == 'classification': + if (self.task == 'classification' or self.task == 'reading_order'): pass else: img_seg_overlayed = self.visualize_model_output(res, self.img_org, self.task) @@ -397,15 +568,20 @@ class sbb_predict: "-gt", help="ground truth 
directory if you want to see the iou of prediction.", ) -def main(image, model, patches, save, ground_truth): +@click.option( + "--xml_file", + "-xml", + help="xml file with layout coordinates that reading order detection will be implemented on. The result will be written in the same xml file.", +) +def main(image, model, patches, save, ground_truth, xml_file): with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] - if task != 'classification': + if (task != 'classification' and task != 'reading_order'): if not save: print("Error: You used one of segmentation or binarization task but not set -s, you need a filename to save visualized output with -s") sys.exit(1) - x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth) + x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file) x.run() if __name__=="__main__": From f6abefb0a8191ebb420d3ddead45a46bf398076b Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 29 May 2024 11:18:35 +0200 Subject: [PATCH 22/62] reading order detection on xml with layout + result will be written in an output directory with the same file name --- gt_gen_utils.py | 74 +++++++++++++++++++++++++++++++++++++++++++------ inference.py | 45 ++++++++++++++++++++++-------- 2 files changed, 99 insertions(+), 20 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 0286ac7..8f72fb8 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -664,6 +664,58 @@ def read_xml(xml_file): for jj in root1.iter(link+'RegionRefIndexed'): index_tot_regions.append(jj.attrib['index']) tot_region_ref.append(jj.attrib['regionRef']) + + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): + co_printspace = [] + if link+'PrintSpace' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) + elif link+'Border' in alltags: + region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) + + for tag in region_tags_printspace: + if link+'PrintSpace' in alltags: + tag_endings_printspace = ['}PrintSpace','}printspace'] + elif link+'Border' in alltags: + tag_endings_printspace = ['}Border','}border'] + + if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_printspace.append(np.array(c_t_in)) + img_printspace = np.zeros( (y_len,x_len,3) ) + img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) + img_printspace = img_printspace.astype(np.uint8) + + imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) + cnt = contours[np.argmax(cnt_size)] + x, y, w, h = cv2.boundingRect(cnt) + + bb_coord_printspace = [x, y, w, h] + + else: + bb_coord_printspace = None + region_tags=np.unique([x for x in alltags if 
x.endswith('Region')]) co_text_paragraph=[] @@ -754,7 +806,7 @@ def read_xml(xml_file): c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) elif "type" in nn.attrib and nn.attrib['type']=='heading': - id_heading.append(nn.attrib['id']) + ##id_heading.append(nn.attrib['id']) c_t_in_heading.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) @@ -763,7 +815,7 @@ def read_xml(xml_file): c_t_in_signature_mark.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) #print(c_t_in_paragraph) elif "type" in nn.attrib and nn.attrib['type']=='header': - id_header.append(nn.attrib['id']) + #id_header.append(nn.attrib['id']) c_t_in_header.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) @@ -776,11 +828,11 @@ def read_xml(xml_file): ###c_t_in_page_number.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) elif "type" in nn.attrib and nn.attrib['type']=='marginalia': - id_marginalia.append(nn.attrib['id']) + #id_marginalia.append(nn.attrib['id']) c_t_in_marginalia.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) else: - id_paragraph.append(nn.attrib['id']) + #id_paragraph.append(nn.attrib['id']) c_t_in_paragraph.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) @@ -796,7 +848,7 @@ def read_xml(xml_file): sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='heading': - id_heading.append(nn.attrib['id']) + #id_heading.append(nn.attrib['id']) c_t_in_heading.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 @@ -806,7 +858,7 @@ def read_xml(xml_file): c_t_in_signature_mark.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='header': - id_header.append(nn.attrib['id']) + #id_header.append(nn.attrib['id']) c_t_in_header.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 @@ -821,13 +873,13 @@ def read_xml(xml_file): ###sumi+=1 elif "type" in nn.attrib and nn.attrib['type']=='marginalia': - id_marginalia.append(nn.attrib['id']) + #id_marginalia.append(nn.attrib['id']) c_t_in_marginalia.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 else: - id_paragraph.append(nn.attrib['id']) + #id_paragraph.append(nn.attrib['id']) c_t_in_paragraph.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ]) sumi+=1 @@ -838,11 +890,14 @@ def read_xml(xml_file): co_text_drop.append(np.array(c_t_in_drop)) if len(c_t_in_paragraph)>0: co_text_paragraph.append(np.array(c_t_in_paragraph)) + id_paragraph.append(nn.attrib['id']) if len(c_t_in_heading)>0: co_text_heading.append(np.array(c_t_in_heading)) + id_heading.append(nn.attrib['id']) if len(c_t_in_header)>0: co_text_header.append(np.array(c_t_in_header)) + id_header.append(nn.attrib['id']) if len(c_t_in_page_number)>0: co_text_page_number.append(np.array(c_t_in_page_number)) if len(c_t_in_catch)>0: @@ -853,6 +908,7 @@ def read_xml(xml_file): if len(c_t_in_marginalia)>0: co_text_marginalia.append(np.array(c_t_in_marginalia)) + id_marginalia.append(nn.attrib['id']) elif tag.endswith('}GraphicRegion') or tag.endswith('}graphicregion'): @@ -1014,7 +1070,7 @@ def read_xml(xml_file): img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) - return tree1, root1, file_name, id_paragraph, id_header,co_text_paragraph, 
co_text_header,\ + return tree1, root1, bb_coord_printspace, file_name, id_paragraph, id_header+id_heading, co_text_paragraph, co_text_header+co_text_heading,\ tot_region_ref,x_len, y_len,index_tot_regions, img_poly diff --git a/inference.py b/inference.py index 73b4ed8..28445e8 100644 --- a/inference.py +++ b/inference.py @@ -16,6 +16,7 @@ import click import json from tensorflow.python.keras import backend as tensorflow_backend import xml.etree.ElementTree as ET +import matplotlib.pyplot as plt with warnings.catch_warnings(): @@ -27,7 +28,7 @@ Tool to load model and predict for given image. """ class sbb_predict: - def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file): + def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file, out): self.image=image self.patches=patches self.save=save @@ -36,6 +37,7 @@ class sbb_predict: self.task=task self.config_params_model=config_params_model self.xml_file = xml_file + self.out = out def resize_image(self,img_in,input_height,input_width): return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) @@ -236,16 +238,18 @@ class sbb_predict: img_height = self.config_params_model['input_height'] img_width = self.config_params_model['input_width'] - tree_xml, root_xml, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = read_xml(self.xml_file) + tree_xml, root_xml, bb_coord_printspace, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = read_xml(self.xml_file) _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(co_text_header) img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') + for j in range(len(cy_main)): img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 co_text_all = co_text_paragraph + co_text_header id_all_text = id_paragraph + id_header + ##texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] ##texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] @@ -253,8 +257,9 @@ class sbb_predict: min_area = 0 max_area = 1 + - co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area) + ##co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area) labels_con = np.zeros((y_len,x_len,len(co_text_all)),dtype='uint8') for i in range(len(co_text_all)): @@ -262,6 +267,18 @@ class sbb_predict: img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1)) labels_con[:,:,i] = img_label[:,:,0] + if bb_coord_printspace: + #bb_coord_printspace[x,y,w,h,_,_] + x = bb_coord_printspace[0] + y = bb_coord_printspace[1] + w = bb_coord_printspace[2] + h = bb_coord_printspace[3] + labels_con = labels_con[y:y+h, x:x+w, :] + img_poly = img_poly[y:y+h, x:x+w, :] + img_header_and_sep = img_header_and_sep[y:y+h, x:x+w] + + + img3= np.copy(img_poly) labels_con = resize_image(labels_con, img_height, img_width) @@ -347,9 +364,11 @@ class sbb_predict: tot_counter = tot_counter+1 starting_list_of_regions, index_update = update_list_and_return_first_with_length_bigger_than_one(index_update, i, pr_list, post_list,starting_list_of_regions) - + + index_sort = [i[0] for i in starting_list_of_regions 
] + id_all_text = np.array(id_all_text)[index_sort] alltags=[elem.tag for elem in root_xml.iter()] @@ -389,19 +408,17 @@ class sbb_predict: for index, id_text in enumerate(id_all_text): new_element_2 = ET.SubElement(ro_subelement2, 'RegionRefIndexed') new_element_2.set('regionRef', id_all_text[index]) - new_element_2.set('index', str(index_sort[index])) + new_element_2.set('index', str(index)) - if link+'PrintSpace' in alltags: + if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): page_element.insert(1, ro_subelement) else: page_element.insert(0, ro_subelement) - #page_element[0].append(new_element) - #root_xml.append(new_element) alltags=[elem.tag for elem in root_xml.iter()] ET.register_namespace("",name_space) - tree_xml.write('library2.xml',xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) + tree_xml.write(os.path.join(self.out, file_name+'.xml'),xml_declaration=True,method='xml',encoding="utf8",default_namespace=None) #tree_xml.write('library2.xml') else: @@ -545,6 +562,12 @@ class sbb_predict: help="image filename", type=click.Path(exists=True, dir_okay=False), ) +@click.option( + "--out", + "-o", + help="output directory where xml with detected reading order will be written.", + type=click.Path(exists=True, file_okay=False), +) @click.option( "--patches/--no-patches", "-p/-nop", @@ -573,7 +596,7 @@ class sbb_predict: "-xml", help="xml file with layout coordinates that reading order detection will be implemented on. The result will be written in the same xml file.", ) -def main(image, model, patches, save, ground_truth, xml_file): +def main(image, model, patches, save, ground_truth, xml_file, out): with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] @@ -581,7 +604,7 @@ def main(image, model, patches, save, ground_truth, xml_file): if not save: print("Error: You used one of segmentation or binarization task but not set -s, you need a filename to save visualized output with -s") sys.exit(1) - x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file) + x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file, out) x.run() if __name__=="__main__": From 785033536a28e6aa93df8c7798426037d28ccc80 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 29 May 2024 13:07:06 +0200 Subject: [PATCH 23/62] min_area size of regions considered for reading order detection passed as an argument for inference --- gt_gen_utils.py | 13 +++++++++++-- inference.py | 29 +++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 8f72fb8..d3dd7df 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -32,10 +32,16 @@ def filter_contours_area_of_image_tables(image, contours, hierarchy, max_area, m jv = 0 for c in contours: + if len(np.shape(c)) == 3: + c = c[0] + elif len(np.shape(c)) == 2: + pass + #c = c[0] if len(c) < 3: # A polygon cannot have less than 3 points continue - polygon = geometry.Polygon([point[0] for point in c]) + c_e = [point for point in c] + polygon = geometry.Polygon(c_e) # area = cv2.contourArea(c) area = polygon.area # Check that polygon has area greater than minimal area @@ -49,7 +55,10 @@ def filter_contours_area_of_image(image, contours, order_index, max_area, min_ar order_index_filtered = list() #jv = 0 for jv, c in enumerate(contours): - c = c[0] + if len(np.shape(c)) == 3: + c = c[0] + elif len(np.shape(c)) == 2: + pass if len(c) < 
3: # A polygon cannot have less than 3 points continue c_e = [point for point in c] diff --git a/inference.py b/inference.py index 28445e8..c7a8b02 100644 --- a/inference.py +++ b/inference.py @@ -28,7 +28,7 @@ Tool to load model and predict for given image. """ class sbb_predict: - def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file, out): + def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file, out, min_area): self.image=image self.patches=patches self.save=save @@ -38,6 +38,10 @@ class sbb_predict: self.config_params_model=config_params_model self.xml_file = xml_file self.out = out + if min_area: + self.min_area = float(min_area) + else: + self.min_area = 0 def resize_image(self,img_in,input_height,input_width): return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) @@ -255,11 +259,18 @@ class sbb_predict: ##texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] texts_corr_order_index_int = list(np.array(range(len(co_text_all)))) - min_area = 0 + #print(texts_corr_order_index_int) + max_area = 1 + #print(np.shape(co_text_all[0]), len( np.shape(co_text_all[0]) ),'co_text_all') + #co_text_all = filter_contours_area_of_image_tables(img_poly, co_text_all, _, max_area, min_area) + #print(co_text_all,'co_text_all') + co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, self.min_area) - - ##co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area) + #print(texts_corr_order_index_int) + + #co_text_all = [co_text_all[index] for index in texts_corr_order_index_int] + id_all_text = [id_all_text[index] for index in texts_corr_order_index_int] labels_con = np.zeros((y_len,x_len,len(co_text_all)),dtype='uint8') for i in range(len(co_text_all)): @@ -596,7 +607,13 @@ class sbb_predict: "-xml", help="xml file with layout coordinates that reading order detection will be implemented on. The result will be written in the same xml file.", ) -def main(image, model, patches, save, ground_truth, xml_file, out): + +@click.option( + "--min_area", + "-min", + help="min area size of regions considered for reading order detection. 
The default value is zero and means that all text regions are considered for reading order.", +) +def main(image, model, patches, save, ground_truth, xml_file, out, min_area): with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] @@ -604,7 +621,7 @@ def main(image, model, patches, save, ground_truth, xml_file, out): if not save: print("Error: You used one of segmentation or binarization task but not set -s, you need a filename to save visualized output with -s") sys.exit(1) - x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file, out) + x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file, out, min_area) x.run() if __name__=="__main__": From 4640d9f2dc0a0db7462a9a2f5f779f40fe4d217d Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 30 May 2024 12:56:56 +0200 Subject: [PATCH 24/62] modifying xml parsing --- gt_gen_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index d3dd7df..debaf15 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -122,7 +122,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ ## to do: add footnote to text regions for index in tqdm(range(len(gt_list))): #try: - tree1 = ET.parse(dir_in+'/'+gt_list[index]) + tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding = 'iso-8859-5')) root1=tree1.getroot() alltags=[elem.tag for elem in root1.iter()] link=alltags[0].split('}')[0]+'}' @@ -658,7 +658,7 @@ def find_new_features_of_contours(contours_main): return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin def read_xml(xml_file): file_name = Path(xml_file).stem - tree1 = ET.parse(xml_file) + tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5')) root1=tree1.getroot() alltags=[elem.tag for elem in root1.iter()] link=alltags[0].split('}')[0]+'}' From 821290c4644dcfdd2128f2eff1afc56689906a68 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 30 May 2024 16:59:50 +0200 Subject: [PATCH 25/62] scaling and cropping of labels and org images --- custom_config_page2label.json | 5 +- generate_gt_for_training.py | 34 ++++++--- gt_gen_utils.py | 125 +++++++++++++++++++++++++++++++--- 3 files changed, 145 insertions(+), 19 deletions(-) diff --git a/custom_config_page2label.json b/custom_config_page2label.json index e4c02cb..9116ce3 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,9 +1,8 @@ { -"use_case": "layout", +"use_case": "textline", "textregions":{ "rest_as_paragraph": 1, "header":2 , "heading":2 , "marginalia":3 }, "imageregion":4, "separatorregion":5, "graphicregions" :{"rest_as_decoration":6}, -"artificial_class_on_boundry": ["paragraph"], -"artificial_class_label":7 +"columns_width":{"1":1000, "2":1300, "3":1600, "4":2000, "5":2300, "6":2500} } diff --git a/generate_gt_for_training.py b/generate_gt_for_training.py index cf2b2a6..752090c 100644 --- a/generate_gt_for_training.py +++ b/generate_gt_for_training.py @@ -14,10 +14,22 @@ def main(): help="directory of GT page-xml files", type=click.Path(exists=True, file_okay=False), ) +@click.option( + "--dir_images", + "-di", + help="directory of org images. If print space cropping or scaling is needed for labels it would be great to provide the original images to apply the same function on them. 
So if -ps is not set true or in config files no columns_width key is given this argumnet can be ignored. File stems in this directory should be the same as those in dir_xml.", + type=click.Path(exists=True, file_okay=False), +) +@click.option( + "--dir_out_images", + "-doi", + help="directory where the output org images after undergoing a process (like print space cropping or scaling) will be written.", + type=click.Path(exists=True, file_okay=False), +) @click.option( "--dir_out", "-do", - help="directory where ground truth images would be written", + help="directory where ground truth label images would be written", type=click.Path(exists=True, file_okay=False), ) @@ -33,8 +45,14 @@ def main(): "-to", help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.", ) +@click.option( + "--printspace", + "-ps", + is_flag=True, + help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.", +) -def pagexml2label(dir_xml,dir_out,type_output,config): +def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images): if config: with open(config) as f: config_params = json.load(f) @@ -42,7 +60,7 @@ def pagexml2label(dir_xml,dir_out,type_output,config): print("passed") config_params = None gt_list = get_content_of_dir(dir_xml) - get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params) + get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images) @main.command() @click.option( @@ -181,7 +199,7 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i for i in range(len(texts_corr_order_index_int)): for j in range(len(texts_corr_order_index_int)): if i!=j: - input_matrix = np.zeros((input_height,input_width,3)).astype(np.int8) + input_multi_visual_modal = np.zeros((input_height,input_width,3)).astype(np.int8) final_f_name = f_name+'_'+str(indexer+indexer_start) order_class_condition = texts_corr_order_index_int[i]-texts_corr_order_index_int[j] if order_class_condition<0: @@ -189,13 +207,13 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i else: class_type = 0 - input_matrix[:,:,0] = resize_image(labels_con[:,:,i], input_height, input_width) - input_matrix[:,:,1] = resize_image(img_poly[:,:,0], input_height, input_width) - input_matrix[:,:,2] = resize_image(labels_con[:,:,j], input_height, input_width) + input_multi_visual_modal[:,:,0] = resize_image(labels_con[:,:,i], input_height, input_width) + input_multi_visual_modal[:,:,1] = resize_image(img_poly[:,:,0], input_height, input_width) + input_multi_visual_modal[:,:,2] = resize_image(labels_con[:,:,j], input_height, input_width) np.save(os.path.join(dir_out_classes,final_f_name+'.npy' ), class_type) - cv2.imwrite(os.path.join(dir_out_modal_image,final_f_name+'.png' ), input_matrix) + cv2.imwrite(os.path.join(dir_out_modal_image,final_f_name+'.png' ), input_multi_visual_modal) indexer = indexer+1 diff --git a/gt_gen_utils.py b/gt_gen_utils.py index debaf15..d3e95e8 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -115,11 +115,15 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y 
img_boundary[:,:][boundary[:,:]==1] =1 return co_text_eroded, img_boundary -def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params): +def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images): """ Reading the page xml files and write the ground truth images into given output directory. """ ## to do: add footnote to text regions + + if dir_images: + ls_org_imgs = os.listdir(dir_images) + ls_org_imgs_stem = [item.split('.')[0] for item in ls_org_imgs] for index in tqdm(range(len(gt_list))): #try: tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding = 'iso-8859-5')) @@ -133,6 +137,72 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ y_len=int(jj.attrib['imageHeight']) x_len=int(jj.attrib['imageWidth']) + if 'columns_width' in list(config_params.keys()): + columns_width_dict = config_params['columns_width'] + metadata_element = root1.find(link+'Metadata') + comment_is_sub_element = False + for child in metadata_element: + tag2 = child.tag + if tag2.endswith('}Comments') or tag2.endswith('}comments'): + text_comments = child.text + num_col = int(text_comments.split('num_col')[1]) + comment_is_sub_element = True + if not comment_is_sub_element: + num_col = None + + if num_col: + x_new = columns_width_dict[str(num_col)] + y_new = int ( x_new * (y_len / float(x_len)) ) + + if printspace: + region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')]) + co_use_case = [] + + for tag in region_tags: + tag_endings = ['}PrintSpace','}Border'] + + if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): + for nn in root1.iter(tag): + c_t_in = [] + sumi = 0 + for vv in nn.iter(): + # check the format of coords + if vv.tag == link + 'Coords': + coords = bool(vv.attrib) + if coords: + p_h = vv.attrib['points'].split(' ') + c_t_in.append( + np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) + break + else: + pass + + if vv.tag == link + 'Point': + c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) + sumi += 1 + elif vv.tag != link + 'Point' and sumi >= 1: + break + co_use_case.append(np.array(c_t_in)) + + img = np.zeros((y_len, x_len, 3)) + + img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) + + img_poly = img_poly.astype(np.uint8) + + imgray = cv2.cvtColor(img_poly, cv2.COLOR_BGR2GRAY) + _, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) + + cnt = contours[np.argmax(cnt_size)] + + x, y, w, h = cv2.boundingRect(cnt) + bb_xywh = [x, y, w, h] + + if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): keys = list(config_params.keys()) if "artificial_class_label" in keys: @@ -186,7 +256,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ co_use_case.append(np.array(c_t_in)) - if "artificial_class_label" in keys: img_boundary = np.zeros((y_len, x_len)) erosion_rate = 1 @@ -205,12 +274,32 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] 
img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + + + if printspace and config_params['use_case']!='printspace': + img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + + if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': + img_poly = resize_image(img_poly, y_new, x_new) try: - cv2.imwrite(output_dir + '/' + gt_list[index].split('-')[1].split('.')[0] + '.png', - img_poly) + xml_file_stem = gt_list[index].split('-')[1].split('.')[0] + cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) except: - cv2.imwrite(output_dir + '/' + gt_list[index].split('.')[0] + '.png', img_poly) + xml_file_stem = gt_list[index].split('.')[0] + cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) + + if dir_images: + org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + img_org = cv2.imread(os.path.join(dir_images, org_image_name)) + + if printspace and config_params['use_case']!='printspace': + img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + + if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': + img_org = resize_image(img_org, y_new, x_new) + + cv2.imwrite(os.path.join(dir_out_images, org_image_name), img_org) if config_file and config_params['use_case']=='layout': @@ -616,11 +705,31 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ + if printspace: + img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + + if 'columns_width' in list(config_params.keys()) and num_col: + img_poly = resize_image(img_poly, y_new, x_new) - try: - cv2.imwrite(output_dir+'/'+gt_list[index].split('-')[1].split('.')[0]+'.png',img_poly ) + try: + xml_file_stem = gt_list[index].split('-')[1].split('.')[0] + cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) except: - cv2.imwrite(output_dir+'/'+gt_list[index].split('.')[0]+'.png',img_poly ) + xml_file_stem = gt_list[index].split('.')[0] + cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) + + + if dir_images: + org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + img_org = cv2.imread(os.path.join(dir_images, org_image_name)) + + if printspace: + img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + + if 'columns_width' in list(config_params.keys()) and num_col: + img_org = resize_image(img_org, y_new, x_new) + + cv2.imwrite(os.path.join(dir_out_images, org_image_name), img_org) From b9cbc0edb79f5ccd73fafa6f8152ceef623f667c Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 6 Jun 2024 14:38:29 +0200 Subject: [PATCH 26/62] replacement in a list done correctly --- gt_gen_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index d3e95e8..38e77e8 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -636,7 +636,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if 'textregions' in keys: if 'rest_as_paragraph' in types_text: - types_text[types_text=='rest_as_paragraph'] = 'paragraph' + types_text = ['paragraph'if ttind=='rest_as_paragraph' else ttind for ttind in types_text] for element_text in types_text: if element_text == 'paragraph': color_label = labels_rgb_color[ config_params['textregions']['rest_as_paragraph']] @@ -688,7 +688,7 @@ def 
get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if 'textregions' in keys: if 'rest_as_paragraph' in types_text: - types_text[types_text=='rest_as_paragraph'] = 'paragraph' + types_text = ['paragraph'if ttind=='rest_as_paragraph' else ttind for ttind in types_text] for element_text in types_text: if element_text == 'paragraph': color_label = config_params['textregions']['rest_as_paragraph'] From e25a92516970ce1f08713e8f0fd3a69c8294cbad Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 6 Jun 2024 14:46:06 +0200 Subject: [PATCH 27/62] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 899c9a3..b9e70a8 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,6 @@ The output folder should be an empty folder where the output model will be writt * weighted_loss: If ``true``, this means that you want to apply weighted categorical_crossentropy as loss fucntion. Be carefull if you set to ``true``the parameter "is_loss_soft_dice" should be ``false`` * data_is_provided: If you have already provided the input data you can set this to ``true``. Be sure that the train and eval data are in "dir_output". Since when once we provide training data we resize and augment them and then we write them in sub-directories train and eval in "dir_output". * dir_train: This is the directory of "images" and "labels" (dir_train should include two subdirectories with names of images and labels ) for raw images and labels. Namely they are not prepared (not resized and not augmented) yet for training the model. When we run this tool these raw data will be transformed to suitable size needed for the model and they will be written in "dir_output" in train and eval directories. Each of train and eval include "images" and "labels" sub-directories. + +#### Additional documentation +Please check the [wiki](https://github.com/qurator-spk/sbb_pixelwise_segmentation/wiki). 
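
The print-space cropping and column-based scaling added in PATCH 25 have to be applied identically to the label image and to the original page image, otherwise the two drift out of pixel alignment. Below is a minimal standalone sketch of that flow; the helper name crop_and_scale is illustrative and not code from the repository, while the columns_width mapping mirrors the one in custom_config_page2label.json. Nearest-neighbour interpolation is used so that integer class values in the label image are not blended during resizing.

import cv2

def crop_and_scale(img, bb_xywh, num_col, columns_width):
    # bb_xywh = [x, y, w, h] of the PrintSpace/Border bounding rectangle
    if bb_xywh is not None:
        x, y, w, h = bb_xywh
        img = img[y:y + h, x:x + w]
    # scale to the target width configured for this column count,
    # keeping the aspect ratio of the (cropped) page
    if num_col is not None and str(num_col) in columns_width:
        y_len, x_len = img.shape[:2]
        x_new = columns_width[str(num_col)]
        y_new = int(x_new * (y_len / float(x_len)))
        img = cv2.resize(img, (x_new, y_new), interpolation=cv2.INTER_NEAREST)
    return img

# label and original page must go through the same call so they stay aligned:
# label = crop_and_scale(label, bb_xywh, num_col=2, columns_width={"2": 1300})
# page  = crop_and_scale(page,  bb_xywh, num_col=2, columns_width={"2": 1300})
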
From 1c8873ffa3bb5fdf53bc23babca62268a3f04d73 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 6 Jun 2024 18:45:47 +0200 Subject: [PATCH 28/62] just defined textregion types can be extracted as label --- gt_gen_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 38e77e8..86eb0a1 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -325,6 +325,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ region_tags=np.unique([x for x in alltags if x.endswith('Region')]) co_text = {'drop-capital':[], "footnote":[], "footnote-continued":[], "heading":[], "signature-mark":[], "header":[], "catch-word":[], "page-number":[], "marginalia":[], "paragraph":[]} + all_defined_textregion_types = list(co_text.keys()) co_graphic = {"handwritten-annotation":[], "decoration":[], "stamp":[], "signature":[]} co_sep=[] co_img=[] @@ -359,7 +360,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ else: if "type" in nn.attrib: - c_t_in[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + if nn.attrib['type'] in all_defined_textregion_types: + c_t_in[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) break else: @@ -384,8 +386,9 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ else: if "type" in nn.attrib: - c_t_in[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) - sumi+=1 + if nn.attrib['type'] in all_defined_textregion_types: + c_t_in[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) + sumi+=1 elif vv.tag!=link+'Point' and sumi>=1: From dc356a5f426f71ab9b036b6acfd1fd7eb815e8ff Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 6 Jun 2024 18:55:22 +0200 Subject: [PATCH 29/62] just defined graphic region types can be extracted as label --- gt_gen_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 86eb0a1..c2360fc 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -327,6 +327,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ co_text = {'drop-capital':[], "footnote":[], "footnote-continued":[], "heading":[], "signature-mark":[], "header":[], "catch-word":[], "page-number":[], "marginalia":[], "paragraph":[]} all_defined_textregion_types = list(co_text.keys()) co_graphic = {"handwritten-annotation":[], "decoration":[], "stamp":[], "signature":[]} + all_defined_graphic_types = list(co_graphic.keys()) co_sep=[] co_img=[] co_table=[] @@ -425,7 +426,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ else: if "type" in nn.attrib: - c_t_in_graphic[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + if nn.attrib['type'] in all_defined_graphic_types: + c_t_in_graphic[nn.attrib['type']].append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) break else: @@ -450,8 +452,9 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ else: if "type" in nn.attrib: - c_t_in_graphic[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ] ) - sumi+=1 + if nn.attrib['type'] in all_defined_graphic_types: + c_t_in_graphic[nn.attrib['type']].append( [ int(float(vv.attrib['x'])) , 
int(float(vv.attrib['y'])) ] ) + sumi+=1 elif vv.tag!=link+'Point' and sumi>=1: break From 815e5a1d352784f1b754c176b7b3d0e0455b0cdf Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 7 Jun 2024 16:24:31 +0200 Subject: [PATCH 30/62] updating train.py --- train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train.py b/train.py index f338c78..e16745f 100644 --- a/train.py +++ b/train.py @@ -59,6 +59,8 @@ def config_params(): pretraining = False # Set to true to load pretrained weights of ResNet50 encoder. scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. + rotation = False # If true, a 90 degree rotation will be implemeneted. + rotation_not_90 = False # If true rotation based on provided angles with thetha will be implemeneted. scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. thetha = None # Rotate image by these angles for augmentation. From 41a0e15e796976da98fb3236ec9eee26d14d3cb1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Mon, 10 Jun 2024 22:15:30 +0200 Subject: [PATCH 31/62] updating train.py nontransformer backend --- models.py | 13 +++++++++---- train.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/models.py b/models.py index d852ac3..b8b0d27 100644 --- a/models.py +++ b/models.py @@ -30,8 +30,8 @@ class Patches(layers.Layer): self.patch_size = patch_size def call(self, images): - print(tf.shape(images)[1],'images') - print(self.patch_size,'self.patch_size') + #print(tf.shape(images)[1],'images') + #print(self.patch_size,'self.patch_size') batch_size = tf.shape(images)[0] patches = tf.image.extract_patches( images=images, @@ -41,7 +41,7 @@ class Patches(layers.Layer): padding="VALID", ) patch_dims = patches.shape[-1] - print(patches.shape,patch_dims,'patch_dims') + #print(patches.shape,patch_dims,'patch_dims') patches = tf.reshape(patches, [batch_size, -1, patch_dims]) return patches def get_config(self): @@ -51,6 +51,7 @@ class Patches(layers.Layer): 'patch_size': self.patch_size, }) return config + class PatchEncoder(layers.Layer): def __init__(self, num_patches, projection_dim): @@ -408,7 +409,11 @@ def vit_resnet50_unet(n_classes, patch_size, num_patches, input_height=224, inpu if pretraining: model = Model(inputs, x).load_weights(resnet50_Weights_path) - num_patches = x.shape[1]*x.shape[2] + #num_patches = x.shape[1]*x.shape[2] + + #patch_size_y = input_height / x.shape[1] + #patch_size_x = input_width / x.shape[2] + #patch_size = patch_size_x * patch_size_y patches = Patches(patch_size)(x) # Encode patches. 
encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) diff --git a/train.py b/train.py index e16745f..84c9d3b 100644 --- a/train.py +++ b/train.py @@ -97,8 +97,6 @@ def run(_config, n_classes, n_epochs, input_height, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): if task == "segmentation" or task == "enhancement": - - num_patches = transformer_num_patches_xy[0]*transformer_num_patches_xy[1] if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') @@ -213,7 +211,15 @@ def run(_config, n_classes, n_epochs, input_height, index_start = 0 if backbone_type=='nontransformer': model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) - elif backbone_type=='nontransformer': + elif backbone_type=='transformer': + num_patches = transformer_num_patches_xy[0]*transformer_num_patches_xy[1] + + if not (num_patches == (input_width / 32) * (input_height / 32)): + print("Error: transformer num patches error. Parameter transformer_num_patches_xy should be set to (input_width/32) = {} and (input_height/32) = {}".format(int(input_width / 32), int(input_height / 32)) ) + sys.exit(1) + if not (transformer_patchsize == 1): + print("Error: transformer patchsize error. Parameter transformer_patchsizeshould set to 1" ) + sys.exit(1) model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width, task, weight_decay, pretraining) #if you want to see the model structure just uncomment model summary. From 2aa216e3885eb34f9927486c642ea51ce1220019 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 11 Jun 2024 17:48:30 +0200 Subject: [PATCH 32/62] binarization as a separate task of segmentation --- train.py | 13 +++++++------ utils.py | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/train.py b/train.py index 84c9d3b..9e06a66 100644 --- a/train.py +++ b/train.py @@ -96,7 +96,7 @@ def run(_config, n_classes, n_epochs, input_height, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): - if task == "segmentation" or task == "enhancement": + if task == "segmentation" or task == "enhancement" or task == "binarization": if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') @@ -194,16 +194,16 @@ def run(_config, n_classes, n_epochs, input_height, if continue_training: if backbone_type=='nontransformer': - if is_loss_soft_dice and task == "segmentation": + if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss and task == "segmentation": + if weighted_loss and (task == "segmentation" or task == "binarization"): model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) if not is_loss_soft_dice and not weighted_loss: model = load_model(dir_of_start_model , compile=True) elif backbone_type=='transformer': - if is_loss_soft_dice and task == "segmentation": + if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) - if weighted_loss and task == 
"segmentation": + if weighted_loss and (task == "segmentation" or task == "binarization"): model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) if not is_loss_soft_dice and not weighted_loss: model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) @@ -224,8 +224,9 @@ def run(_config, n_classes, n_epochs, input_height, #if you want to see the model structure just uncomment model summary. #model.summary() + - if task == "segmentation": + if (task == "segmentation" or task == "binarization"): if not is_loss_soft_dice and not weighted_loss: model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy']) diff --git a/utils.py b/utils.py index a2e8a9c..605d8d1 100644 --- a/utils.py +++ b/utils.py @@ -309,7 +309,7 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize img[i - c] = train_img # add to array - img[0], img[1], and so on. - if task == "segmentation": + if task == "segmentation" or task=="binarization": train_mask = cv2.imread(mask_folder + '/' + filename + '.png') train_mask = get_one_hot(resize_image(train_mask, input_height, input_width), input_height, input_width, n_classes) @@ -569,7 +569,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow indexer = 0 for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): img_name = im.split('.')[0] - if task == "segmentation": + if task == "segmentation" or task == "binarization": dir_of_label_file = os.path.join(dir_seg, img_name + '.png') elif task=="enhancement": dir_of_label_file = os.path.join(dir_seg, im) From f1fd74c7eb485e4ea0cfb53f233404124c5665c6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 12 Jun 2024 13:26:27 +0200 Subject: [PATCH 33/62] transformer patch size is dynamic now. 
--- config_params.json | 28 ++++++++++++++------------- models.py | 47 +++++++++++++++++++++++++++++++++++++--------- train.py | 30 +++++++++++++++++++++-------- 3 files changed, 75 insertions(+), 30 deletions(-) diff --git a/config_params.json b/config_params.json index 8a56de5..6b8b6ed 100644 --- a/config_params.json +++ b/config_params.json @@ -1,42 +1,44 @@ { - "backbone_type" : "nontransformer", - "task": "classification", + "backbone_type" : "transformer", + "task": "binarization", "n_classes" : 2, - "n_epochs" : 20, - "input_height" : 448, - "input_width" : 448, + "n_epochs" : 1, + "input_height" : 224, + "input_width" : 672, "weight_decay" : 1e-6, - "n_batch" : 6, + "n_batch" : 1, "learning_rate": 1e-4, - "f1_threshold_classification": 0.8, "patches" : true, "pretraining" : true, "augmentation" : false, "flip_aug" : false, "blur_aug" : false, "scaling" : true, + "degrading": false, + "brightening": false, "binarization" : false, "scaling_bluring" : false, "scaling_binarization" : false, "scaling_flip" : false, "rotation": false, "rotation_not_90": false, - "transformer_num_patches_xy": [28, 28], - "transformer_patchsize": 1, + "transformer_num_patches_xy": [7, 7], + "transformer_patchsize_x": 3, + "transformer_patchsize_y": 1, + "transformer_projection_dim": 192, "blur_k" : ["blur","guass","median"], "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], "brightness" : [1.3, 1.5, 1.7, 2], "degrade_scales" : [0.2, 0.4], "flip_index" : [0, 1, -1], "thetha" : [10, -10], - "classification_classes_name" : {"0":"apple", "1":"orange"}, "continue_training": false, "index_start" : 0, "dir_of_start_model" : " ", "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "./train", - "dir_eval": "./eval", - "dir_output": "./output" + "dir_train": "/home/vahid/Documents/test/training_data_sample_binarization", + "dir_eval": "/home/vahid/Documents/test/eval", + "dir_output": "/home/vahid/Documents/test/out" } diff --git a/models.py b/models.py index b8b0d27..1abf304 100644 --- a/models.py +++ b/models.py @@ -6,25 +6,49 @@ from tensorflow.keras import layers from tensorflow.keras.regularizers import l2 mlp_head_units = [2048, 1024] -projection_dim = 64 +#projection_dim = 64 transformer_layers = 8 num_heads = 4 resnet50_Weights_path = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' IMAGE_ORDERING = 'channels_last' MERGE_AXIS = -1 -transformer_units = [ - projection_dim * 2, - projection_dim, -] # Size of the transformer layers def mlp(x, hidden_units, dropout_rate): for units in hidden_units: x = layers.Dense(units, activation=tf.nn.gelu)(x) x = layers.Dropout(dropout_rate)(x) return x - class Patches(layers.Layer): + def __init__(self, patch_size_x, patch_size_y):#__init__(self, **kwargs):#:__init__(self, patch_size):#__init__(self, **kwargs): + super(Patches, self).__init__() + self.patch_size_x = patch_size_x + self.patch_size_y = patch_size_y + + def call(self, images): + #print(tf.shape(images)[1],'images') + #print(self.patch_size,'self.patch_size') + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size_y, self.patch_size_x, 1], + strides=[1, self.patch_size_y, self.patch_size_x, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + patch_dims = patches.shape[-1] + patches = tf.reshape(patches, [batch_size, -1, patch_dims]) + return patches + def get_config(self): + + config = super().get_config().copy() + config.update({ + 'patch_size_x': self.patch_size_x, + 'patch_size_y': 
self.patch_size_y, + }) + return config + +class Patches_old(layers.Layer): def __init__(self, patch_size):#__init__(self, **kwargs):#:__init__(self, patch_size):#__init__(self, **kwargs): super(Patches, self).__init__() self.patch_size = patch_size @@ -369,8 +393,13 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return model -def vit_resnet50_unet(n_classes, patch_size, num_patches, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): +def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): inputs = layers.Input(shape=(input_height, input_width, 3)) + + transformer_units = [ + projection_dim * 2, + projection_dim, + ] # Size of the transformer layers IMAGE_ORDERING = 'channels_last' bn_axis=3 @@ -414,7 +443,7 @@ def vit_resnet50_unet(n_classes, patch_size, num_patches, input_height=224, inpu #patch_size_y = input_height / x.shape[1] #patch_size_x = input_width / x.shape[2] #patch_size = patch_size_x * patch_size_y - patches = Patches(patch_size)(x) + patches = Patches(patch_size_x, patch_size_y)(x) # Encode patches. encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) @@ -434,7 +463,7 @@ def vit_resnet50_unet(n_classes, patch_size, num_patches, input_height=224, inpu # Skip connection 2. encoded_patches = layers.Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2], 64]) + encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2] , int( projection_dim / (patch_size_x * patch_size_y) )]) v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) diff --git a/train.py b/train.py index 9e06a66..bafcc9e 100644 --- a/train.py +++ b/train.py @@ -70,8 +70,10 @@ def config_params(): brightness = None # Brighten image for augmentation. flip_index = None # Flip image for augmentation. continue_training = False # Set to true if you would like to continue training an already trained a model. - transformer_patchsize = None # Patch size of vision transformer patches. + transformer_patchsize_x = None # Patch size of vision transformer patches. + transformer_patchsize_y = None transformer_num_patches_xy = None # Number of patches for vision transformer. + transformer_projection_dim = 64 # Transformer projection dimension index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. 
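# Editorial illustration (plain comments/code, not part of the patch): the new x/y patch-size
# parameters documented above have to be consistent with the input size and the projection
# dimension. Checked here with the example values this patch sets in config_params.json,
# mirroring the size checks added to run() further below. The factor 32 is the downsampling
# of the ResNet50 encoder that precedes the transformer.
input_height, input_width = 224, 672
transformer_patchsize_x, transformer_patchsize_y = 3, 1
num_patches_x, num_patches_y = 7, 7               # "transformer_num_patches_xy": [7, 7]
transformer_projection_dim = 192

assert input_height == num_patches_y * transformer_patchsize_y * 32   # 7 * 1 * 32 = 224
assert input_width == num_patches_x * transformer_patchsize_x * 32    # 7 * 3 * 32 = 672
assert transformer_projection_dim % (transformer_patchsize_x * transformer_patchsize_y) == 0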
@@ -92,7 +94,7 @@ def run(_config, n_classes, n_epochs, input_height, brightening, binarization, blur_k, scales, degrade_scales, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, - thetha, scaling_flip, continue_training, transformer_patchsize, + thetha, scaling_flip, continue_training, transformer_projection_dim, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): @@ -212,15 +214,27 @@ def run(_config, n_classes, n_epochs, input_height, if backbone_type=='nontransformer': model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) elif backbone_type=='transformer': - num_patches = transformer_num_patches_xy[0]*transformer_num_patches_xy[1] + num_patches_x = transformer_num_patches_xy[0] + num_patches_y = transformer_num_patches_xy[1] + num_patches = num_patches_x * num_patches_y - if not (num_patches == (input_width / 32) * (input_height / 32)): - print("Error: transformer num patches error. Parameter transformer_num_patches_xy should be set to (input_width/32) = {} and (input_height/32) = {}".format(int(input_width / 32), int(input_height / 32)) ) + ##if not (num_patches == (input_width / 32) * (input_height / 32)): + ##print("Error: transformer num patches error. Parameter transformer_num_patches_xy should be set to (input_width/32) = {} and (input_height/32) = {}".format(int(input_width / 32), int(input_height / 32)) ) + ##sys.exit(1) + #if not (transformer_patchsize == 1): + #print("Error: transformer patchsize error. Parameter transformer_patchsizeshould set to 1" ) + #sys.exit(1) + if (input_height != (num_patches_y * transformer_patchsize_y * 32) ): + print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y * 32)") sys.exit(1) - if not (transformer_patchsize == 1): - print("Error: transformer patchsize error. Parameter transformer_patchsizeshould set to 1" ) + if (input_width != (num_patches_x * transformer_patchsize_x * 32) ): + print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x * 32)") sys.exit(1) - model = vit_resnet50_unet(n_classes, transformer_patchsize, num_patches, input_height, input_width, task, weight_decay, pretraining) + if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: + print("Error: transformer_projection_dim error. The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") + sys.exit(1) + + model = vit_resnet50_unet(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) #if you want to see the model structure just uncomment model summary. 
#model.summary() From 743f2e97d65b8009991d799be947db825b6f6d3e Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 12 Jun 2024 17:39:57 +0200 Subject: [PATCH 34/62] Transformer+CNN structure is added to vision transformer type --- config_params.json | 16 +++-- models.py | 142 +++++++++++++++++++++++++++++++++++++++++---- train.py | 57 +++++++++++------- 3 files changed, 176 insertions(+), 39 deletions(-) diff --git a/config_params.json b/config_params.json index 6b8b6ed..d72530e 100644 --- a/config_params.json +++ b/config_params.json @@ -2,9 +2,9 @@ "backbone_type" : "transformer", "task": "binarization", "n_classes" : 2, - "n_epochs" : 1, + "n_epochs" : 2, "input_height" : 224, - "input_width" : 672, + "input_width" : 224, "weight_decay" : 1e-6, "n_batch" : 1, "learning_rate": 1e-4, @@ -22,10 +22,14 @@ "scaling_flip" : false, "rotation": false, "rotation_not_90": false, - "transformer_num_patches_xy": [7, 7], - "transformer_patchsize_x": 3, - "transformer_patchsize_y": 1, - "transformer_projection_dim": 192, + "transformer_num_patches_xy": [56, 56], + "transformer_patchsize_x": 4, + "transformer_patchsize_y": 4, + "transformer_projection_dim": 64, + "transformer_mlp_head_units": [128, 64], + "transformer_layers": 1, + "transformer_num_heads": 1, + "transformer_cnn_first": false, "blur_k" : ["blur","guass","median"], "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], "brightness" : [1.3, 1.5, 1.7, 2], diff --git a/models.py b/models.py index 1abf304..8841bd3 100644 --- a/models.py +++ b/models.py @@ -5,10 +5,10 @@ from tensorflow.keras.layers import * from tensorflow.keras import layers from tensorflow.keras.regularizers import l2 -mlp_head_units = [2048, 1024] -#projection_dim = 64 -transformer_layers = 8 -num_heads = 4 +##mlp_head_units = [512, 256]#[2048, 1024] +###projection_dim = 64 +##transformer_layers = 2#8 +##num_heads = 1#4 resnet50_Weights_path = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' IMAGE_ORDERING = 'channels_last' MERGE_AXIS = -1 @@ -36,7 +36,8 @@ class Patches(layers.Layer): rates=[1, 1, 1, 1], padding="VALID", ) - patch_dims = patches.shape[-1] + #patch_dims = patches.shape[-1] + patch_dims = tf.shape(patches)[-1] patches = tf.reshape(patches, [batch_size, -1, patch_dims]) return patches def get_config(self): @@ -393,13 +394,13 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return model -def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): +def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=[128, 64], transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): inputs = layers.Input(shape=(input_height, input_width, 3)) - transformer_units = [ - projection_dim * 2, - projection_dim, - ] # Size of the transformer layers + #transformer_units = [ + #projection_dim * 2, + #projection_dim, + #] # Size of the transformer layers IMAGE_ORDERING = 'channels_last' bn_axis=3 @@ -459,7 +460,7 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, projec # Layer normalization 2. x3 = layers.LayerNormalization(epsilon=1e-6)(x2) # MLP. - x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) # Skip connection 2. 
encoded_patches = layers.Add()([x3, x2]) @@ -515,6 +516,125 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, projec return model +def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=[128, 64], transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): + inputs = layers.Input(shape=(input_height, input_width, 3)) + + ##transformer_units = [ + ##projection_dim * 2, + ##projection_dim, + ##] # Size of the transformer layers + IMAGE_ORDERING = 'channels_last' + bn_axis=3 + + patches = Patches(patch_size_x, patch_size_y)(inputs) + # Encode patches. + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + encoded_patches = tf.reshape(encoded_patches, [-1, input_height, input_width , int( projection_dim / (patch_size_x * patch_size_y) )]) + + encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) + + x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(encoded_patches) + x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) + f1 = x + + x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) + x = Activation('relu')(x) + x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) + + x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) + x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') + x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') + f2 = one_side_pad(x) + + x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') + x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') + f3 = x + + x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') + f4 = x + + x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') + f5 = x + + if pretraining: + model = Model(encoded_patches, x).load_weights(resnet50_Weights_path) + + v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(x) + v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) + v1024_2048 = Activation('relu')(v1024_2048) + + o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) + o = (concatenate([o, 
f4],axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o ,f3], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, f2], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, f1], axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) + o = (concatenate([o, inputs],axis=MERGE_AXIS)) + o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) + o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) + o = (BatchNormalization(axis=bn_axis))(o) + o = Activation('relu')(o) + + o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) + if task == "segmentation": + o = (BatchNormalization(axis=bn_axis))(o) + o = (Activation('softmax'))(o) + else: + o = (Activation('sigmoid'))(o) + + model = Model(inputs=inputs, outputs=o) + + return model + def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): include_top=True assert input_height%32 == 0 diff --git a/train.py b/train.py index bafcc9e..71f31f3 100644 --- a/train.py +++ b/train.py @@ -70,10 +70,14 @@ def config_params(): brightness = None # Brighten image for augmentation. flip_index = None # Flip image for augmentation. continue_training = False # Set to true if you would like to continue training an already trained a model. - transformer_patchsize_x = None # Patch size of vision transformer patches. - transformer_patchsize_y = None - transformer_num_patches_xy = None # Number of patches for vision transformer. - transformer_projection_dim = 64 # Transformer projection dimension + transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. + transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. + transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. + transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. + transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] + transformer_layers = 8 # transformer layers. Default value is 8. + transformer_num_heads = 4 # Transformer number of heads. Default value is 4. + transformer_cnn_first = True # We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. 
In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. @@ -94,7 +98,9 @@ def run(_config, n_classes, n_epochs, input_height, brightening, binarization, blur_k, scales, degrade_scales, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, - thetha, scaling_flip, continue_training, transformer_projection_dim, transformer_patchsize_x, transformer_patchsize_y, + thetha, scaling_flip, continue_training, transformer_projection_dim, + transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, + transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): @@ -218,26 +224,33 @@ def run(_config, n_classes, n_epochs, input_height, num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y - ##if not (num_patches == (input_width / 32) * (input_height / 32)): - ##print("Error: transformer num patches error. Parameter transformer_num_patches_xy should be set to (input_width/32) = {} and (input_height/32) = {}".format(int(input_width / 32), int(input_height / 32)) ) - ##sys.exit(1) - #if not (transformer_patchsize == 1): - #print("Error: transformer patchsize error. Parameter transformer_patchsizeshould set to 1" ) - #sys.exit(1) - if (input_height != (num_patches_y * transformer_patchsize_y * 32) ): - print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y * 32)") - sys.exit(1) - if (input_width != (num_patches_x * transformer_patchsize_x * 32) ): - print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x * 32)") - sys.exit(1) - if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: - print("Error: transformer_projection_dim error. The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") - sys.exit(1) + if transformer_cnn_first: + if (input_height != (num_patches_y * transformer_patchsize_y * 32) ): + print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y * 32)") + sys.exit(1) + if (input_width != (num_patches_x * transformer_patchsize_x * 32) ): + print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x * 32)") + sys.exit(1) + if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: + print("Error: transformer_projection_dim error. 
The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") + sys.exit(1) + - model = vit_resnet50_unet(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + model = vit_resnet50_unet(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + else: + if (input_height != (num_patches_y * transformer_patchsize_y) ): + print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y)") + sys.exit(1) + if (input_width != (num_patches_x * transformer_patchsize_x) ): + print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x)") + sys.exit(1) + if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: + print("Error: transformer_projection_dim error. The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") + sys.exit(1) + model = vit_resnet50_unet_transformer_before_cnn(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) #if you want to see the model structure just uncomment model summary. 
- #model.summary() + model.summary() if (task == "segmentation" or task == "binarization"): From 9358657a0d2e7c2a85345bb65ff9c549f8c0f190 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 12 Jun 2024 17:40:40 +0200 Subject: [PATCH 35/62] update config --- config_params.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config_params.json b/config_params.json index d72530e..a89cbb5 100644 --- a/config_params.json +++ b/config_params.json @@ -42,7 +42,7 @@ "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "/home/vahid/Documents/test/training_data_sample_binarization", - "dir_eval": "/home/vahid/Documents/test/eval", - "dir_output": "/home/vahid/Documents/test/out" + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" } From 033cf6734b649aae71bc578b979ac3efc82371e7 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 21 Jun 2024 13:06:26 +0200 Subject: [PATCH 36/62] update reading order machine based --- generate_gt_for_training.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/generate_gt_for_training.py b/generate_gt_for_training.py index 752090c..cfcc151 100644 --- a/generate_gt_for_training.py +++ b/generate_gt_for_training.py @@ -163,8 +163,7 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i #print('########################') xml_file = os.path.join(dir_xml,ind_xml ) f_name = ind_xml.split('.')[0] - file_name, id_paragraph, id_header,co_text_paragraph,\ - co_text_header,tot_region_ref,x_len, y_len,index_tot_regions,img_poly = read_xml(xml_file) + _, _, _, file_name, id_paragraph, id_header,co_text_paragraph,co_text_header,tot_region_ref,x_len, y_len,index_tot_regions,img_poly = read_xml(xml_file) id_all_text = id_paragraph + id_header co_text_all = co_text_paragraph + co_text_header From c0faecec2c39440862d878fab3fa044621434ca6 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 21 Jun 2024 23:42:25 +0200 Subject: [PATCH 37/62] update inference --- inference.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inference.py b/inference.py index c7a8b02..3fec9c2 100644 --- a/inference.py +++ b/inference.py @@ -557,6 +557,10 @@ class sbb_predict: res=self.predict() if (self.task == 'classification' or self.task == 'reading_order'): pass + elif self.task == 'enhancement': + if self.save: + print(self.save) + cv2.imwrite(self.save,res) else: img_seg_overlayed = self.visualize_model_output(res, self.img_org, self.task) if self.save: From 647a3f8cc4ade0b0ea982a0fee7451bd7550e6c7 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 9 Jul 2024 03:04:29 +0200 Subject: [PATCH 38/62] resolving typo --- gt_gen_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index c2360fc..c264f4c 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -304,8 +304,9 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if config_file and config_params['use_case']=='layout': keys = list(config_params.keys()) - if "artificial_class_on_boundry" in keys: - elements_with_artificial_class = list(config_params['artificial_class_on_boundry']) + + if "artificial_class_on_boundary" in keys: + elements_with_artificial_class = list(config_params['artificial_class_on_boundary']) artificial_class_rgb_color = (255,255,0) artificial_class_label = config_params['artificial_class_label'] #values = config_params.values() @@ -567,8 +568,8 @@ def 
get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ elif vv.tag!=link+'Point' and sumi>=1: break co_noise.append(np.array(c_t_in)) - - if "artificial_class_on_boundry" in keys: + + if "artificial_class_on_boundary" in keys: img_boundary = np.zeros( (y_len,x_len) ) if "paragraph" in elements_with_artificial_class: erosion_rate = 2 @@ -655,7 +656,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) - if "artificial_class_on_boundry" in keys: + if "artificial_class_on_boundary" in keys: img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] @@ -706,7 +707,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ color_label = config_params['textregions'][element_text] img_poly=cv2.fillPoly(img, pts =co_text[element_text], color=color_label) - if "artificial_class_on_boundry" in keys: + if "artificial_class_on_boundary" in keys: img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label From 55f3cb9a84ffd323b0a3eb150d40535bd64ecbba Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 16 Jul 2024 18:29:27 +0200 Subject: [PATCH 39/62] printspace_as_class_in_layout is integrated. Printspace can be defined as a class for layout segmentation --- gt_gen_utils.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index c264f4c..1df7b2a 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -154,7 +154,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ x_new = columns_width_dict[str(num_col)] y_new = int ( x_new * (y_len / float(x_len)) ) - if printspace: + if printspace or "printspace_as_class_in_layout" in list(config_params.keys()): region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')]) co_use_case = [] @@ -279,6 +279,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace and config_params['use_case']!='printspace': img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': img_poly = resize_image(img_poly, y_new, x_new) @@ -310,6 +311,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ artificial_class_rgb_color = (255,255,0) artificial_class_label = config_params['artificial_class_label'] #values = config_params.values() + + if "printspace_as_class_in_layout" in list(config_params.keys()): + printspace_class_rgb_color = (125,125,255) + printspace_class_label = config_params['printspace_as_class_in_layout'] if 'textregions' in keys: types_text_dict = config_params['textregions'] @@ -614,7 +619,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ - img = np.zeros( (y_len,x_len,3) ) + img = np.zeros( (y_len,x_len,3) ) if output_type == '3d': if 'graphicregions' in keys: @@ -661,6 +666,15 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + + if "printspace_as_class_in_layout" in list(config_params.keys()): + 
printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) + printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + + img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_rgb_color[0] + img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_rgb_color[1] + img_poly[:,:,2][printspace_mask[:,:] == 0] = printspace_class_rgb_color[2] + @@ -709,6 +723,14 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_on_boundary" in keys: img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + + if "printspace_as_class_in_layout" in list(config_params.keys()): + printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) + printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + + img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_label + img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_label + img_poly[:,:,2][printspace_mask[:,:] == 0] = printspace_class_label From 9521768774f89aa1e3e47f50be324214fe83f348 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 17 Jul 2024 17:14:20 +0200 Subject: [PATCH 40/62] adding degrading and brightness augmentation to no patches case training --- utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils.py b/utils.py index 605d8d1..7a2274c 100644 --- a/utils.py +++ b/utils.py @@ -597,6 +597,14 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 + if brightening: + for factor in brightness: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_brightening(dir_img + '/' +im, factor), input_height, input_width))) + + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + indexer += 1 if binarization: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', @@ -606,6 +614,15 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 + if degrading: + for degrade_scale_ind in degrade_scales: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), input_height, input_width))) + + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + indexer += 1 + if patches: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, From f2692cf8dd937654bdb77c7fdb9fc53eb23f5846 Mon Sep 17 00:00:00 2001 From: b-vr103 Date: Wed, 17 Jul 2024 18:20:24 +0200 Subject: [PATCH 41/62] brightness augmentation modified --- utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/utils.py b/utils.py index 7a2274c..891ee15 100644 --- a/utils.py +++ b/utils.py @@ -599,12 +599,15 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow indexer += 1 if brightening: for factor in brightness: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(do_brightening(dir_img + '/' +im, factor), input_height, input_width))) + try: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + 
(resize_image(do_brightening(dir_img + '/' +im, factor), input_height, input_width))) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + indexer += 1 + except: + pass if binarization: cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', From c340fbb721d3756e057260bd7f3ccec8bd0bca64 Mon Sep 17 00:00:00 2001 From: b-vr103 Date: Tue, 23 Jul 2024 11:29:05 +0200 Subject: [PATCH 42/62] increasing margin in the case of pixelwise inference --- inference.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inference.py b/inference.py index 3fec9c2..49bebf8 100644 --- a/inference.py +++ b/inference.py @@ -219,7 +219,7 @@ class sbb_predict: added_image = cv2.addWeighted(img,0.5,output,0.1,0) - return added_image + return added_image, output def predict(self): self.start_new_session_and_model() @@ -444,7 +444,7 @@ class sbb_predict: if img.shape[1] < self.img_width: img = cv2.resize(img, (self.img_height, img.shape[0]), interpolation=cv2.INTER_NEAREST) - margin = int(0 * self.img_width) + margin = int(0.1 * self.img_width) width_mid = self.img_width - 2 * margin height_mid = self.img_height - 2 * margin img = img / float(255.0) @@ -562,9 +562,10 @@ class sbb_predict: print(self.save) cv2.imwrite(self.save,res) else: - img_seg_overlayed = self.visualize_model_output(res, self.img_org, self.task) + img_seg_overlayed, only_prediction = self.visualize_model_output(res, self.img_org, self.task) if self.save: cv2.imwrite(self.save,img_seg_overlayed) + cv2.imwrite('./layout.png', only_prediction) if self.ground_truth: gt_img=cv2.imread(self.ground_truth) From 30894ddc7528a42f4dee44204443a0955da0a6ff Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 24 Jul 2024 16:52:05 +0200 Subject: [PATCH 43/62] erosion and dilation parameters are changed & separators are written in label images after artificial label --- gt_gen_utils.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 1df7b2a..253c44a 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -577,8 +577,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_on_boundary" in keys: img_boundary = np.zeros( (y_len,x_len) ) if "paragraph" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 + erosion_rate = 0#2 + dilation_rate = 3#4 co_text['paragraph'], img_boundary = update_region_contours(co_text['paragraph'], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "drop-capital" in elements_with_artificial_class: erosion_rate = 0 @@ -586,35 +586,35 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ co_text["drop-capital"], img_boundary = update_region_contours(co_text["drop-capital"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "catch-word" in elements_with_artificial_class: erosion_rate = 0 - dilation_rate = 4 + dilation_rate = 2#4 co_text["catch-word"], img_boundary = update_region_contours(co_text["catch-word"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "page-number" in elements_with_artificial_class: erosion_rate = 0 - dilation_rate = 4 + dilation_rate = 2#4 co_text["page-number"], img_boundary = 
update_region_contours(co_text["page-number"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "header" in elements_with_artificial_class: - erosion_rate = 1 - dilation_rate = 4 + erosion_rate = 0#1 + dilation_rate = 3#4 co_text["header"], img_boundary = update_region_contours(co_text["header"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "heading" in elements_with_artificial_class: - erosion_rate = 1 - dilation_rate = 4 + erosion_rate = 0#1 + dilation_rate = 3#4 co_text["heading"], img_boundary = update_region_contours(co_text["heading"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "signature-mark" in elements_with_artificial_class: erosion_rate = 1 dilation_rate = 4 co_text["signature-mark"], img_boundary = update_region_contours(co_text["signature-mark"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "marginalia" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 + erosion_rate = 0#2 + dilation_rate = 3#4 co_text["marginalia"], img_boundary = update_region_contours(co_text["marginalia"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "footnote" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 + erosion_rate = 0#2 + dilation_rate = 2#4 co_text["footnote"], img_boundary = update_region_contours(co_text["footnote"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "footnote-continued" in elements_with_artificial_class: - erosion_rate = 2 - dilation_rate = 4 + erosion_rate = 0#2 + dilation_rate = 2#4 co_text["footnote-continued"], img_boundary = update_region_contours(co_text["footnote-continued"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) @@ -639,8 +639,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if 'imageregion' in keys: img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']]) - if 'separatorregion' in keys: - img_poly=cv2.fillPoly(img, pts =co_sep, color=labels_rgb_color[ config_params['separatorregion']]) if 'tableregion' in keys: img_poly=cv2.fillPoly(img, pts =co_table, color=labels_rgb_color[ config_params['tableregion']]) if 'noiseregion' in keys: @@ -666,6 +664,9 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + if 'separatorregion' in keys: + img_poly=cv2.fillPoly(img, pts =co_sep, color=labels_rgb_color[ config_params['separatorregion']]) + if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) @@ -697,9 +698,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if 'imageregion' in keys: color_label = config_params['imageregion'] img_poly=cv2.fillPoly(img, pts =co_img, color=(color_label,color_label,color_label)) - if 'separatorregion' in keys: - color_label = config_params['separatorregion'] - img_poly=cv2.fillPoly(img, pts =co_sep, color=(color_label,color_label,color_label)) if 'tableregion' in keys: color_label = config_params['tableregion'] img_poly=cv2.fillPoly(img, pts =co_table, color=(color_label,color_label,color_label)) @@ -724,6 +722,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_on_boundary" in keys: img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + if 'separatorregion' in keys: + 
color_label = config_params['separatorregion'] + img_poly=cv2.fillPoly(img, pts =co_sep, color=(color_label,color_label,color_label)) + if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 From 59e5892f25684d4ec8ef7f489481f8c5f4e0f77e Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 1 Aug 2024 14:30:51 +0200 Subject: [PATCH 44/62] erosion rate changed --- gt_gen_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 253c44a..13010bf 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -577,36 +577,36 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_on_boundary" in keys: img_boundary = np.zeros( (y_len,x_len) ) if "paragraph" in elements_with_artificial_class: - erosion_rate = 0#2 - dilation_rate = 3#4 + erosion_rate = 2 + dilation_rate = 4 co_text['paragraph'], img_boundary = update_region_contours(co_text['paragraph'], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "drop-capital" in elements_with_artificial_class: - erosion_rate = 0 - dilation_rate = 4 + erosion_rate = 1 + dilation_rate = 3 co_text["drop-capital"], img_boundary = update_region_contours(co_text["drop-capital"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "catch-word" in elements_with_artificial_class: erosion_rate = 0 - dilation_rate = 2#4 + dilation_rate = 3#4 co_text["catch-word"], img_boundary = update_region_contours(co_text["catch-word"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "page-number" in elements_with_artificial_class: erosion_rate = 0 - dilation_rate = 2#4 + dilation_rate = 3#4 co_text["page-number"], img_boundary = update_region_contours(co_text["page-number"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "header" in elements_with_artificial_class: - erosion_rate = 0#1 - dilation_rate = 3#4 + erosion_rate = 1 + dilation_rate = 4 co_text["header"], img_boundary = update_region_contours(co_text["header"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "heading" in elements_with_artificial_class: - erosion_rate = 0#1 - dilation_rate = 3#4 + erosion_rate = 1 + dilation_rate = 4 co_text["heading"], img_boundary = update_region_contours(co_text["heading"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "signature-mark" in elements_with_artificial_class: erosion_rate = 1 dilation_rate = 4 co_text["signature-mark"], img_boundary = update_region_contours(co_text["signature-mark"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "marginalia" in elements_with_artificial_class: - erosion_rate = 0#2 - dilation_rate = 3#4 + erosion_rate = 2 + dilation_rate = 4 co_text["marginalia"], img_boundary = update_region_contours(co_text["marginalia"], img_boundary, erosion_rate, dilation_rate, y_len, x_len ) if "footnote" in elements_with_artificial_class: erosion_rate = 0#2 From b6bdf942fdb7d4902edda52e58ca788efee48ae2 Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:35:06 +0200 Subject: [PATCH 45/62] add documentation from wiki as markdown file to the codebase --- train.md | 576 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 576 insertions(+) create mode 100644 train.md diff --git a/train.md b/train.md new file mode 
100644 index 0000000..553522b --- /dev/null +++ b/train.md @@ -0,0 +1,576 @@ +# Documentation for Training Models + +This repository assists users in preparing training datasets, training models, and performing inference with trained models. We cover various use cases including pixel-wise segmentation, image classification, image enhancement, and machine-based reading order. For each use case, we provide guidance on how to generate the corresponding training dataset. +All these use cases are now utilized in the Eynollah workflow. +As mentioned, the following three tasks can be accomplished using this repository: + +* Generate training dataset +* Train a model +* Inference with the trained model + +## Generate training dataset +The script generate_gt_for_training.py is used for generating training datasets. As the results of the following command demonstrate, the dataset generator provides three different commands: + +`python generate_gt_for_training.py --help` + + +These three commands are: + +* image-enhancement +* machine-based-reading-order +* pagexml2label + + +### image-enhancement + +Generating a training dataset for image enhancement is quite straightforward. All that is needed is a set of high-resolution images. The training dataset can then be generated using the following command: + +`python generate_gt_for_training.py image-enhancement -dis "dir of high resolution images" -dois "dir where degraded images will be written" -dols "dir where the corresponding high resolution image will be written as label" -scs "degrading scales json file"` + +The scales JSON file is a dictionary with a key named 'scales' and values representing scales smaller than 1. Images are downscaled based on these scales and then upscaled again to their original size. This process causes the images to lose resolution at different scales. The degraded images are used as input images, and the original high-resolution images serve as labels. The enhancement model can be trained with this generated dataset. The scales JSON file looks like this: + +```yaml +{ + "scales": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9] +} +``` + +### machine-based-reading-order + +For machine-based reading order, we aim to determine the reading priority between two sets of text regions. The model's input is a three-channel image: the first and last channels contain information about each of the two text regions, while the middle channel encodes prominent layout elements necessary for reading order, such as separators and headers. To generate the training dataset, our script requires a page XML file that specifies the image layout with the correct reading order. + +For output images, it is necessary to specify the width and height. Additionally, a minimum text region size can be set to filter out regions smaller than this minimum size. This minimum size is defined as the ratio of the text region area to the image area, with a default value of zero. To run the dataset generator, use the following command: + + +`python generate_gt_for_training.py machine-based-reading-order -dx "dir of GT xml files" -domi "dir where output images will be written" -docl "dir where the labels will be written" -ih "height" -iw "width" -min "min area ratio"` + +### pagexml2label + +pagexml2label is designed to generate labels from GT page XML files for various pixel-wise segmentation use cases, including 'layout,' 'textline,' 'printspace,' 'glyph,' and 'word' segmentation. 
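Once label PNGs have been generated, it can be useful to verify which class ids actually ended up in them. A minimal sketch, assuming OpenCV and NumPy are installed and using a hypothetical output path (the label format itself is described next):

```python
import cv2
import numpy as np

# Read a generated 2D label image; every pixel holds a small integer class id, 0 = background.
label = cv2.imread("dir_out_labels/page_0001.png", cv2.IMREAD_GRAYSCALE)
print(np.unique(label))  # e.g. [0 1 2 3] for background plus three annotated classes
```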
+To train a pixel-wise segmentation model, we require images along with their corresponding labels. Our training script expects a PNG image where each pixel corresponds to a label, represented by an integer. The background is always labeled as zero, while other elements are assigned different integers. For instance, if we have ground truth data with four elements including the background, the classes would be labeled as 0, 1, 2, and 3 respectively. + +In binary segmentation scenarios such as textline or page extraction, the background is encoded as 0, and the desired element is automatically encoded as 1 in the PNG label. + +To specify the desired use case and the elements to be extracted in the PNG labels, a custom JSON file can be passed. For example, in the case of 'textline' detection, the JSON file would resemble this: + +```yaml +{ +"use_case": "textline" +} +``` + +In the case of layout segmentation a possible custom config json file can be like this: + +```yaml +{ +"use_case": "layout", +"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, +"imageregion":4, +"separatorregion":5, +"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} +} +``` + +A possible custom config json file for layout segmentation where the "printspace" is wished to be a class: + +```yaml +{ +"use_case": "layout", +"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, +"imageregion":4, +"separatorregion":5, +"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} +"printspace_as_class_in_layout" : 8 +} +``` +For the layout use case, it is beneficial to first understand the structure of the page XML file and its elements. In a given image, the annotations of elements are recorded in a page XML file, including their contours and classes. For an image document, the known regions are 'textregion', 'separatorregion', 'imageregion', 'graphicregion', 'noiseregion', and 'tableregion'. + +Text regions and graphic regions also have their own specific types. The known types for us for text regions are 'paragraph', 'header', 'heading', 'marginalia', 'drop-capital', 'footnote', 'footnote-continued', 'signature-mark', 'page-number', and 'catch-word'. The known types for graphic regions are 'handwritten-annotation', 'decoration', 'stamp', and 'signature'. +Since we don't know all types of text and graphic regions, unknown cases can arise. To handle these, we have defined two additional types: "rest_as_paragraph" and "rest_as_decoration" to ensure that no unknown types are missed. This way, users can extract all known types from the labels and be confident that no unknown types are overlooked. + +In the custom JSON file shown above, "header" and "heading" are extracted as the same class, while "marginalia" is shown as a different class. All other text region types, including "drop-capital," are grouped into the same class. For the graphic region, "stamp" has its own class, while all other types are classified together. "Image region" and "separator region" are also present in the label. However, other regions like "noise region" and "table region" will not be included in the label PNG file, even if they have information in the page XML files, as we chose not to include them. + +`python generate_gt_for_training.py pagexml2label -dx "dir of GT xml files" -do "dir where output label png files will be written" -cfg "custom config json file" -to "output type which has 2d and 3d. 
2d is used for training and 3d is just to visualise the labels"` + +We have also defined an artificial class that can be added to the boundary of text region types or text lines. This key is called "artificial_class_on_boundary." If users want to apply this to certain text regions in the layout use case, the example JSON config file should look like this: + +```yaml +{ + "use_case": "layout", + "textregions": { + "paragraph": 1, + "drop-capital": 1, + "header": 2, + "heading": 2, + "marginalia": 3 + }, + "imageregion": 4, + "separatorregion": 5, + "graphicregions": { + "rest_as_decoration": 6 + }, + "artificial_class_on_boundary": ["paragraph", "header", "heading", "marginalia"], + "artificial_class_label": 7 +} +``` + +This implies that the artificial class label, denoted by 7, will be present in the PNG files and will only be added to the elements labeled as "paragraph," "header," "heading," and "marginalia." + +For "textline," "word," and "glyph," the artificial class on the boundaries will be activated only if the "artificial_class_label" key is specified in the config file. Its value should be set to 2 since these elements represent binary cases. For example, if the background and textline are denoted as 0 and 1 respectively, then the artificial class should be assigned the value 2. The example JSON config file should look like this for the "textline" use case: + +```yaml +{ + "use_case": "textline", + "artificial_class_label": 2 +} +``` + +If the coordinates of "PrintSpace" or "Border" are present in the page XML ground truth files, and the user wishes to crop only the print space area, this can be achieved by activating the "-ps" argument. However, it should be noted that in this scenario, since cropping will be applied to the label files, the directory of the original images must be provided to ensure that they are cropped in sync with the labels. This ensures that the correct images and labels required for training are obtained. The command should resemble the following: + +`python generate_gt_for_training.py pagexml2label -dx "dir of GT xml files" -do "dir where output label png files will be written" -cfg "custom config json file" -to "output type which has 2d and 3d. 2d is used for training and 3d is just to visualise the labels" -ps -di "dir where the original images are located" -doi "dir where the cropped output images will be written" ` + +## Train a model +### classification + +For the classification use case, we haven't provided a ground truth generator, as it's unnecessary. For classification, all we require is a training directory with subdirectories, each containing images of its respective classes. We need separate directories for training and evaluation, and the class names (subdirectories) must be consistent across both directories. Additionally, the class names should be specified in the config JSON file, as shown in the following example.
If, for instance, we aim to classify "apple" and "orange," with a total of 2 classes, the "classification_classes_name" key in the config file should appear as follows: + +```yaml +{ + "backbone_type" : "nontransformer", + "task": "classification", + "n_classes" : 2, + "n_epochs" : 10, + "input_height" : 448, + "input_width" : 448, + "weight_decay" : 1e-6, + "n_batch" : 4, + "learning_rate": 1e-4, + "f1_threshold_classification": 0.8, + "pretraining" : true, + "classification_classes_name" : {"0":"apple", "1":"orange"}, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +The "dir_train" should be like this: + +``` +. +└── train # train directory + ├── apple # directory of images for apple class + └── orange # directory of images for orange class +``` + +And "dir_eval" should have the same structure as the train directory: + +``` +. +└── eval # evaluation directory + ├── apple # directory of images for apple class + └── orange # directory of images for orange class + +``` + +The classification model can be trained using the following command line: + +`python train.py with config_classification.json` + + +As evident in the example JSON file above, for classification, we utilize an "f1_threshold_classification" parameter. This parameter is employed to gather all models with an evaluation f1 score surpassing this threshold. Subsequently, an ensemble of these model weights is executed, and a model is saved in the output directory as "model_ens_avg". Additionally, the weight of the best model based on the evaluation f1 score is saved as "model_best". + +### reading order +An example config JSON file for machine-based reading order looks like this: + +```yaml +{ + "backbone_type" : "nontransformer", + "task": "reading_order", + "n_classes" : 1, + "n_epochs" : 5, + "input_height" : 672, + "input_width" : 448, + "weight_decay" : 1e-6, + "n_batch" : 4, + "learning_rate": 1e-4, + "pretraining" : true, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +The "dir_train" should be like this: + +``` +. +└── train # train directory + ├── images # directory of images + └── labels # directory of labels +``` + +And "dir_eval" should have the same structure as the train directory: + +``` +. +└── eval # evaluation directory + ├── images # directory of images + └── labels # directory of labels +``` + +The reading order model can be trained with the same command line as in the classification case. + +### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement + +#### Parameter configuration for segmentation or enhancement use cases + +The following parameter configuration can be applied to all segmentation use cases and enhancement. The augmentation, its sub-parameters, and continued training are defined only for segmentation use cases and enhancement, not for classification and machine-based reading order, as you can see in their example config files. + +* backbone_type: For segmentation tasks (such as text line, binarization, and layout detection) and enhancement, we offer two backbone options: a "nontransformer" and a "transformer" backbone. For the "transformer" backbone, we first apply a CNN followed by a transformer. In contrast, the "nontransformer" backbone utilizes only a ResNet-50 CNN. +* task: The task parameter can have values such as "segmentation", "enhancement", "classification", and "reading_order".
+* patches: If you want to break input images into smaller patches (the input size of the model) you need to set this parameter to ``true``. In the case that the model should see the image once, like page extraction, patches should be set to ``false``. +* n_batch: Number of batches at each iteration. +* n_classes: Number of classes. In the case of binary classification this should be 2. In the case of reading_order it should be set to 1. For layout detection, the number of unique classes should be given. +* n_epochs: Number of epochs. +* input_height: This indicates the height of the model's input. +* input_width: This indicates the width of the model's input. +* weight_decay: Weight decay of the l2 regularization of model layers. +* pretraining: Set to ``true`` to load the pretrained weights of the ResNet50 encoder. The downloaded weights should be saved in a folder named "pretrained_model" in the same directory as the "train.py" script. +* augmentation: If you want to apply any kind of augmentation this parameter must first be set to ``true``. +* flip_aug: If ``true``, different types of flips will be applied to the image. The types of flips are given with the "flip_index" parameter. +* blur_aug: If ``true``, different types of blurring will be applied to the image. The types of blurring are given with the "blur_k" parameter. +* scaling: If ``true``, scaling will be applied to the image. The scale factors are given with the "scales" parameter. +* degrading: If ``true``, degrading will be applied to the image. The amount of degrading is defined with the "degrade_scales" parameter. +* brightening: If ``true``, brightening will be applied to the image. The amount of brightening is defined with the "brightness" parameter. +* rotation_not_90: If ``true``, rotation (not 90 degrees) will be applied to the image. Rotation angles are given with the "thetha" parameter. +* rotation: If ``true``, a 90 degree rotation will be applied to the image. +* binarization: If ``true``, Otsu thresholding will be applied to augment the input data with binarized images. +* scaling_bluring: If ``true``, a combination of scaling and blurring will be applied to the image. +* scaling_binarization: If ``true``, a combination of scaling and binarization will be applied to the image. +* scaling_flip: If ``true``, a combination of scaling and flipping will be applied to the image. +* flip_index: Types of flips. +* blur_k: Types of blurring. +* scales: Scale factors for scaling. +* brightness: The amounts of brightening. +* thetha: Rotation angles. +* degrade_scales: The amounts of degrading. +* continue_training: If ``true``, it means that you have already trained a model and you would like to continue the training. In that case, you need to provide the directory of the trained model with "dir_of_start_model" and an index for naming the models. For example, if you have already trained for 3 epochs then your last index is 2, and if you want to continue from model_1.h5, you can set ``index_start`` to 3 to start naming the models from index 3. +* weighted_loss: If ``true``, this means that you want to apply weighted categorical_crossentropy as the loss function. Be careful: if you set this to ``true``, the parameter "is_loss_soft_dice" should be ``false``. +* data_is_provided: If you have already provided the input data you can set this to ``true``. Be sure that the train and eval data are in "dir_output", since once the raw training data is provided, it is resized, augmented and then written to the train and eval sub-directories in "dir_output".
+* dir_train: This is the directory of raw images and labels; "dir_train" must include two subdirectories named "images" and "labels". These raw data are not yet prepared (not resized and not augmented) for training the model. When we run this tool, the raw data are transformed to the size needed by the model and written to the train and eval directories in "dir_output". Each of train and eval includes "images" and "labels" sub-directories. +* index_start: Starting index for saved models in the case that "continue_training" is ``true``. +* dir_of_start_model: Directory containing the pretrained model for continuing training in the case that "continue_training" is ``true``. +* transformer_num_patches_xy: Number of patches for the vision transformer in the x and y directions, respectively. +* transformer_patchsize_x: Patch size of vision transformer patches in x direction. +* transformer_patchsize_y: Patch size of vision transformer patches in y direction. +* transformer_projection_dim: Transformer projection dimension. Default value is 64. +* transformer_mlp_head_units: Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64]. +* transformer_layers: Number of transformer layers. Default value is 8. +* transformer_num_heads: Number of transformer attention heads. Default value is 4. +* transformer_cnn_first: We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. + +In the case of segmentation and enhancement, the train and evaluation directories should be structured as follows. + +The "dir_train" should be like this: + +``` +. +└── train # train directory + ├── images # directory of images + └── labels # directory of labels +``` + +And "dir_eval" should have the same structure as the train directory: + +``` +.
+└── eval # evaluation directory + ├── images # directory of images + └── labels # directory of labels +``` + +After configuring the JSON file for segmentation or enhancement, training can be initiated by running the following command, similar to the process for classification and reading order: + +`python train.py with config_classification.json` + +#### Binarization + +An example config json file for binarization can be like this: + +```yaml +{ + "backbone_type" : "transformer", + "task": "binarization", + "n_classes" : 2, + "n_epochs" : 4, + "input_height" : 224, + "input_width" : 672, + "weight_decay" : 1e-6, + "n_batch" : 1, + "learning_rate": 1e-4, + "patches" : true, + "pretraining" : true, + "augmentation" : true, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": false, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "transformer_num_patches_xy": [7, 7], + "transformer_patchsize_x": 3, + "transformer_patchsize_y": 1, + "transformer_projection_dim": 192, + "transformer_mlp_head_units": [128, 64], + "transformer_layers": 8, + "transformer_num_heads": 4, + "transformer_cnn_first": true, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": false, + "data_is_provided": false, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +#### Textline + +```yaml +{ + "backbone_type" : "nontransformer", + "task": "segmentation", + "n_classes" : 2, + "n_epochs" : 4, + "input_height" : 448, + "input_width" : 224, + "weight_decay" : 1e-6, + "n_batch" : 1, + "learning_rate": 1e-4, + "patches" : true, + "pretraining" : true, + "augmentation" : true, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": false, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": false, + "data_is_provided": false, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +#### Enhancement + +```yaml +{ + "backbone_type" : "nontransformer", + "task": "enhancement", + "n_classes" : 3, + "n_epochs" : 4, + "input_height" : 448, + "input_width" : 224, + "weight_decay" : 1e-6, + "n_batch" : 4, + "learning_rate": 1e-4, + "patches" : true, + "pretraining" : true, + "augmentation" : true, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": false, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + 
"flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": false, + "data_is_provided": false, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +It's important to mention that the value of n_classes for enhancement should be 3, as the model's output is a 3-channel image. + +#### Page extraction + +```yaml +{ + "backbone_type" : "nontransformer", + "task": "segmentation", + "n_classes" : 2, + "n_epochs" : 4, + "input_height" : 448, + "input_width" : 224, + "weight_decay" : 1e-6, + "n_batch" : 1, + "learning_rate": 1e-4, + "patches" : false, + "pretraining" : true, + "augmentation" : false, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": false, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": false, + "data_is_provided": false, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` + +For page segmentation (or print space or border segmentation), the model needs to view the input image in its entirety, hence the patches parameter should be set to false. + +#### layout segmentation + +An example config json file for layout segmentation with 5 classes (including background) can be like this: + +```yaml +{ + "backbone_type" : "transformer", + "task": "segmentation", + "n_classes" : 5, + "n_epochs" : 4, + "input_height" : 448, + "input_width" : 224, + "weight_decay" : 1e-6, + "n_batch" : 1, + "learning_rate": 1e-4, + "patches" : true, + "pretraining" : true, + "augmentation" : true, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": false, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "transformer_num_patches_xy": [7, 14], + "transformer_patchsize_x": 1, + "transformer_patchsize_y": 1, + "transformer_projection_dim": 64, + "transformer_mlp_head_units": [128, 64], + "transformer_layers": 8, + "transformer_num_heads": 4, + "transformer_cnn_first": true, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": false, + "data_is_provided": false, + "dir_train": "./train", + "dir_eval": "./eval", + "dir_output": "./output" +} +``` +## Inference with the trained model +### classification + +For conducting inference with a trained model, you simply need to execute the following command line, specifying the directory of the model and the image on which to perform inference: + + +`python inference.py -m "model dir" -i "image" ` + +This will straightforwardly return the class of the image. 
+ +### machine based reading order + + +To infer the reading order using an reading order model, we need a page XML file containing layout information but without the reading order. We simply need to provide the model directory, the XML file, and the output directory. The new XML file with the added reading order will be written to the output directory with the same name. We need to run: + +`python inference.py -m "model dir" -xml "page xml file" -o "output dir to write new xml with reading order" ` + + +### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement + +For conducting inference with a trained model for segmentation and enhancement you need to run the following command line: + + +`python inference.py -m "model dir" -i "image" -p -s "output image" ` + + +Note that in the case of page extraction the -p flag is not needed. + +For segmentation or binarization tasks, if a ground truth (GT) label is available, the IOU evaluation metric can be calculated for the output. To do this, you need to provide the GT label using the argument -gt. + + + From f4bad09083f83d59936d2362ce435222b2272029 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 9 Aug 2024 12:46:18 +0200 Subject: [PATCH 46/62] save only layout output. different from overlayed layout on image --- inference.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/inference.py b/inference.py index 49bebf8..6054b01 100644 --- a/inference.py +++ b/inference.py @@ -32,6 +32,7 @@ class sbb_predict: self.image=image self.patches=patches self.save=save + self.save_layout=save_layout self.model_dir=model self.ground_truth=ground_truth self.task=task @@ -181,6 +182,7 @@ class sbb_predict: prediction = prediction * -1 prediction = prediction + 1 added_image = prediction * 255 + layout_only = None else: unique_classes = np.unique(prediction[:,:,0]) rgb_colors = {'0' : [255, 255, 255], @@ -200,26 +202,26 @@ class sbb_predict: '14' : [255, 125, 125], '15' : [255, 0, 255]} - output = np.zeros(prediction.shape) + layout_only = np.zeros(prediction.shape) for unq_class in unique_classes: rgb_class_unique = rgb_colors[str(int(unq_class))] - output[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] - output[:,:,1][prediction[:,:,0]==unq_class] = rgb_class_unique[1] - output[:,:,2][prediction[:,:,0]==unq_class] = rgb_class_unique[2] + layout_only[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] + layout_only[:,:,1][prediction[:,:,0]==unq_class] = rgb_class_unique[1] + layout_only[:,:,2][prediction[:,:,0]==unq_class] = rgb_class_unique[2] - img = self.resize_image(img, output.shape[0], output.shape[1]) + img = self.resize_image(img, layout_only.shape[0], layout_only.shape[1]) - output = output.astype(np.int32) + layout_only = layout_only.astype(np.int32) img = img.astype(np.int32) - added_image = cv2.addWeighted(img,0.5,output,0.1,0) + added_image = cv2.addWeighted(img,0.5,layout_only,0.1,0) - return added_image, output + return added_image, layout_only def predict(self): self.start_new_session_and_model() @@ -559,13 +561,12 @@ class sbb_predict: pass elif self.task == 'enhancement': if self.save: - print(self.save) cv2.imwrite(self.save,res) else: - img_seg_overlayed, only_prediction = self.visualize_model_output(res, self.img_org, self.task) + img_seg_overlayed, only_layout = self.visualize_model_output(res, self.img_org, self.task) if self.save: cv2.imwrite(self.save,img_seg_overlayed) - cv2.imwrite('./layout.png', only_prediction) + 
cv2.imwrite(self.save_layout, only_layout) if self.ground_truth: gt_img=cv2.imread(self.ground_truth) @@ -595,6 +596,11 @@ class sbb_predict: "-s", help="save prediction as a png file in current folder.", ) +@click.option( + "--save_layout", + "-sl", + help="save layout prediction only as a png file in current folder.", +) @click.option( "--model", "-m", @@ -618,7 +624,7 @@ class sbb_predict: "-min", help="min area size of regions considered for reading order detection. The default value is zero and means that all text regions are considered for reading order.", ) -def main(image, model, patches, save, ground_truth, xml_file, out, min_area): +def main(image, model, patches, save, save_layout, ground_truth, xml_file, out, min_area): with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] @@ -626,7 +632,7 @@ def main(image, model, patches, save, ground_truth, xml_file, out, min_area): if not save: print("Error: You used one of segmentation or binarization task but not set -s, you need a filename to save visualized output with -s") sys.exit(1) - x=sbb_predict(image, model, task, config_params_model, patches, save, ground_truth, xml_file, out, min_area) + x=sbb_predict(image, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area) x.run() if __name__=="__main__": From 85dd59f23eeb17a6ab2bb9aa5164530cfeeeab56 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 9 Aug 2024 13:20:09 +0200 Subject: [PATCH 47/62] update --- inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference.py b/inference.py index 6054b01..8d0a572 100644 --- a/inference.py +++ b/inference.py @@ -28,7 +28,7 @@ Tool to load model and predict for given image. """ class sbb_predict: - def __init__(self,image, model, task, config_params_model, patches, save, ground_truth, xml_file, out, min_area): + def __init__(self,image, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area): self.image=image self.patches=patches self.save=save From 7be326d6897c81e516a181113ce0a01557140ad7 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 21 Aug 2024 00:48:30 +0200 Subject: [PATCH 48/62] augmentation function for red textlines, rgb background and scaling for no patch case --- utils.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/utils.py b/utils.py index 891ee15..2278849 100644 --- a/utils.py +++ b/utils.py @@ -12,6 +12,76 @@ from tensorflow.keras.utils import to_categorical from PIL import Image, ImageEnhance +def return_shuffled_channels(img, channels_order): + """ + channels order in ordinary case is like this [0, 1, 2]. In the case of shuffling the order should be provided. 
+ """ + img_sh = np.copy(img) + + img_sh[:,:,0]= img[:,:,channels_order[0]] + img_sh[:,:,1]= img[:,:,channels_order[1]] + img_sh[:,:,2]= img[:,:,channels_order[2]] + return img_sh + +def return_binary_image_with_red_textlines(img_bin): + img_red = np.copy(img_bin) + + img_red[:,:,0][img_bin[:,:,0] == 0] = 255 + return img_red + +def return_binary_image_with_given_rgb_background(img_bin, img_rgb_background): + img_rgb_background = resize_image(img_rgb_background ,img_bin.shape[0], img_bin.shape[1]) + + img_final = np.copy(img_bin) + + img_final[:,:,0][img_bin[:,:,0] != 0] = img_rgb_background[:,:,0][img_bin[:,:,0] != 0] + img_final[:,:,1][img_bin[:,:,1] != 0] = img_rgb_background[:,:,1][img_bin[:,:,1] != 0] + img_final[:,:,2][img_bin[:,:,2] != 0] = img_rgb_background[:,:,2][img_bin[:,:,2] != 0] + + return img_final + +def return_binary_image_with_given_rgb_background_red_textlines(img_bin, img_rgb_background, img_color): + img_rgb_background = resize_image(img_rgb_background ,img_bin.shape[0], img_bin.shape[1]) + + img_final = np.copy(img_color) + + img_final[:,:,0][img_bin[:,:,0] != 0] = img_rgb_background[:,:,0][img_bin[:,:,0] != 0] + img_final[:,:,1][img_bin[:,:,1] != 0] = img_rgb_background[:,:,1][img_bin[:,:,1] != 0] + img_final[:,:,2][img_bin[:,:,2] != 0] = img_rgb_background[:,:,2][img_bin[:,:,2] != 0] + + return img_final + +def scale_image_for_no_patch(img, label, scale): + h_n = int(img.shape[0]*scale) + w_n = int(img.shape[1]*scale) + + channel0_avg = int( np.mean(img[:,:,0]) ) + channel1_avg = int( np.mean(img[:,:,1]) ) + channel2_avg = int( np.mean(img[:,:,2]) ) + + h_diff = img.shape[0] - h_n + w_diff = img.shape[1] - w_n + + h_start = int(h_diff / 2.) + w_start = int(w_diff / 2.) + + img_res = resize_image(img, h_n, w_n) + label_res = resize_image(label, h_n, w_n) + + img_scaled_padded = np.copy(img) + + label_scaled_padded = np.zeros(label.shape) + + img_scaled_padded[:,:,0] = channel0_avg + img_scaled_padded[:,:,1] = channel1_avg + img_scaled_padded[:,:,2] = channel2_avg + + img_scaled_padded[h_start:h_start+h_n, w_start:w_start+w_n,:] = img_res[:,:,:] + label_scaled_padded[h_start:h_start+h_n, w_start:w_start+w_n,:] = label_res[:,:,:] + + return img_scaled_padded, label_scaled_padded + + def return_number_of_total_training_data(path_classes): sub_classes = os.listdir(path_classes) n_tot = 0 From 95bbdf804058c255944554e1ad2ba608a5929fd2 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 21 Aug 2024 16:17:59 +0200 Subject: [PATCH 49/62] updating augmentations --- train.py | 8 +++++--- utils.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 71f31f3..fa08a98 100644 --- a/train.py +++ b/train.py @@ -53,6 +53,7 @@ def config_params(): degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. + rgb_background = False dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". dir_output = None # Directory where the output model will be saved. 
@@ -95,7 +96,7 @@ def run(_config, n_classes, n_epochs, input_height, index_start, dir_of_start_model, is_loss_soft_dice, n_batch, patches, augmentation, flip_aug, blur_aug, padding_white, padding_black, scaling, degrading, - brightening, binarization, blur_k, scales, degrade_scales, + brightening, binarization, rgb_background, blur_k, scales, degrade_scales, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, continue_training, transformer_projection_dim, @@ -108,6 +109,7 @@ def run(_config, n_classes, n_epochs, input_height, if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') + dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') @@ -161,7 +163,7 @@ def run(_config, n_classes, n_epochs, input_height, # writing patches into a sub-folder in order to be flowed from directory. provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, + blur_aug, padding_white, padding_black, flip_aug, binarization, rgb_background, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, @@ -169,7 +171,7 @@ def run(_config, n_classes, n_epochs, input_height, provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, + blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, rgb_background, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches) diff --git a/utils.py b/utils.py index 2278849..cf7a65c 100644 --- a/utils.py +++ b/utils.py @@ -695,6 +695,47 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) indexer += 1 + + if rotation_not_90: + for thetha_i in thetha: + img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), + cv2.imread(dir_of_label_file), thetha_i) + + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_max_rotated, input_height, input_width)) + + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_max_rotated, input_height, input_width)) + indexer += 1 + + if channels_shuffling: + for shuffle_index in shuffle_indexes: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), input_height, input_width))) + + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + indexer += 1 + + if scaling: + for sc_ind in scales: + img_scaled, label_scaled = scale_image_for_no_patch(cv2.imread(dir_img + '/'+im), + cv2.imread(dir_of_label_file), sc_ind) + + 
cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_scaled, input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_scaled, input_height, input_width)) + indexer += 1 + + if rgb_color_background: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background) + + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + + if patches: From f31219b1c9027a55db5fee911b5465958092bfcb Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 21 Aug 2024 19:33:23 +0200 Subject: [PATCH 50/62] scaling, channels shuffling, rgb background and red content added to no patch augmentation --- config_params.json | 30 +++++++++++++++++++----------- train.py | 32 ++++++++++++++++++++++---------- utils.py | 32 +++++++++++++++++++++++++++----- 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/config_params.json b/config_params.json index a89cbb5..e5f652d 100644 --- a/config_params.json +++ b/config_params.json @@ -1,19 +1,22 @@ { "backbone_type" : "transformer", - "task": "binarization", + "task": "segmentation", "n_classes" : 2, - "n_epochs" : 2, - "input_height" : 224, - "input_width" : 224, + "n_epochs" : 0, + "input_height" : 448, + "input_width" : 448, "weight_decay" : 1e-6, "n_batch" : 1, "learning_rate": 1e-4, - "patches" : true, + "patches" : false, "pretraining" : true, - "augmentation" : false, + "augmentation" : true, "flip_aug" : false, "blur_aug" : false, "scaling" : true, + "adding_rgb_background": true, + "add_red_textlines": true, + "channels_shuffling": true, "degrading": false, "brightening": false, "binarization" : false, @@ -31,18 +34,23 @@ "transformer_num_heads": 1, "transformer_cnn_first": false, "blur_k" : ["blur","guass","median"], - "scales" : [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.4], + "scales" : [0.6, 0.7, 0.8, 0.9], "brightness" : [1.3, 1.5, 1.7, 2], "degrade_scales" : [0.2, 0.4], "flip_index" : [0, 1, -1], - "thetha" : [10, -10], + "shuffle_indexes" : [ [0,2,1], [1,2,0], [1,0,2] , [2,1,0]], + "thetha" : [5, -5], + "number_of_backgrounds_per_image": 2, "continue_training": false, "index_start" : 0, "dir_of_start_model" : " ", "weighted_loss": false, "is_loss_soft_dice": false, "data_is_provided": false, - "dir_train": "./train", - "dir_eval": "./eval", - "dir_output": "./output" + "dir_train": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/train_new", + "dir_eval": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/eval_new", + "dir_output": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/output_new", + "dir_rgb_backgrounds": "/home/vahid/Documents/1_2_test_eynollah/set_rgb_background", + "dir_img_bin": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/train_new/images_bin" + } diff --git a/train.py b/train.py index fa08a98..5dfad07 100644 --- a/train.py +++ b/train.py 
@@ -53,7 +53,9 @@ def config_params(): degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. - rgb_background = False + adding_rgb_background = False + add_red_textlines = False + channels_shuffling = False dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". dir_output = None # Directory where the output model will be saved. @@ -65,6 +67,7 @@ def config_params(): scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. thetha = None # Rotate image by these angles for augmentation. + shuffle_indexes = None blur_k = None # Blur image for augmentation. scales = None # Scale patches for augmentation. degrade_scales = None # Degrade image for augmentation. @@ -88,6 +91,10 @@ def config_params(): f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. classification_classes_name = None # Dictionary of classification classes names. backbone_type = None # As backbone we have 2 types of backbones. A vision transformer alongside a CNN and we call it "transformer" and only CNN called "nontransformer" + + dir_img_bin = None + number_of_backgrounds_per_image = 1 + dir_rgb_backgrounds = None @ex.automain @@ -95,15 +102,20 @@ def run(_config, n_classes, n_epochs, input_height, input_width, weight_decay, weighted_loss, index_start, dir_of_start_model, is_loss_soft_dice, n_batch, patches, augmentation, flip_aug, - blur_aug, padding_white, padding_black, scaling, degrading, - brightening, binarization, rgb_background, blur_k, scales, degrade_scales, + blur_aug, padding_white, padding_black, scaling, degrading,channels_shuffling, + brightening, binarization, adding_rgb_background, add_red_textlines, blur_k, scales, degrade_scales,shuffle_indexes, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, continue_training, transformer_projection_dim, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name): + pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds): + + if dir_rgb_backgrounds: + list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) + else: + list_all_possible_background_images = None if task == "segmentation" or task == "enhancement" or task == "binarization": if data_is_provided: @@ -163,18 +175,18 @@ def run(_config, n_classes, n_epochs, input_height, # writing patches into a sub-folder in order 
to be flowed from directory. provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, rgb_background, + blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background,add_red_textlines, channels_shuffling, scaling, degrading, brightening, scales, degrade_scales, brightness, - flip_index, scaling_bluring, scaling_brightness, scaling_binarization, + flip_index,shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, - patches=patches) + patches=patches, dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds) provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, rgb_background, + blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background, add_red_textlines, channels_shuffling, scaling, degrading, brightening, scales, degrade_scales, brightness, - flip_index, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches) + flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches,dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds) if weighted_loss: weights = np.zeros(n_classes) diff --git a/utils.py b/utils.py index cf7a65c..20fda29 100644 --- a/utils.py +++ b/utils.py @@ -51,6 +51,16 @@ def return_binary_image_with_given_rgb_background_red_textlines(img_bin, img_rgb return img_final +def return_image_with_red_elements(img, img_bin): + img_final = np.copy(img) + + img_final[:,:,0][img_bin[:,:,0]==0] = 0 + img_final[:,:,1][img_bin[:,:,0]==0] = 0 + img_final[:,:,2][img_bin[:,:,0]==0] = 255 + return img_final + + + def scale_image_for_no_patch(img, label, scale): h_n = int(img.shape[0]*scale) w_n = int(img.shape[1]*scale) @@ -631,10 +641,10 @@ def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, i def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow_train_imgs, dir_flow_train_labels, input_height, input_width, blur_k, blur_aug, - padding_white, padding_black, flip_aug, binarization, scaling, degrading, - brightening, scales, degrade_scales, brightness, flip_index, + padding_white, padding_black, flip_aug, binarization, adding_rgb_background, add_red_textlines, channels_shuffling, scaling, degrading, + brightening, scales, degrade_scales, brightness, flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False): + rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False, dir_img_bin=None,number_of_backgrounds_per_image=None,list_all_possible_background_images=None, dir_rgb_backgrounds=None): indexer = 0 for im, seg_i in tqdm(zip(imgs_list_train, 
segs_list_train)): @@ -724,17 +734,29 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_scaled, input_height, input_width)) indexer += 1 - if rgb_color_background: + if adding_rgb_background: img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') for i_n in range(number_of_backgrounds_per_image): background_image_chosen_name = random.choice(list_all_possible_background_images) img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background) + img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + indexer += 1 + + if add_red_textlines: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) + + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_red_context, input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + + indexer += 1 + From 99048467761e6021a693098f676e9ee39a67ff96 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 22 Aug 2024 21:58:09 +0200 Subject: [PATCH 51/62] using prepared binarized images in the case of augmentation --- utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/utils.py b/utils.py index 20fda29..84af85e 100644 --- a/utils.py +++ b/utils.py @@ -690,8 +690,15 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow pass if binarization: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) + + if dir_img_bin: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_bin_corr, input_height, input_width)) + else: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) From 4f0e3efa2b6badb9d6bd187f37577ebc52846687 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 28 Aug 2024 00:04:19 +0200 Subject: [PATCH 52/62] early dilation for textline artificial class --- gt_gen_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 13010bf..dd4091f 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -88,12 +88,15 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002): contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) return contours_imgs -def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len): +def 
update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len, dilation_early=None): co_text_eroded = [] for con in co_text: img_boundary_in = np.zeros( (y_len,x_len) ) img_boundary_in = cv2.fillPoly(img_boundary_in, pts=[con], color=(1, 1, 1)) + if dilation_early: + img_boundary_in = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=dilation_early) + #img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=7)#asiatica if erosion_rate > 0: img_boundary_in = cv2.erode(img_boundary_in[:,:], KERNEL, iterations=erosion_rate) @@ -258,22 +261,25 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_label" in keys: img_boundary = np.zeros((y_len, x_len)) - erosion_rate = 1 + erosion_rate = 0#1 dilation_rate = 3 - co_use_case, img_boundary = update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + dilation_early = 2 + co_use_case, img_boundary = update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len, dilation_early=dilation_early ) img = np.zeros((y_len, x_len, 3)) if output_type == '2d': img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) if "artificial_class_label" in keys: - img_poly[:,:][img_boundary[:,:]==1] = artificial_class_label + img_mask = np.copy(img_poly) + img_poly[:,:][(img_boundary[:,:]==1) & (img_mask[:,:,0]!=1)] = artificial_class_label elif output_type == '3d': img_poly = cv2.fillPoly(img, pts=co_use_case, color=textline_rgb_color) if "artificial_class_label" in keys: - img_poly[:,:,0][img_boundary[:,:]==1] = artificial_class_rgb_color[0] - img_poly[:,:,1][img_boundary[:,:]==1] = artificial_class_rgb_color[1] - img_poly[:,:,2][img_boundary[:,:]==1] = artificial_class_rgb_color[2] + img_mask = np.copy(img_poly) + img_poly[:,:,0][(img_boundary[:,:]==1) & (img_mask[:,:,0]!=255)] = artificial_class_rgb_color[0] + img_poly[:,:,1][(img_boundary[:,:]==1) & (img_mask[:,:,0]!=255)] = artificial_class_rgb_color[1] + img_poly[:,:,2][(img_boundary[:,:]==1) & (img_mask[:,:,0]!=255)] = artificial_class_rgb_color[2] if printspace and config_params['use_case']!='printspace': From c502e67c14b073812f2dd660fa9db6b1bd81e5c1 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 28 Aug 2024 02:09:27 +0200 Subject: [PATCH 53/62] adding foreground rgb to augmentation --- config_params.json | 10 ++++++---- train.py | 19 +++++++++++++------ utils.py | 40 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/config_params.json b/config_params.json index e5f652d..1db8026 100644 --- a/config_params.json +++ b/config_params.json @@ -13,13 +13,14 @@ "augmentation" : true, "flip_aug" : false, "blur_aug" : false, - "scaling" : true, + "scaling" : false, "adding_rgb_background": true, - "add_red_textlines": true, - "channels_shuffling": true, + "adding_rgb_foreground": true, + "add_red_textlines": false, + "channels_shuffling": false, "degrading": false, "brightening": false, - "binarization" : false, + "binarization" : true, "scaling_bluring" : false, "scaling_binarization" : false, "scaling_flip" : false, @@ -51,6 +52,7 @@ "dir_eval": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/eval_new", "dir_output": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/output_new", "dir_rgb_backgrounds": "/home/vahid/Documents/1_2_test_eynollah/set_rgb_background", + "dir_rgb_foregrounds": 
"/home/vahid/Documents/1_2_test_eynollah/out_set_rgb_foreground", "dir_img_bin": "/home/vahid/Documents/test/sbb_pixelwise_segmentation/test_label/pageextractor_test/train_new/images_bin" } diff --git a/train.py b/train.py index 5dfad07..848ff6a 100644 --- a/train.py +++ b/train.py @@ -54,6 +54,7 @@ def config_params(): brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. adding_rgb_background = False + adding_rgb_foreground = False add_red_textlines = False channels_shuffling = False dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". @@ -95,6 +96,7 @@ def config_params(): dir_img_bin = None number_of_backgrounds_per_image = 1 dir_rgb_backgrounds = None + dir_rgb_foregrounds = None @ex.automain @@ -103,20 +105,25 @@ def run(_config, n_classes, n_epochs, input_height, index_start, dir_of_start_model, is_loss_soft_dice, n_batch, patches, augmentation, flip_aug, blur_aug, padding_white, padding_black, scaling, degrading,channels_shuffling, - brightening, binarization, adding_rgb_background, add_red_textlines, blur_k, scales, degrade_scales,shuffle_indexes, + brightening, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, blur_k, scales, degrade_scales,shuffle_indexes, brightness, dir_train, data_is_provided, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, continue_training, transformer_projection_dim, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds): + pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): if dir_rgb_backgrounds: list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) else: list_all_possible_background_images = None + if dir_rgb_foregrounds: + list_all_possible_foreground_rgbs = os.listdir(dir_rgb_foregrounds) + else: + list_all_possible_foreground_rgbs = None + if task == "segmentation" or task == "enhancement" or task == "binarization": if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') @@ -175,18 +182,18 @@ def run(_config, n_classes, n_epochs, input_height, # writing patches into a sub-folder in order to be flowed from directory. 
provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background,add_red_textlines, channels_shuffling, + blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background,adding_rgb_foreground, add_red_textlines, channels_shuffling, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index,shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, - patches=patches, dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds) + patches=patches, dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds, dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs) provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background, add_red_textlines, channels_shuffling, + blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches,dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds) + rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches,dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds,dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs ) if weighted_loss: weights = np.zeros(n_classes) diff --git a/utils.py b/utils.py index 84af85e..d38e798 100644 --- a/utils.py +++ b/utils.py @@ -40,6 +40,25 @@ def return_binary_image_with_given_rgb_background(img_bin, img_rgb_background): return img_final +def return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin, img_rgb_background, rgb_foreground): + img_rgb_background = resize_image(img_rgb_background ,img_bin.shape[0], img_bin.shape[1]) + + img_final = np.copy(img_bin) + img_foreground = np.zeros(img_bin.shape) + + + img_foreground[:,:,0][img_bin[:,:,0] == 0] = rgb_foreground[0] + img_foreground[:,:,1][img_bin[:,:,0] == 0] = rgb_foreground[1] + img_foreground[:,:,2][img_bin[:,:,0] == 0] = rgb_foreground[2] + + + img_final[:,:,0][img_bin[:,:,0] != 0] = img_rgb_background[:,:,0][img_bin[:,:,0] != 0] + img_final[:,:,1][img_bin[:,:,1] != 0] = img_rgb_background[:,:,1][img_bin[:,:,1] != 0] + img_final[:,:,2][img_bin[:,:,2] != 0] = img_rgb_background[:,:,2][img_bin[:,:,2] != 0] + + img_final = img_final + img_foreground 
+ return img_final + def return_binary_image_with_given_rgb_background_red_textlines(img_bin, img_rgb_background, img_color): img_rgb_background = resize_image(img_rgb_background ,img_bin.shape[0], img_bin.shape[1]) @@ -641,10 +660,10 @@ def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, i def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow_train_imgs, dir_flow_train_labels, input_height, input_width, blur_k, blur_aug, - padding_white, padding_black, flip_aug, binarization, adding_rgb_background, add_red_textlines, channels_shuffling, scaling, degrading, + padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, scaling, degrading, brightening, scales, degrade_scales, brightness, flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False, dir_img_bin=None,number_of_backgrounds_per_image=None,list_all_possible_background_images=None, dir_rgb_backgrounds=None): + rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False, dir_img_bin=None,number_of_backgrounds_per_image=None,list_all_possible_background_images=None, dir_rgb_backgrounds=None, dir_rgb_foregrounds=None, list_all_possible_foreground_rgbs=None): indexer = 0 for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): @@ -754,6 +773,23 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow indexer += 1 + if adding_rgb_foreground: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) + + img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) + + img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) + + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) + + indexer += 1 + if add_red_textlines: img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) From 5f456cf5082c8b115130bf37a841d965d3f8d90e Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 28 Aug 2024 17:34:06 +0200 Subject: [PATCH 54/62] fixing artificial class bug --- gt_gen_utils.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index dd4091f..5784e14 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -8,6 +8,7 @@ from tqdm import tqdm import cv2 from shapely import geometry from pathlib import Path +import matplotlib.pyplot as plt KERNEL = np.ones((5, 5), np.uint8) @@ -83,9 +84,13 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002): ret, thresh = cv2.threshold(imgray, 0, 255, 0) contours_imgs, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, 
cv2.CHAIN_APPROX_SIMPLE) + + #print(len(contours_imgs), hierarchy) contours_imgs = return_parent_contours(contours_imgs, hierarchy) - contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) + + #print(len(contours_imgs), "iki") + #contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hierarchy, max_area=1, min_area=min_area) return contours_imgs def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y_len, x_len, dilation_early=None): @@ -103,12 +108,15 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y pixel = 1 min_size = 0 + + img_boundary_in = img_boundary_in.astype("uint8") + con_eroded = return_contours_of_interested_region(img_boundary_in,pixel, min_size ) - try: - co_text_eroded.append(con_eroded[0]) - except: - co_text_eroded.append(con) + #try: + co_text_eroded.append(con_eroded[0]) + #except: + #co_text_eroded.append(con) img_boundary_in_dilated = cv2.dilate(img_boundary_in[:,:], KERNEL, iterations=dilation_rate) @@ -262,8 +270,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "artificial_class_label" in keys: img_boundary = np.zeros((y_len, x_len)) erosion_rate = 0#1 - dilation_rate = 3 - dilation_early = 2 + dilation_rate = 2 + dilation_early = 1 co_use_case, img_boundary = update_region_contours(co_use_case, img_boundary, erosion_rate, dilation_rate, y_len, x_len, dilation_early=dilation_early ) From cca4d178238f5115093137b7e16d566f274911cc Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Fri, 30 Aug 2024 15:30:18 +0200 Subject: [PATCH 55/62] new augmentations for patchwise training --- utils.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index d38e798..3d42b64 100644 --- a/utils.py +++ b/utils.py @@ -823,6 +823,53 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow img_max_rotated, label_max_rotated, input_height, input_width, indexer=indexer) + + if channels_shuffling: + for shuffle_index in shuffle_indexes: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) + + if adding_rgb_background: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) + + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + img_with_overlayed_background, + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) + + + if adding_rgb_foreground: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) + + img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) + + 
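The gt_gen_utils.py hunks above lower `dilation_rate` from 3 to 2 and `dilation_early` from 2 to 1 for the artificial (boundary) class. As a rough illustration of what the iteration count controls — each pass of `cv2.dilate` with the 5×5 `KERNEL` grows a mask by roughly two pixels per side, so fewer iterations yield a thinner boundary band — here is a toy check; the array sizes and values are made up for the example:

```python
import numpy as np
import cv2

KERNEL = np.ones((5, 5), np.uint8)

band = np.zeros((40, 40), np.uint8)
band[20, 5:35] = 1                          # a one-pixel-thick boundary line

for iterations in (1, 2, 3):
    widened = cv2.dilate(band, KERNEL, iterations=iterations)
    # the pixel count grows with each extra iteration, i.e. the band gets thicker
    print(iterations, int(widened.sum()))
```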
img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) + + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + img_with_overlayed_background, + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) + + + if add_red_textlines: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) + + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + img_red_context, + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) + if flip_aug: for f_i in flip_index: indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, @@ -871,10 +918,19 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow input_height, input_width, indexer=indexer) if binarization: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) + if dir_img_bin: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + img_bin_corr, + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) + + else: + indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, + otsu_copy(cv2.imread(dir_img + '/' + im)), + cv2.imread(dir_of_label_file), + input_height, input_width, indexer=indexer) if scaling_brightness: for sc_ind in scales: From df4a47ae6ffdff4a9d739a05f0137a76e88ed811 Mon Sep 17 00:00:00 2001 From: johnlockejrr Date: Sat, 19 Oct 2024 13:21:29 -0700 Subject: [PATCH 56/62] Update inference.py to check if save_layout was passed as argument otherwise can give an cv2 error --- inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/inference.py b/inference.py index 8d0a572..89d32de 100644 --- a/inference.py +++ b/inference.py @@ -566,6 +566,7 @@ class sbb_predict: img_seg_overlayed, only_layout = self.visualize_model_output(res, self.img_org, self.task) if self.save: cv2.imwrite(self.save,img_seg_overlayed) + if self.save_layout: cv2.imwrite(self.save_layout, only_layout) if self.ground_truth: From 451188c3b9becf576d1d4370c0b5dd9605e5b37b Mon Sep 17 00:00:00 2001 From: johnlockejrr Date: Sat, 19 Oct 2024 13:25:50 -0700 Subject: [PATCH 57/62] Changed deprecated `lr` to `learning_rate` and `model.fit_generator` to `model.fit` --- train.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index 848ff6a..4cc3cbb 100644 --- a/train.py +++ b/train.py @@ -277,16 +277,16 @@ def run(_config, n_classes, n_epochs, input_height, if (task == "segmentation" or task == "binarization"): if not is_loss_soft_dice and not weighted_loss: model.compile(loss='categorical_crossentropy', - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) if is_loss_soft_dice: model.compile(loss=soft_dice_loss, - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) if weighted_loss: model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) elif task == "enhancement": 
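The compile block above selects between plain categorical cross-entropy, `soft_dice_loss`, and `weighted_categorical_crossentropy` for segmentation and binarization. The repository's `soft_dice_loss` lives in utils.py and is not shown in this patch series; a minimal sketch of a soft Dice loss of that general shape, assuming one-hot labels and softmax outputs of shape (batch, h, w, n_classes), could look like this:

```python
import tensorflow as tf


def soft_dice_loss_sketch(y_true, y_pred, smooth=1.0):
    # Per-sample Dice over all pixels and classes, then averaged over the batch.
    axes = (1, 2, 3)
    intersection = tf.reduce_sum(y_true * y_pred, axis=axes)
    union = tf.reduce_sum(y_true, axis=axes) + tf.reduce_sum(y_pred, axis=axes)
    dice = (2.0 * intersection + smooth) / (union + smooth)
    return 1.0 - tf.reduce_mean(dice)


# Hypothetical usage, mirroring the compile calls in the hunk above:
# model.compile(loss=soft_dice_loss_sketch,
#               optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
```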
model.compile(loss='mean_squared_error', - optimizer=Adam(lr=learning_rate), metrics=['accuracy']) + optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) # generating train and evaluation data @@ -299,7 +299,7 @@ def run(_config, n_classes, n_epochs, input_height, ##score_best=[] ##score_best.append(0) for i in tqdm(range(index_start, n_epochs + index_start)): - model.fit_generator( + model.fit( train_gen, steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, validation_data=val_gen, @@ -384,7 +384,7 @@ def run(_config, n_classes, n_epochs, input_height, #f1score_tot = [0] indexer_start = 0 - opt = SGD(lr=0.01, momentum=0.9) + opt = SGD(learning_rate=0.01, momentum=0.9) opt_adam = tf.keras.optimizers.Adam(learning_rate=0.0001) model.compile(loss="binary_crossentropy", optimizer = opt_adam,metrics=['accuracy']) From be57f137d77946c5ed81cb001e23964d6bf27d9e Mon Sep 17 00:00:00 2001 From: johnlockejrr <16368414+johnlockejrr@users.noreply.github.com> Date: Sun, 11 May 2025 05:31:34 -0700 Subject: [PATCH 58/62] Update utils.py --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 3d42b64..cba20c2 100644 --- a/utils.py +++ b/utils.py @@ -667,7 +667,7 @@ def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow indexer = 0 for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): - img_name = im.split('.')[0] + img_name = os.path.splitext(im)[0] if task == "segmentation" or task == "binarization": dir_of_label_file = os.path.join(dir_seg, img_name + '.png') elif task=="enhancement": From 102b04c84d1c4d233ba816eb34b1a966b5a3283b Mon Sep 17 00:00:00 2001 From: johnlockejrr <16368414+johnlockejrr@users.noreply.github.com> Date: Sun, 11 May 2025 06:09:17 -0700 Subject: [PATCH 59/62] Update utils.py Changed unsafe basename extraction: `file_name = i.split('.')[0]` to `file_name = os.path.splitext(i)[0]` and `filename = n[i].split('.')[0]` to `filename = os.path.splitext(n[i])[0]` because `"Vat.sam.2_206.jpg` -> `Vat` instead of `"Vat.sam.2_206` --- utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.py b/utils.py index cba20c2..bbe21d1 100644 --- a/utils.py +++ b/utils.py @@ -374,7 +374,7 @@ def generate_arrays_from_folder_reading_order(classes_file_dir, modal_dir, batch batchcount = 0 while True: for i in all_labels_files: - file_name = i.split('.')[0] + file_name = os.path.splitext(i)[0] img = cv2.imread(os.path.join(modal_dir,file_name+'.png')) label_class = int( np.load(os.path.join(classes_file_dir,i)) ) @@ -401,7 +401,7 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. try: - filename = n[i].split('.')[0] + filename = os.path.splitext(n[i])[0] train_img = cv2.imread(img_folder + '/' + n[i]) / 255. 
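The switch from `split('.')[0]` to `os.path.splitext(...)[0]` above matters for multi-dot filenames like the `Vat.sam.2_206.jpg` example cited in the commit message; a quick check:

```python
import os

name = "Vat.sam.2_206.jpg"
print(name.split('.')[0])           # 'Vat'            -- truncates at the first dot
print(os.path.splitext(name)[0])    # 'Vat.sam.2_206'  -- keeps the full stem
```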
train_img = cv2.resize(train_img, (input_width, input_height), From 1bf801985b002a2b07f1e0e655e5e0884d412452 Mon Sep 17 00:00:00 2001 From: johnlockejrr <16368414+johnlockejrr@users.noreply.github.com> Date: Wed, 14 May 2025 03:34:51 -0700 Subject: [PATCH 60/62] Update gt_gen_utils.py Keep safely the full basename without extension --- gt_gen_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gt_gen_utils.py b/gt_gen_utils.py index 5784e14..8837462 100644 --- a/gt_gen_utils.py +++ b/gt_gen_utils.py @@ -22,7 +22,7 @@ def get_content_of_dir(dir_in): """ gt_all=os.listdir(dir_in) - gt_list=[file for file in gt_all if file.split('.')[ len(file.split('.'))-1 ]=='xml' ] + gt_list = [file for file in gt_all if os.path.splitext(file)[1] == '.xml'] return gt_list def return_parent_contours(contours, hierarchy): @@ -134,7 +134,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if dir_images: ls_org_imgs = os.listdir(dir_images) - ls_org_imgs_stem = [item.split('.')[0] for item in ls_org_imgs] + ls_org_imgs_stem = [os.path.splitext(item)[0] for item in ls_org_imgs] for index in tqdm(range(len(gt_list))): #try: tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding = 'iso-8859-5')) @@ -298,10 +298,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly = resize_image(img_poly, y_new, x_new) try: - xml_file_stem = gt_list[index].split('-')[1].split('.')[0] + xml_file_stem = os.path.splitext(gt_list[index])[0] cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) except: - xml_file_stem = gt_list[index].split('.')[0] + xml_file_stem = os.path.splitext(gt_list[index])[0] cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) if dir_images: @@ -757,10 +757,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_poly = resize_image(img_poly, y_new, x_new) try: - xml_file_stem = gt_list[index].split('-')[1].split('.')[0] + xml_file_stem = os.path.splitext(gt_list[index])[0] cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) except: - xml_file_stem = gt_list[index].split('.')[0] + xml_file_stem = os.path.splitext(gt_list[index])[0] cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) From 766108089983e787870fc6d7dc4f9bbde7bd5b75 Mon Sep 17 00:00:00 2001 From: johnlockejrr <16368414+johnlockejrr@users.noreply.github.com> Date: Sat, 17 May 2025 16:17:38 +0300 Subject: [PATCH 61/62] LR Warmup and Optimization Implementation # Learning Rate Warmup and Optimization Implementation ## Overview Added learning rate warmup functionality to improve training stability, especially when using pretrained weights. The implementation uses TensorFlow's native learning rate scheduling for better performance. ## Changes Made ### 1. Configuration Updates (`runs/train_no_patches_448x448.json`) Added new configuration parameters for warmup: ```json { "warmup_enabled": true, "warmup_epochs": 5, "warmup_start_lr": 1e-6 } ``` ### 2. Training Script Updates (`train.py`) #### A. 
Optimizer and Learning Rate Schedule - Replaced fixed learning rate with dynamic scheduling - Implemented warmup using `tf.keras.optimizers.schedules.PolynomialDecay` - Maintained compatibility with existing ReduceLROnPlateau and EarlyStopping ```python if warmup_enabled: lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=warmup_start_lr, decay_steps=warmup_epochs * steps_per_epoch, end_learning_rate=learning_rate, power=1.0 # Linear decay ) optimizer = Adam(learning_rate=lr_schedule) else: optimizer = Adam(learning_rate=learning_rate) ``` #### B. Learning Rate Behavior - Initial learning rate: 1e-6 (configurable via `warmup_start_lr`) - Target learning rate: 5e-5 (configurable via `learning_rate`) - Linear increase over 5 epochs (configurable via `warmup_epochs`) - After warmup, learning rate remains at target value until ReduceLROnPlateau triggers ## Benefits 1. Improved training stability during initial epochs 2. Better handling of pretrained weights 3. Efficient implementation using TensorFlow's native scheduling 4. Configurable through JSON configuration file 5. Maintains compatibility with existing callbacks (ReduceLROnPlateau, EarlyStopping) ## Usage To enable warmup: 1. Set `warmup_enabled: true` in the configuration file 2. Adjust `warmup_epochs` and `warmup_start_lr` as needed 3. The warmup will automatically integrate with existing learning rate reduction and early stopping To disable warmup: - Set `warmup_enabled: false` or remove the warmup parameters from the configuration file --- train.py | 99 ++++++++++++++++++++++++++++++++--- train_no_patches_448x448.json | 52 ++++++++++++++++++ 2 files changed, 143 insertions(+), 8 deletions(-) create mode 100644 train_no_patches_448x448.json diff --git a/train.py b/train.py index 4cc3cbb..668d6aa 100644 --- a/train.py +++ b/train.py @@ -5,6 +5,7 @@ import tensorflow as tf from tensorflow.compat.v1.keras.backend import set_session import warnings from tensorflow.keras.optimizers import * +from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback from sacred import Experiment from models import * from utils import * @@ -15,6 +16,36 @@ import json from sklearn.metrics import f1_score +def get_warmup_schedule(start_lr, target_lr, warmup_epochs, steps_per_epoch): + initial_learning_rate = start_lr + target_learning_rate = target_lr + warmup_steps = warmup_epochs * steps_per_epoch + + lr_schedule = tf.keras.optimizers.schedules.LinearSchedule( + initial_learning_rate=initial_learning_rate, + final_learning_rate=target_learning_rate, + total_steps=warmup_steps + ) + + return lr_schedule + + +class WarmupScheduler(Callback): + def __init__(self, start_lr, target_lr, warmup_epochs): + super(WarmupScheduler, self).__init__() + self.start_lr = start_lr + self.target_lr = target_lr + self.warmup_epochs = warmup_epochs + self.current_epoch = 0 + + def on_epoch_begin(self, epoch, logs=None): + if self.current_epoch < self.warmup_epochs: + # Linear warmup + lr = self.start_lr + (self.target_lr - self.start_lr) * (self.current_epoch / self.warmup_epochs) + tf.keras.backend.set_value(self.model.optimizer.lr, lr) + self.current_epoch += 1 + + def configuration(): config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True @@ -97,6 +128,18 @@ def config_params(): number_of_backgrounds_per_image = 1 dir_rgb_backgrounds = None dir_rgb_foregrounds = None + reduce_lr_enabled = False # Whether to use ReduceLROnPlateau callback + reduce_lr_monitor = 'val_loss' # Metric to monitor for 
reducing learning rate + reduce_lr_factor = 0.5 # Factor to reduce learning rate by + reduce_lr_patience = 3 # Number of epochs to wait before reducing learning rate + reduce_lr_min_lr = 1e-6 # Minimum learning rate + early_stopping_enabled = False # Whether to use EarlyStopping callback + early_stopping_monitor = 'val_loss' # Metric to monitor for early stopping + early_stopping_patience = 10 # Number of epochs to wait before stopping + early_stopping_restore_best_weights = True # Whether to restore best weights when stopping + warmup_enabled = False # Whether to use learning rate warmup + warmup_epochs = 5 # Number of epochs for warmup + warmup_start_lr = 1e-6 # Starting learning rate for warmup @ex.automain @@ -112,7 +155,10 @@ def run(_config, n_classes, n_epochs, input_height, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): + pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds, + reduce_lr_enabled, reduce_lr_monitor, reduce_lr_factor, reduce_lr_patience, reduce_lr_min_lr, + early_stopping_enabled, early_stopping_monitor, early_stopping_patience, early_stopping_restore_best_weights, + warmup_enabled, warmup_epochs, warmup_start_lr): if dir_rgb_backgrounds: list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) @@ -274,20 +320,55 @@ def run(_config, n_classes, n_epochs, input_height, model.summary() + # Create callbacks list + callbacks = [] + if reduce_lr_enabled: + reduce_lr = ReduceLROnPlateau( + monitor=reduce_lr_monitor, + factor=reduce_lr_factor, + patience=reduce_lr_patience, + min_lr=reduce_lr_min_lr, + verbose=1 + ) + callbacks.append(reduce_lr) + + if early_stopping_enabled: + early_stopping = EarlyStopping( + monitor=early_stopping_monitor, + patience=early_stopping_patience, + restore_best_weights=early_stopping_restore_best_weights, + verbose=1 + ) + callbacks.append(early_stopping) + + # Calculate steps per epoch + steps_per_epoch = int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1 + + # Create optimizer with or without warmup + if warmup_enabled: + lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=warmup_start_lr, + decay_steps=warmup_epochs * steps_per_epoch, + end_learning_rate=learning_rate, + power=1.0 # Linear decay + ) + optimizer = Adam(learning_rate=lr_schedule) + else: + optimizer = Adam(learning_rate=learning_rate) + if (task == "segmentation" or task == "binarization"): if not is_loss_soft_dice and not weighted_loss: model.compile(loss='categorical_crossentropy', - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) + optimizer=optimizer, metrics=['accuracy']) if is_loss_soft_dice: model.compile(loss=soft_dice_loss, - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) + optimizer=optimizer, metrics=['accuracy']) if weighted_loss: model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) + optimizer=optimizer, metrics=['accuracy']) elif task == "enhancement": model.compile(loss='mean_squared_error', - 
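With `power=1.0` and an `end_learning_rate` above the initial value, the `PolynomialDecay` schedule used above is simply a straight line from `warmup_start_lr` up to `learning_rate` over `warmup_epochs * steps_per_epoch` steps, after which it holds at `learning_rate` (the step is clamped when `cycle=False`). A small sanity check — the learning-rate values follow the accompanying JSON config, while `steps_per_epoch` is an illustrative number:

```python
import tensorflow as tf

warmup_start_lr, learning_rate = 1e-6, 5e-5     # values from the example config
steps_per_epoch, warmup_epochs = 100, 5         # steps_per_epoch is made up here

schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=warmup_start_lr,
    decay_steps=warmup_epochs * steps_per_epoch,
    end_learning_rate=learning_rate,
    power=1.0,                                  # linear ramp
)

for step in (0, 250, 500, 1000):
    print(step, float(schedule(step)))
# 0 -> 1e-06, 250 -> ~2.55e-05, 500 -> 5e-05, 1000 -> 5e-05 (held after warmup)
```

One caveat worth verifying: the stock `ReduceLROnPlateau` callback expects a plain learning-rate variable on the optimizer, so combining it with a schedule object like this may need extra handling; that is an assumption to check, not something the patch addresses.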
optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - + optimizer=optimizer, metrics=['accuracy']) # generating train and evaluation data train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, @@ -298,13 +379,15 @@ def run(_config, n_classes, n_epochs, input_height, ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] ##score_best.append(0) + for i in tqdm(range(index_start, n_epochs + index_start)): model.fit( train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, + steps_per_epoch=steps_per_epoch, validation_data=val_gen, validation_steps=1, - epochs=1) + epochs=1, + callbacks=callbacks) model.save(os.path.join(dir_output,'model_'+str(i))) with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: diff --git a/train_no_patches_448x448.json b/train_no_patches_448x448.json new file mode 100644 index 0000000..c3d0e10 --- /dev/null +++ b/train_no_patches_448x448.json @@ -0,0 +1,52 @@ +{ + "backbone_type" : "nontransformer", + "task": "segmentation", + "n_classes" : 3, + "n_epochs" : 50, + "input_height" : 448, + "input_width" : 448, + "weight_decay" : 1e-4, + "n_batch" : 4, + "learning_rate": 5e-5, + "patches" : false, + "pretraining" : true, + "augmentation" : true, + "flip_aug" : false, + "blur_aug" : false, + "scaling" : true, + "degrading": true, + "brightening": false, + "binarization" : false, + "scaling_bluring" : false, + "scaling_binarization" : false, + "scaling_flip" : false, + "rotation": false, + "rotation_not_90": false, + "blur_k" : ["blur","guass","median"], + "scales" : [0.6, 0.7, 0.8, 0.9], + "brightness" : [1.3, 1.5, 1.7, 2], + "degrade_scales" : [0.2, 0.4], + "flip_index" : [0, 1, -1], + "thetha" : [10, -10], + "continue_training": false, + "index_start" : 0, + "dir_of_start_model" : " ", + "weighted_loss": false, + "is_loss_soft_dice": true, + "data_is_provided": true, + "dir_train": "/home/incognito/sbb_pixelwise_segmentation/dataset/sam_41_mss/dir_train/train", + "dir_eval": "/home/incognito/sbb_pixelwise_segmentation/dataset/sam_41_mss/dir_train/eval", + "dir_output": "runs/sam_41_mss_npt_448x448", + "reduce_lr_enabled": true, + "reduce_lr_monitor": "val_loss", + "reduce_lr_factor": 0.5, + "reduce_lr_patience": 4, + "reduce_lr_min_lr": 1e-6, + "early_stopping_enabled": true, + "early_stopping_monitor": "val_loss", + "early_stopping_patience": 6, + "early_stopping_restore_best_weights": true, + "warmup_enabled": true, + "warmup_epochs": 5, + "warmup_start_lr": 1e-6 +} From f298643fcfa26862ca3c681be7dd6aef2fa90c02 Mon Sep 17 00:00:00 2001 From: johnlockejrr <16368414+johnlockejrr@users.noreply.github.com> Date: Sat, 17 May 2025 23:24:40 +0300 Subject: [PATCH 62/62] Fix `ReduceONPlateau` wrong logic # Training Script Improvements ## Learning Rate Management Fixes ### 1. ReduceLROnPlateau Implementation - Fixed the learning rate reduction mechanism by replacing the manual epoch loop with a single `model.fit()` call - This ensures proper tracking of validation metrics across epochs - Configured with: ```python reduce_lr = ReduceLROnPlateau( monitor='val_loss', factor=0.2, # More aggressive reduction patience=3, # Quick response to plateaus min_lr=1e-6, # Minimum learning rate min_delta=1e-5, # Minimum change to be considered improvement verbose=1 ) ``` ### 2. 
Warmup Implementation - Added learning rate warmup using TensorFlow's native scheduling - Gradually increases learning rate from 1e-6 to target (2e-5) over 5 epochs - Helps stabilize initial training phase - Implemented using `PolynomialDecay` schedule: ```python lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=warmup_start_lr, decay_steps=warmup_epochs * steps_per_epoch, end_learning_rate=learning_rate, power=1.0 # Linear decay ) ``` ### 3. Early Stopping - Added early stopping to prevent overfitting - Configured with: ```python early_stopping = EarlyStopping( monitor='val_loss', patience=6, restore_best_weights=True, verbose=1 ) ``` ## Model Saving Improvements ### 1. Epoch-based Model Saving - Implemented custom `ModelCheckpointWithConfig` to save both model and config - Saves after each epoch with corresponding config.json - Maintains compatibility with original script's saving behavior ### 2. Best Model Saving - Saves the best model at training end - If early stopping triggers: saves the best model from training - If no early stopping: saves the final model ## Configuration All parameters are configurable through the JSON config file: ```json { "reduce_lr_enabled": true, "reduce_lr_monitor": "val_loss", "reduce_lr_factor": 0.2, "reduce_lr_patience": 3, "reduce_lr_min_lr": 1e-6, "reduce_lr_min_delta": 1e-5, "early_stopping_enabled": true, "early_stopping_monitor": "val_loss", "early_stopping_patience": 6, "early_stopping_restore_best_weights": true, "warmup_enabled": true, "warmup_epochs": 5, "warmup_start_lr": 1e-6 } ``` ## Benefits 1. More stable training with proper learning rate management 2. Better handling of training plateaus 3. Automatic saving of best model 4. Maintained compatibility with existing config saving 5. 
Improved training monitoring and control --- train.py | 78 ++++++++++++++++++----------------- train_no_patches_448x448.json | 7 ++-- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/train.py b/train.py index 668d6aa..177f408 100644 --- a/train.py +++ b/train.py @@ -5,7 +5,7 @@ import tensorflow as tf from tensorflow.compat.v1.keras.backend import set_session import warnings from tensorflow.keras.optimizers import * -from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback +from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback, ModelCheckpoint from sacred import Experiment from models import * from utils import * @@ -30,22 +30,6 @@ def get_warmup_schedule(start_lr, target_lr, warmup_epochs, steps_per_epoch): return lr_schedule -class WarmupScheduler(Callback): - def __init__(self, start_lr, target_lr, warmup_epochs): - super(WarmupScheduler, self).__init__() - self.start_lr = start_lr - self.target_lr = target_lr - self.warmup_epochs = warmup_epochs - self.current_epoch = 0 - - def on_epoch_begin(self, epoch, logs=None): - if self.current_epoch < self.warmup_epochs: - # Linear warmup - lr = self.start_lr + (self.target_lr - self.start_lr) * (self.current_epoch / self.warmup_epochs) - tf.keras.backend.set_value(self.model.optimizer.lr, lr) - self.current_epoch += 1 - - def configuration(): config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True @@ -133,6 +117,7 @@ def config_params(): reduce_lr_factor = 0.5 # Factor to reduce learning rate by reduce_lr_patience = 3 # Number of epochs to wait before reducing learning rate reduce_lr_min_lr = 1e-6 # Minimum learning rate + reduce_lr_min_delta = 1e-5 # Minimum change in monitored value to be considered as improvement early_stopping_enabled = False # Whether to use EarlyStopping callback early_stopping_monitor = 'val_loss' # Metric to monitor for early stopping early_stopping_patience = 10 # Number of epochs to wait before stopping @@ -156,7 +141,7 @@ def run(_config, n_classes, n_epochs, input_height, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds, - reduce_lr_enabled, reduce_lr_monitor, reduce_lr_factor, reduce_lr_patience, reduce_lr_min_lr, + reduce_lr_enabled, reduce_lr_monitor, reduce_lr_factor, reduce_lr_patience, reduce_lr_min_lr, reduce_lr_min_delta, early_stopping_enabled, early_stopping_monitor, early_stopping_patience, early_stopping_restore_best_weights, warmup_enabled, warmup_epochs, warmup_start_lr): @@ -328,6 +313,7 @@ def run(_config, n_classes, n_epochs, input_height, factor=reduce_lr_factor, patience=reduce_lr_patience, min_lr=reduce_lr_min_lr, + min_delta=reduce_lr_min_delta, verbose=1 ) callbacks.append(reduce_lr) @@ -341,6 +327,27 @@ def run(_config, n_classes, n_epochs, input_height, ) callbacks.append(early_stopping) + # Add checkpoint to save models every epoch + class ModelCheckpointWithConfig(ModelCheckpoint): + def __init__(self, *args, **kwargs): + self._config = _config + super().__init__(*args, **kwargs) + + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs) + model_dir = os.path.join(dir_output, f"model_{epoch+1}") + with open(os.path.join(model_dir, "config.json"), "w") as fp: + json.dump(self._config, fp) + + checkpoint_epoch = 
ModelCheckpointWithConfig( + os.path.join(dir_output, "model_{epoch}"), + save_freq='epoch', + save_weights_only=False, + save_best_only=False, + verbose=1 + ) + callbacks.append(checkpoint_epoch) + # Calculate steps per epoch steps_per_epoch = int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1 @@ -376,27 +383,22 @@ def run(_config, n_classes, n_epochs, input_height, val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) - ##img_validation_patches = os.listdir(dir_flow_eval_imgs) - ##score_best=[] - ##score_best.append(0) - - for i in tqdm(range(index_start, n_epochs + index_start)): - model.fit( - train_gen, - steps_per_epoch=steps_per_epoch, - validation_data=val_gen, - validation_steps=1, - epochs=1, - callbacks=callbacks) - model.save(os.path.join(dir_output,'model_'+str(i))) + # Single fit call with all epochs + history = model.fit( + train_gen, + steps_per_epoch=steps_per_epoch, + validation_data=val_gen, + validation_steps=1, + epochs=n_epochs, + callbacks=callbacks + ) - with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON - - #os.system('rm -rf '+dir_train_flowing) - #os.system('rm -rf '+dir_eval_flowing) - - #model.save(dir_output+'/'+'model'+'.h5') + # Save the best model (either from early stopping or final model) + model.save(os.path.join(dir_output, 'model_best')) + + with open(os.path.join(dir_output, 'model_best', "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + elif task=='classification': configuration() model = resnet50_classifier(n_classes, input_height, input_width, weight_decay, pretraining) diff --git a/train_no_patches_448x448.json b/train_no_patches_448x448.json index c3d0e10..aaab12b 100644 --- a/train_no_patches_448x448.json +++ b/train_no_patches_448x448.json @@ -7,7 +7,7 @@ "input_width" : 448, "weight_decay" : 1e-4, "n_batch" : 4, - "learning_rate": 5e-5, + "learning_rate": 2e-5, "patches" : false, "pretraining" : true, "augmentation" : true, @@ -39,8 +39,9 @@ "dir_output": "runs/sam_41_mss_npt_448x448", "reduce_lr_enabled": true, "reduce_lr_monitor": "val_loss", - "reduce_lr_factor": 0.5, - "reduce_lr_patience": 4, + "reduce_lr_factor": 0.2, + "reduce_lr_patience": 3, + "reduce_lr_min_delta": 1e-5, "reduce_lr_min_lr": 1e-6, "early_stopping_enabled": true, "early_stopping_monitor": "val_loss",