diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py
index 851ac7d..37ac7c3 100644
--- a/src/eynollah/sbb_binarize.py
+++ b/src/eynollah/sbb_binarize.py
@@ -19,7 +19,7 @@ from eynollah.model_zoo import EynollahModelZoo
 tf_disable_interactive_logs()
 import tensorflow as tf
 from tensorflow.python.keras import backend as tensorflow_backend
-
+from pathlib import Path
 from .utils import is_image_filename
 
 def resize_image(img_in, input_height, input_width):
@@ -347,7 +347,7 @@ class SbbBinarizer:
         self.logger.info("Found %d image files to binarize in %s", len(ls_imgs), dir_in)
         for i, image_path in enumerate(ls_imgs):
             self.logger.info('Binarizing [%3d/%d] %s', i + 1, len(ls_imgs), image_path)
-            image_stem = image_path.split('.')[0]
+            image_stem = Path(image_path).stem
             image = cv2.imread(os.path.join(dir_in,image_path) )
             img_last = 0
             model_file, model = self.models
diff --git a/src/eynollah/training/cli.py b/src/eynollah/training/cli.py
index 65a7a8a..3718275 100644
--- a/src/eynollah/training/cli.py
+++ b/src/eynollah/training/cli.py
@@ -9,6 +9,7 @@ from .generate_gt_for_training import main as generate_gt_cli
 from .inference import main as inference_cli
 from .train import ex
 from .extract_line_gt import linegt_cli
+from .weights_ensembling import main as ensemble_cli
 
 @click.command(context_settings=dict(
     ignore_unknown_options=True,
@@ -26,3 +27,4 @@ main.add_command(generate_gt_cli, 'generate-gt')
 main.add_command(inference_cli, 'inference')
 main.add_command(train_cli, 'train')
 main.add_command(linegt_cli, 'export_textline_images_and_text')
+main.add_command(ensemble_cli, 'ensembling')
diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py
index 693cab8..30abd04 100644
--- a/src/eynollah/training/generate_gt_for_training.py
+++ b/src/eynollah/training/generate_gt_for_training.py
@@ -474,7 +474,7 @@ def visualize_layout_segmentation(xml_file, dir_xml, dir_out, dir_imgs):
         img_file_name_with_format = find_format_of_given_filename_in_dir(dir_imgs, f_name)
         img = cv2.imread(os.path.join(dir_imgs, img_file_name_with_format))
 
-        co_text, co_graphic, co_sep, co_img, co_table, co_noise, y_len, x_len = get_layout_contours_for_visualization(xml_file)
+        co_text, co_graphic, co_sep, co_img, co_table, co_map, co_noise, y_len, x_len = get_layout_contours_for_visualization(xml_file)
 
-        added_image = visualize_image_from_contours_layout(co_text['paragraph'], co_text['header']+co_text['heading'], co_text['drop-capital'], co_sep, co_img, co_text['marginalia'], co_table, img)
+        added_image = visualize_image_from_contours_layout(co_text['paragraph'], co_text['header']+co_text['heading'], co_text['drop-capital'], co_sep, co_img, co_text['marginalia'], co_table, co_map, img)
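Note on the stem change above: `image_path.split('.')[0]` truncates at the *first* dot, so a scan named with a dotted stem loses part of its name, while `Path(...).stem` drops only the final extension. A minimal illustration (the filename is invented for the example):

```python
from pathlib import Path

name = "page.1880.tif"        # hypothetical dotted filename
print(name.split('.')[0])     # 'page'       -- first-dot truncation loses '.1880'
print(Path(name).stem)        # 'page.1880'  -- strips only the '.tif' extension
```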
diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py
index 28ab422..50bb6fa 100644
--- a/src/eynollah/training/gt_gen_utils.py
+++ b/src/eynollah/training/gt_gen_utils.py
@@ -15,7 +15,7 @@ with warnings.catch_warnings():
     warnings.simplefilter("ignore")
 
 
-def visualize_image_from_contours_layout(co_par, co_header, co_drop, co_sep, co_image, co_marginal, co_table, img):
+def visualize_image_from_contours_layout(co_par, co_header, co_drop, co_sep, co_image, co_marginal, co_table, co_map, img):
     alpha = 0.5
 
     blank_image = np.ones( (img.shape[:]), dtype=np.uint8) * 255
@@ -28,6 +28,7 @@ def visualize_image_from_contours_layout(co_par, co_header, co_drop, co_sep, co_
     col_sep = (255, 0, 0)
     col_marginal = (106, 90, 205)
     col_table = (0, 90, 205)
+    col_map = (90, 90, 205)
 
     if len(co_image)>0:
         cv2.drawContours(blank_image, co_image, -1, col_image, thickness=cv2.FILLED) # Fill the contour
@@ -52,6 +53,9 @@ def visualize_image_from_contours_layout(co_par, co_header, co_drop, co_sep, co_
     if len(co_table)>0:
         cv2.drawContours(blank_image, co_table, -1, col_table, thickness=cv2.FILLED) # Fill the contour
+
+    if len(co_map)>0:
+        cv2.drawContours(blank_image, co_map, -1, col_map, thickness=cv2.FILLED) # Fill the contour
 
     img_final =cv2.cvtColor(blank_image, cv2.COLOR_BGR2RGB)
@@ -231,7 +235,12 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y
         con_eroded = return_contours_of_interested_region(img_boundary_in,pixel, min_size )
 
         try:
-            co_text_eroded.append(con_eroded[0])
+            if len(con_eroded)>1:
+                cnt_size = np.array([cv2.contourArea(con_eroded[j]) for j in range(len(con_eroded))])
+                cnt = con_eroded[np.argmax(cnt_size)]
+                co_text_eroded.append(cnt)
+            else:
+                co_text_eroded.append(con_eroded[0])
         except:
             co_text_eroded.append(con)
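When erosion splits a region into several fragments, `update_region_contours` now keeps only the largest fragment by area instead of blindly taking `con_eroded[0]`. The selection idiom in isolation, on synthetic contours:

```python
import numpy as np
import cv2

# Two synthetic fragments: a 10x10 square and a 40x40 square.
small = np.array([[[0, 0]], [[10, 0]], [[10, 10]], [[0, 10]]], dtype=np.int32)
large = np.array([[[20, 20]], [[60, 20]], [[60, 60]], [[20, 60]]], dtype=np.int32)
fragments = [small, large]

areas = np.array([cv2.contourArea(c) for c in fragments])
largest = fragments[np.argmax(areas)]    # keeps the 40x40 square
print(areas, cv2.contourArea(largest))   # [ 100. 1600.] 1600.0
```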
@@ -377,6 +386,7 @@ def get_layout_contours_for_visualization(xml_file):
     co_sep=[]
     co_img=[]
     co_table=[]
+    co_map=[]
     co_noise=[]
 
     types_text = []
@@ -593,6 +603,31 @@ def get_layout_contours_for_visualization(xml_file):
                     elif vv.tag!=link+'Point' and sumi>=1:
                         break
                 co_table.append(np.array(c_t_in))
+
+        if tag.endswith('}MapRegion') or tag.endswith('}mapregion'):
+            #print('sth')
+            for nn in root1.iter(tag):
+                c_t_in=[]
+                sumi=0
+                for vv in nn.iter():
+                    # check the format of coords
+                    if vv.tag==link+'Coords':
+                        coords=bool(vv.attrib)
+                        if coords:
+                            p_h=vv.attrib['points'].split(' ')
+                            c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) )
+                            break
+                        else:
+                            pass
+
+
+                    if vv.tag==link+'Point':
+                        c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ])
+                        sumi+=1
+                        #print(vv.tag,'in')
+                    elif vv.tag!=link+'Point' and sumi>=1:
+                        break
+                co_map.append(np.array(c_t_in))
 
         if tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'):
@@ -619,7 +654,7 @@ def get_layout_contours_for_visualization(xml_file):
                     elif vv.tag!=link+'Point' and sumi>=1:
                         break
                 co_noise.append(np.array(c_t_in))
-    return co_text, co_graphic, co_sep, co_img, co_table, co_noise, y_len, x_len
+    return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_noise, y_len, x_len
 
 def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images):
     """
@@ -698,12 +733,15 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
             _, thresh = cv2.threshold(imgray, 0, 255, 0)
 
             contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-
+
             cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-
-            cnt = contours[np.argmax(cnt_size)]
-
-            x, y, w, h = cv2.boundingRect(cnt)
+
+            try:
+                cnt = contours[np.argmax(cnt_size)]
+                x, y, w, h = cv2.boundingRect(cnt)
+            except:
+                x, y, w, h = 0, 0, x_len, y_len
+
 
             bb_xywh = [x, y, w, h]
@@ -835,7 +873,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
 
             types_graphic_label = list(types_graphic_dict.values())
 
-            labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)]
+            labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0), (125,255,255)]
 
             region_tags=np.unique([x for x in alltags if x.endswith('Region')])
@@ -846,6 +884,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
             co_sep=[]
             co_img=[]
             co_table=[]
+            co_map=[]
             co_noise=[]
 
             for tag in region_tags:
@@ -1056,6 +1095,32 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
                             elif vv.tag!=link+'Point' and sumi>=1:
                                 break
                         co_table.append(np.array(c_t_in))
+
+                if 'mapregion' in keys:
+                    if tag.endswith('}MapRegion') or tag.endswith('}mapregion'):
+                        #print('sth')
+                        for nn in root1.iter(tag):
+                            c_t_in=[]
+                            sumi=0
+                            for vv in nn.iter():
+                                # check the format of coords
+                                if vv.tag==link+'Coords':
+                                    coords=bool(vv.attrib)
+                                    if coords:
+                                        p_h=vv.attrib['points'].split(' ')
+                                        c_t_in.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) )
+                                        break
+                                    else:
+                                        pass
+
+
+                                if vv.tag==link+'Point':
+                                    c_t_in.append([ int(float(vv.attrib['x'])) , int(float(vv.attrib['y'])) ])
+                                    sumi+=1
+                                    #print(vv.tag,'in')
+                                elif vv.tag!=link+'Point' and sumi>=1:
+                                    break
+                            co_map.append(np.array(c_t_in))
 
                 if 'noiseregion' in keys:
                     if tag.endswith('}NoiseRegion') or tag.endswith('}noiseregion'):
@@ -1129,6 +1194,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
                     erosion_rate = 0#2
                     dilation_rate = 3#4
                     co_table, img_boundary = update_region_contours(co_table, img_boundary, erosion_rate, dilation_rate, y_len, x_len )
+                if "mapregion" in elements_with_artificial_class:
+                    erosion_rate = 0#2
+                    dilation_rate = 3#4
+                    co_map, img_boundary = update_region_contours(co_map, img_boundary, erosion_rate, dilation_rate, y_len, x_len )
 
 
@@ -1154,6 +1223,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
                         img_poly=cv2.fillPoly(img, pts =co_img, color=labels_rgb_color[ config_params['imageregion']])
                     if 'tableregion' in keys:
                         img_poly=cv2.fillPoly(img, pts =co_table, color=labels_rgb_color[ config_params['tableregion']])
+                    if 'mapregion' in keys:
+                        img_poly=cv2.fillPoly(img, pts =co_map, color=labels_rgb_color[ config_params['mapregion']])
                     if 'noiseregion' in keys:
                         img_poly=cv2.fillPoly(img, pts =co_noise, color=labels_rgb_color[ config_params['noiseregion']])
@@ -1214,6 +1285,9 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
                     if 'tableregion' in keys:
                         color_label = config_params['tableregion']
                         img_poly=cv2.fillPoly(img, pts =co_table, color=(color_label,color_label,color_label))
+                    if 'mapregion' in keys:
+                        color_label = config_params['mapregion']
+                        img_poly=cv2.fillPoly(img, pts =co_map, color=(color_label,color_label,color_label))
                     if 'noiseregion' in keys:
                         color_label = config_params['noiseregion']
                         img_poly=cv2.fillPoly(img, pts =co_noise, color=(color_label,color_label,color_label))
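The two `MapRegion` blocks are near-verbatim copies of the `TableRegion` parser; factoring the `Coords` handling into a shared helper would avoid the duplication. The core step all of these parsers perform is converting a PAGE-XML `Coords/@points` attribute into an integer polygon (the sample points string below is invented):

```python
import numpy as np

# PAGE-XML stores a region polygon as "x1,y1 x2,y2 ..." on the Coords element.
points_attr = "10,20 110,20 110,80 10,80"   # invented sample
polygon = np.array([[int(p.split(',')[0]), int(p.split(',')[1])]
                    for p in points_attr.split(' ')])
print(polygon.shape)   # (4, 2) -- ready for cv2.fillPoly / cv2.drawContours
```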
""" class sbb_predict: - - def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area): + def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area): self.image=image self.dir_in=dir_in self.patches=patches @@ -48,6 +50,7 @@ class sbb_predict: self.config_params_model=config_params_model self.xml_file = xml_file self.out = out + self.cpu = cpu if min_area: self.min_area = float(min_area) else: @@ -159,21 +162,19 @@ class sbb_predict: return mIoU def start_new_session_and_model(self): - - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True + if self.task == "cnn-rnn-ocr": + if self.cpu: + os.environ['CUDA_VISIBLE_DEVICES']='-1' + self.model = load_model(self.model_dir) + self.model = tf.keras.models.Model( + self.model.get_layer(name = "image").input, + self.model.get_layer(name = "dense2").output) + else: + config = tf.compat.v1.ConfigProto() + config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(session) - #tensorflow.keras.layers.custom_layer = PatchEncoder - #tensorflow.keras.layers.custom_layer = Patches - self.model = load_model(self.model_dir , compile=False,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) - #config = tf.ConfigProto() - #config.gpu_options.allow_growth=True - - #self.session = tf.InteractiveSession() - #keras.losses.custom_loss = self.weighted_categorical_crossentropy - #self.model = load_model(self.model_dir , compile=False) + session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() + tensorflow_backend.set_session(session) ##if self.weights_dir!=None: @@ -250,6 +251,30 @@ class sbb_predict: index_class = np.argmax(label_p_pred[0]) print("Predicted Class: {}".format(classes_names[str(int(index_class))])) + elif self.task == "cnn-rnn-ocr": + img=cv2.imread(image_dir) + img = scale_padd_image_for_ocr(img, self.config_params_model['input_height'], self.config_params_model['input_width']) + + img = img / 255. + + with open(os.path.join(self.model_dir, "characters_org.txt"), 'r') as char_txt_f: + characters = json.load(char_txt_f) + + AUTOTUNE = tf.data.AUTOTUNE + + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. 
@@ -250,6 +252,30 @@ class sbb_predict:
             index_class = np.argmax(label_p_pred[0])
             print("Predicted Class: {}".format(classes_names[str(int(index_class))]))
 
+        elif self.task == "cnn-rnn-ocr":
+            img = cv2.imread(image_dir)
+            img = scale_padd_image_for_ocr(img, self.config_params_model['input_height'], self.config_params_model['input_width'])
+
+            img = img / 255.
+
+            with open(os.path.join(self.model_dir, "characters_org.txt"), 'r') as char_txt_f:
+                characters = json.load(char_txt_f)
+
+            # Mapping characters to integers.
+            char_to_num = tf.keras.layers.StringLookup(vocabulary=list(characters), mask_token=None)
+
+            # Mapping integers back to original characters.
+            num_to_char = tf.keras.layers.StringLookup(
+                vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
+            )
+
+            preds = self.model.predict(img.reshape(1, img.shape[0], img.shape[1], img.shape[2]), verbose=0)
+            pred_texts = decode_batch_predictions(preds, num_to_char)
+            pred_texts = pred_texts[0].replace("[UNK]", "")
+            return pred_texts
+
+
         elif self.task == 'reading_order':
             img_height = self.config_params_model['input_height']
             img_width = self.config_params_model['input_width']
@@ -580,6 +606,8 @@ class sbb_predict:
             elif self.task == 'enhancement':
                 if self.save:
                     cv2.imwrite(self.save,res)
+            elif self.task == "cnn-rnn-ocr":
+                print(f"Detected text: {res}")
             else:
                 img_seg_overlayed, only_layout = self.visualize_model_output(res, self.img_org, self.task)
                 if self.save:
@@ -587,9 +615,9 @@ class sbb_predict:
                 if self.save_layout:
                     cv2.imwrite(self.save_layout, only_layout)
 
-                if self.ground_truth:
-                    gt_img=cv2.imread(self.ground_truth)
-                    self.IoU(gt_img[:,:,0],res[:,:,0])
+            if self.ground_truth:
+                gt_img=cv2.imread(self.ground_truth)
+                self.IoU(gt_img[:,:,0],res[:,:,0])
         else:
             ls_images = os.listdir(self.dir_in)
@@ -603,6 +631,8 @@ class sbb_predict:
                 elif self.task == 'enhancement':
                     self.save = os.path.join(self.out, f_name+'.png')
                     cv2.imwrite(self.save,res)
+                elif self.task == "cnn-rnn-ocr":
+                    print(f"Detected text for file name {f_name} is: {res}")
                 else:
                     img_seg_overlayed, only_layout = self.visualize_model_output(res, self.img_org, self.task)
                     self.save = os.path.join(self.out, f_name+'_overlayed.png')
@@ -610,9 +640,9 @@ class sbb_predict:
                     self.save_layout = os.path.join(self.out, f_name+'_layout.png')
                     cv2.imwrite(self.save_layout, only_layout)
 
-                    if self.ground_truth:
-                        gt_img=cv2.imread(self.ground_truth)
-                        self.IoU(gt_img[:,:,0],res[:,:,0])
+                if self.ground_truth:
+                    gt_img=cv2.imread(self.ground_truth)
+                    self.IoU(gt_img[:,:,0],res[:,:,0])
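`decode_batch_predictions` is the project's helper from `eynollah.utils.utils_ocr`; a rough sketch of the greedy CTC decoding it has to perform (not the exact implementation) looks like this:

```python
import numpy as np
import tensorflow as tf

def greedy_ctc_decode(preds, num_to_char):
    """Collapse per-timestep softmax output (batch, timesteps, vocab) into text."""
    # One entry per batch item: all timesteps are valid.
    input_len = np.ones(preds.shape[0]) * preds.shape[1]
    decoded, _ = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)
    texts = []
    for seq in decoded[0].numpy():
        seq = seq[seq >= 0]  # drop the -1 padding that ctc_decode appends
        texts.append(tf.strings.reduce_join(num_to_char(seq)).numpy().decode("utf-8"))
    return texts
```

Indices the inverted `StringLookup` cannot map come back as `[UNK]`, which is why the caller strips that token from the decoded string.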
@@ -668,24 +697,29 @@ class sbb_predict:
     "-xml",
     help="xml file with layout coordinates that reading order detection will be implemented on. The result will be written in the same xml file.",
 )
-
+@click.option(
+    "--cpu",
+    "-cpu",
+    help="For OCR, the default device is the GPU. If this flag is set, inference is performed on the CPU instead.",
+    is_flag=True,
+)
 @click.option(
     "--min_area",
     "-min",
     help="min area size of regions considered for reading order detection. The default value is zero and means that all text regions are considered for reading order.",
 )
-def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, out, min_area):
+def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area):
     assert image or dir_in, "Either a single image -i or a dir_in -di is required"
     with open(os.path.join(model,'config.json')) as f:
         config_params_model = json.load(f)
     task = config_params_model['task']
-    if task != 'classification' and task != 'reading_order':
+    if task != 'classification' and task != 'reading_order' and task != "cnn-rnn-ocr":
         if image and not save:
             print("Error: You used one of segmentation or binarization task with image input but not set -s, you need a filename to save visualized output with -s")
             sys.exit(1)
         if dir_in and not out:
             print("Error: You used one of segmentation or binarization task with dir_in but not set -out")
             sys.exit(1)
 
-    x=sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area)
+    x = sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, cpu, out, min_area)
     x.run()
diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py
index 0a92935..005810f 100644
--- a/src/eynollah/training/utils.py
+++ b/src/eynollah/training/utils.py
@@ -1,7 +1,7 @@
 import os
 import math
 import random
-
+from pathlib import Path
 import cv2
 import numpy as np
 import seaborn as sns
@@ -32,6 +32,9 @@ def scale_padd_image_for_ocr(img, height, width):
     else:
         width_new = width
 
+    if width_new <= 0:
+        width_new = width
+
     img_res= resize_image (img, height, width_new)
     img_fin = np.ones((height, width, 3))*255
@@ -1335,7 +1338,7 @@ def data_gen_ocr(
     # TODO: Why while True + yield, why not return a list?
     while True:
         for i in ls_files_images:
-            f_name = i.split('.')[0]
+            f_name = Path(i).stem
 
             txt_inp = open(os.path.join(dir_train, "labels/"+f_name+'.txt'),'r').read().split('\n')[0]
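`scale_padd_image_for_ocr` resizes a line image to the model's input height while preserving aspect ratio, then pads the width onto a fixed white canvas; the new `width_new <= 0` guard catches degenerate crops whose computed width rounds to zero. Roughly, the logic is as follows (a sketch; the real helper lives in `training/utils.py` and also handles normalization elsewhere):

```python
import numpy as np
import cv2

def scale_and_pad(img, height, width):
    # Scale to the target height, keeping the aspect ratio.
    width_new = int(img.shape[1] * height / float(img.shape[0]))
    if width_new > width:    # too wide for the canvas: fall back to full width
        width_new = width
    if width_new <= 0:       # degenerate crop: avoid a zero-width resize
        width_new = width
    img_res = cv2.resize(img, (width_new, height))
    canvas = np.ones((height, width, 3), dtype=img.dtype) * 255  # white padding
    canvas[:, :width_new] = img_res[:, :]
    return canvas
```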
diff --git a/src/eynollah/training/weights_ensembling.py b/src/eynollah/training/weights_ensembling.py
new file mode 100644
index 0000000..6dce7fd
--- /dev/null
+++ b/src/eynollah/training/weights_ensembling.py
@@ -0,0 +1,117 @@
+import os
+import sys
+import shutil
+from os import environ, devnull
+
+import numpy as np
+
+environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+stderr = sys.stderr
+sys.stderr = open(devnull, 'w')
+import tensorflow as tf
+from tensorflow.keras.models import load_model
+from tensorflow.python.keras import backend as tensorflow_backend
+sys.stderr = stderr
+from tensorflow.keras import layers
+
+import click
+
+
+class Patches(layers.Layer):
+    def __init__(self, patch_size_x, patch_size_y, **kwargs):
+        super(Patches, self).__init__(**kwargs)
+        self.patch_size_x = patch_size_x
+        self.patch_size_y = patch_size_y
+
+    def call(self, images):
+        batch_size = tf.shape(images)[0]
+        patches = tf.image.extract_patches(
+            images=images,
+            sizes=[1, self.patch_size_y, self.patch_size_x, 1],
+            strides=[1, self.patch_size_y, self.patch_size_x, 1],
+            rates=[1, 1, 1, 1],
+            padding="VALID",
+        )
+        patch_dims = tf.shape(patches)[-1]
+        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
+        return patches
+
+    def get_config(self):
+        config = super().get_config().copy()
+        config.update({
+            'patch_size_x': self.patch_size_x,
+            'patch_size_y': self.patch_size_y,
+        })
+        return config
+
+
+class PatchEncoder(layers.Layer):
+    def __init__(self, num_patches, projection_dim, **kwargs):
+        super(PatchEncoder, self).__init__(**kwargs)
+        self.num_patches = num_patches
+        self.projection_dim = projection_dim
+        self.projection = layers.Dense(units=projection_dim)
+        self.position_embedding = layers.Embedding(
+            input_dim=num_patches, output_dim=projection_dim
+        )
+
+    def call(self, patch):
+        positions = tf.range(start=0, limit=self.num_patches, delta=1)
+        encoded = self.projection(patch) + self.position_embedding(positions)
+        return encoded
+
+    def get_config(self):
+        config = super().get_config().copy()
+        config.update({
+            'num_patches': self.num_patches,
+            'projection_dim': self.projection_dim,
+        })
+        return config
+
+
+def start_new_session():
+    config = tf.compat.v1.ConfigProto()
+    config.gpu_options.allow_growth = True
+
+    session = tf.compat.v1.Session(config=config)  # tf.InteractiveSession()
+    tensorflow_backend.set_session(session)
+    return session
+
+
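`Patches` and `PatchEncoder` are re-declared in this new module (duplicating `training/models.py`) only so that `load_model` can rebuild transformer-based checkpoints; importing them from `.models` instead would avoid the two definitions drifting apart. The loading side in isolation (the checkpoint path below is a placeholder):

```python
from tensorflow.keras.models import load_model

# custom_objects maps the class names stored in the saved model config
# back to local Python classes; "models_to_average/model_1" is hypothetical.
model = load_model(
    "models_to_average/model_1",
    compile=False,
    custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches},
)
```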
+def run_ensembling(dir_models, out):
+    ls_models = os.listdir(dir_models)
+
+    weights = []
+    for model_name in ls_models:
+        model = load_model(os.path.join(dir_models, model_name), compile=False, custom_objects={'PatchEncoder': PatchEncoder, 'Patches': Patches})
+        weights.append(model.get_weights())
+
+    # Average the collected weights tensor-by-tensor across all models.
+    new_weights = list()
+    for weights_list_tuple in zip(*weights):
+        new_weights.append(
+            [np.array(weights_).mean(axis=0)
+             for weights_ in zip(*weights_list_tuple)])
+
+    new_weights = [np.array(x) for x in new_weights]
+
+    model.set_weights(new_weights)
+    model.save(out)
+    shutil.copy(os.path.join(dir_models, model_name, "config.json"), out)
+
+
+@click.command()
+@click.option(
+    "--dir_models",
+    "-dm",
+    help="directory of models to be ensembled.",
+    type=click.Path(exists=True, file_okay=False),
+)
+@click.option(
+    "--out",
+    "-o",
+    help="output directory where the ensembled model will be written.",
+    type=click.Path(exists=False, file_okay=False),
+)
+def main(dir_models, out):
+    run_ensembling(dir_models, out)
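`run_ensembling` averages the checkpoints weight-by-weight: the outer `zip(*weights)` groups the k-th weight tensor of every model together, and the inner `zip` iterates over the first axis so the mean is taken across models element-wise. The arithmetic on toy arrays (values invented):

```python
import numpy as np

# Per-model weight lists: two "models", each with one kernel and one bias.
weights = [
    [np.array([[1.0, 2.0]]), np.array([0.0])],   # model A
    [np.array([[3.0, 6.0]]), np.array([2.0])],   # model B
]

new_weights = []
for weights_list_tuple in zip(*weights):          # one tuple per weight tensor
    new_weights.append(
        [np.array(w).mean(axis=0) for w in zip(*weights_list_tuple)])
new_weights = [np.array(x) for x in new_weights]

print(new_weights[0])  # [[2. 4.]] -- element-wise mean of the kernels
print(new_weights[1])  # [1.]      -- element-wise mean of the biases
```

With the wiring added in `cli.py` (first file of this patch), the tool is reachable as the `ensembling` subcommand of the training CLI, e.g. `... ensembling -dm saved_models/ -o ensembled_model/`; the exact entry-point name depends on how the package's console scripts are installed.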