machine based reading order training dataset generator is added

2025-10-09 07:40:00 +02:00 · 2024-05-24 14:42:58 +02:00 · 2024-05-24 14:42:58 +02:00 · bf1468391a
commit bf1468391a
parent f5746011f6
4 changed files with 1433 additions and 820 deletions
--- a/generate_gt_for_training.py
+++ b/generate_gt_for_training.py
@ -0,0 +1,194 @@
+import click
+import json
+from gt_gen_utils import *
+from tqdm import tqdm
+
+@click.group()
+def main():
+    pass
+
+@main.command()
+@click.option(
+    "--dir_xml",
+    "-dx",
+    help="directory of GT page-xml files",
+    type=click.Path(exists=True, file_okay=False),
+)
+@click.option(
+    "--dir_out",
+    "-do",
+    help="directory where ground truth images would be written",
+    type=click.Path(exists=True, file_okay=False),
+)
+
+@click.option(
+    "--config",
+    "-cfg",
+    help="config file of prefered layout or use case.",
+    type=click.Path(exists=True, dir_okay=False),
+)
+
+@click.option(
+    "--type_output",
+    "-to",
+    help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.",
+)
+
+def pagexml2label(dir_xml,dir_out,type_output,config):
+    if config:
+        with open(config) as f:
+            config_params = json.load(f)
+    else:
+        print("passed")
+        config_params = None
+    gt_list = get_content_of_dir(dir_xml)
+    get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params)
+    
+@main.command()
+@click.option(
+    "--dir_imgs",
+    "-dis",
+    help="directory of images with high resolution.",
+    type=click.Path(exists=True, file_okay=False),
+)
+@click.option(
+    "--dir_out_images",
+    "-dois",
+    help="directory where degraded images will be written.",
+    type=click.Path(exists=True, file_okay=False),
+)
+
+@click.option(
+    "--dir_out_labels",
+    "-dols",
+    help="directory where original images will be written as labels.",
+    type=click.Path(exists=True, file_okay=False),
+)
+def image_enhancement(dir_imgs, dir_out_images, dir_out_labels):
+    #dir_imgs = './training_data_sample_enhancement/images'
+    #dir_out_images = './training_data_sample_enhancement/images_gt'
+    #dir_out_labels = './training_data_sample_enhancement/labels_gt'
+
+    ls_imgs = os.listdir(dir_imgs)
+    ls_scales = [ 0.5, 0.55, 0.6, 0.65, 0.7, 0.75,  0.8, 0.85,  0.9]
+
+    for img in tqdm(ls_imgs):
+        img_name = img.split('.')[0]
+        img_type = img.split('.')[1]
+        image = cv2.imread(os.path.join(dir_imgs, img))
+        for i, scale in enumerate(ls_scales):
+            height_sc = int(image.shape[0]*scale)
+            width_sc = int(image.shape[1]*scale)
+            
+            image_down_scaled = resize_image(image, height_sc, width_sc)
+            image_back_to_org_scale = resize_image(image_down_scaled, image.shape[0], image.shape[1])
+            
+            cv2.imwrite(os.path.join(dir_out_images, img_name+'_'+str(i)+'.'+img_type), image_back_to_org_scale)
+            cv2.imwrite(os.path.join(dir_out_labels, img_name+'_'+str(i)+'.'+img_type), image)
+    
+    
+@main.command()
+@click.option(
+    "--dir_xml",
+    "-dx",
+    help="directory of GT page-xml files",
+    type=click.Path(exists=True, file_okay=False),
+)
+
+@click.option(
+    "--dir_out_modal_image",
+    "-domi",
+    help="directory where ground truth images would be written",
+    type=click.Path(exists=True, file_okay=False),
+)
+
+@click.option(
+    "--dir_out_classes",
+    "-docl",
+    help="directory where ground truth classes would be written",
+    type=click.Path(exists=True, file_okay=False),
+)
+
+@click.option(
+    "--input_height",
+    "-ih",
+    help="input_height",
+)
+@click.option(
+    "--input_width",
+    "-iw",
+    help="input_width",
+)
+
+def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width):
+    xml_files_ind = os.listdir(dir_xml)
+    input_height = int(input_height)
+    input_width = int(input_width)
+
+    indexer_start= 0#55166
+    max_area = 1
+    min_area = 0.0001
+
+    for ind_xml in tqdm(xml_files_ind):
+        indexer = 0
+        #print(ind_xml)
+        #print('########################')
+        xml_file = os.path.join(dir_xml,ind_xml )
+        f_name = ind_xml.split('.')[0]
+        file_name, id_paragraph, id_header,co_text_paragraph,\
+        co_text_header,tot_region_ref,x_len, y_len,index_tot_regions,img_poly = read_xml(xml_file)
+        
+        id_all_text = id_paragraph + id_header
+        co_text_all = co_text_paragraph + co_text_header
+        
+        
+        _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(co_text_header)
+        
+        img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8')
+
+        for j in range(len(cy_main)):
+            img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1
+
+
+        texts_corr_order_index  = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ]
+        texts_corr_order_index_int = [int(x) for x in texts_corr_order_index]
+        
+
+        co_text_all, texts_corr_order_index_int = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area)
+        
+        arg_array = np.array(range(len(texts_corr_order_index_int)))
+        
+        labels_con = np.zeros((y_len,x_len,len(arg_array)),dtype='uint8')
+        for i in range(len(co_text_all)):
+            img_label = np.zeros((y_len,x_len,3),dtype='uint8')
+            img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1))
+            
+            img_label[:,:,0][img_poly[:,:,0]==5] = 2
+            img_label[:,:,0][img_header_and_sep[:,:]==1] = 3
+            
+            labels_con[:,:,i] = img_label[:,:,0]
+        
+        for i in range(len(texts_corr_order_index_int)):
+            for j in range(len(texts_corr_order_index_int)):
+                if i!=j:
+                    input_matrix = np.zeros((input_height,input_width,3)).astype(np.int8)
+                    final_f_name = f_name+'_'+str(indexer+indexer_start)
+                    order_class_condition = texts_corr_order_index_int[i]-texts_corr_order_index_int[j]
+                    if order_class_condition<0:
+                        class_type = 1
+                    else:
+                        class_type = 0
+
+                    input_matrix[:,:,0] = resize_image(labels_con[:,:,i], input_height, input_width)
+                    input_matrix[:,:,1] = resize_image(img_poly[:,:,0], input_height, input_width)
+                    input_matrix[:,:,2] = resize_image(labels_con[:,:,j], input_height, input_width)
+
+                    np.save(os.path.join(dir_out_classes,final_f_name+'.npy' ), class_type)
+                    
+                    cv2.imwrite(os.path.join(dir_out_modal_image,final_f_name+'.png' ), input_matrix)
+                    indexer = indexer+1
+
+    
+    
+if __name__ == "__main__":
+    main()