flow from directory

2026-01-05 18:07:01 +01:00 · 2022-03-29 06:55:19 -04:00 · 2022-03-29 06:55:19 -04:00 · c606391c31
commit c606391c31
parent cf5ef8f5ae
4 changed files with 767 additions and 437 deletions
--- a/qurator/eynollah/cli.py
+++ b/qurator/eynollah/cli.py
@ -10,7 +10,6 @@ from qurator.eynollah.eynollah import Eynollah
    "-i",
    help="image filename",
    type=click.Path(exists=True, dir_okay=False),
-    required=True,
 )
@click.option(
    "--out",
@ -19,6 +18,12 @@ from qurator.eynollah.eynollah import Eynollah
    type=click.Path(exists=True, file_okay=False),
    required=True,
 )
+@click.option(
+    "--dir_in",
+    "-di",
+    help="directory of images",
+    type=click.Path(exists=True, file_okay=False),
+)
@click.option(
    "--model",
    "-m",
@ -112,6 +117,7 @@ from qurator.eynollah.eynollah import Eynollah
 def main(
    image,
    out,
+    dir_in,
    model,
    save_images,
    save_layout,
@ -140,6 +146,7 @@ def main(
    eynollah = Eynollah(
        image_filename=image,
        dir_out=out,
+        dir_in=dir_in,
        dir_models=model,
        dir_of_cropped_images=save_images,
        dir_of_layout=save_layout,
@ -155,8 +162,9 @@ def main(
        headers_off=headers_off,
        light_version=light_version,
    )
-    pcgts = eynollah.run()
-    eynollah.writer.write_pagexml(pcgts)
+    eynollah.run()
+    #pcgts = eynollah.run()
+    ##eynollah.writer.write_pagexml(pcgts)

 if __name__ == "__main__":
    main()
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
--- a/qurator/eynollah/utils/init.py
+++ b/qurator/eynollah/utils/init.py
@ -797,6 +797,76 @@ def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(layout_in_patch):
    return layout_in_patch

 def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered):
+    """Split text-region contours into a "main" group and a "head" group.
+
+    Each contour of ``contours_only_text_parent`` is rasterised onto a mask of
+    the same height/width as ``regions_model_1``.  A contour goes to the
+    "head" group when both conditions hold:
+
+    * at least half of its pixels have the value 2 in channel 0 of
+      ``regions_model_full``, and
+    * ``(x_max - x_min) / (y_max - y_min) >= 1.3`` for the extents reported
+      by ``find_new_features_of_contours``.
+
+    For a "head" contour, the pixels of ``regions_model_1`` that lie inside
+    the contour and currently equal 1 are rewritten to 2.  Every other
+    contour goes to the "main" group and leaves ``regions_model_1`` untouched.
+
+    ``all_box_coord``, ``all_found_texline_polygons`` and ``slopes`` are
+    indexed in parallel with ``contours_only_text_parent`` and are split the
+    same way.  ``contours_only_text_parent_d_ordered`` is split as well unless
+    it is ``None``, in which case both ``*_d`` result lists stay empty.
+
+    NOTE: ``regions_model_1`` is modified in place and also returned.
+
+    Returns an 11-tuple:
+    ``(regions_model_1, contours_main, contours_head, box_coord_main,
+    box_coord_head, texline_polygons_main, texline_polygons_head,
+    slopes_main, slopes_head, contours_main_d, contours_head_d)``.
+    """
+    # Only the per-contour x/y extents are needed from the helper's result.
+    _, _, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(contours_only_text_parent)
+
+    length_con = x_max_main - x_min_main
+    height_con = y_max_main - y_min_main
+
+    all_found_texline_polygons_main = []
+    all_found_texline_polygons_head = []
+
+    all_box_coord_main = []
+    all_box_coord_head = []
+
+    slopes_main = []
+    slopes_head = []
+
+    contours_only_text_parent_main = []
+    contours_only_text_parent_head = []
+
+    contours_only_text_parent_main_d = []
+    contours_only_text_parent_head_d = []
+
+    # Loop-invariant: regions_model_full is never written to in this function.
+    header_pixels_full = regions_model_full[:, :, 0] == 2
+    mask_shape = (regions_model_1.shape[0], regions_model_1.shape[1])
+
+    for ii, con in enumerate(contours_only_text_parent):
+        # A single-channel uint8 mask selects the same pixels as the former
+        # H x W x 3 float64 canvas at a fraction of the memory.
+        mask = np.zeros(mask_shape, dtype=np.uint8)
+        mask = cv2.fillPoly(mask, pts=[con], color=255)
+        inside = mask == 255
+
+        all_pixels = np.count_nonzero(inside)
+        pixels_header = np.count_nonzero(inside & header_pixels_full)
+        pixels_main = all_pixels - pixels_header
+
+        if (pixels_header >= pixels_main) and ((length_con[ii] / float(height_con[ii])) >= 1.3):
+            regions_model_1[(regions_model_1 == 1) & inside] = 2
+            contours_only_text_parent_head.append(con)
+            if contours_only_text_parent_d_ordered is not None:
+                contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii])
+            all_box_coord_head.append(all_box_coord[ii])
+            slopes_head.append(slopes[ii])
+            all_found_texline_polygons_head.append(all_found_texline_polygons[ii])
+        else:
+            # Pixels labelled 1 inside a "main" contour keep the value 1, so
+            # regions_model_1 needs no update here.
+            contours_only_text_parent_main.append(con)
+            if contours_only_text_parent_d_ordered is not None:
+                contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii])
+            all_box_coord_main.append(all_box_coord[ii])
+            slopes_main.append(slopes[ii])
+            all_found_texline_polygons_main.append(all_found_texline_polygons[ii])
+
+    return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d
+
+
+def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered):
+    
+    ### to make it faster
+    h_o = regions_model_1.shape[0]
+    w_o = regions_model_1.shape[1]
+    
+    regions_model_1 = cv2.resize(regions_model_1, (int(regions_model_1.shape[1]/3.), int(regions_model_1.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
+    regions_model_full = cv2.resize(regions_model_full, (int(regions_model_full.shape[1]/3.), int(regions_model_full.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
+    contours_only_text_parent = [ (i/3.).astype(np.int32) for i in  contours_only_text_parent]
+
+    ###
+    
    cx_main,cy_main ,x_min_main , x_max_main, y_min_main ,y_max_main,y_corr_x_min_from_argmin=find_new_features_of_contours(contours_only_text_parent)

    length_con=x_max_main-x_min_main
@ -853,8 +923,14 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions



-        #plt.imshow(img[:,:,0])
-        #plt.show()
+    ### to make it faster
+    
+    regions_model_1 = cv2.resize(regions_model_1, (w_o, h_o), interpolation=cv2.INTER_NEAREST)
+    #regions_model_full = cv2.resize(img, (int(regions_model_full.shape[1]/3.), int(regions_model_full.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
+    contours_only_text_parent_head = [ (i*3.).astype(np.int32) for i in  contours_only_text_parent_head]
+    contours_only_text_parent_main = [ (i*3.).astype(np.int32) for i in  contours_only_text_parent_main]
+    ###
+    
    return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d

 def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col):
--- a/qurator/eynollah/utils/contour.py
+++ b/qurator/eynollah/utils/contour.py
@ -3,7 +3,8 @@ import numpy as np
 from shapely import geometry

 from .rotate import rotate_image, rotation_image_new
-
+from multiprocessing import Process, Queue, cpu_count
+from multiprocessing import Pool
 def contours_in_same_horizon(cy_main_hor):
    X1 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
    X2 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
@ -147,6 +148,96 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002):

    return contours_imgs

+def do_work_of_contours_in_image(queue_of_all_params, contours_per_process, indexes_r_con_per_pro, img, slope_first):
+    """Worker body: transform one slice of contours and report it on a queue.
+
+    Every contour in ``contours_per_process`` is passed through
+    ``loop_contour_image`` (fill, rotate by ``-slope_first``, re-extract,
+    shift).  When the whole slice is done, a single two-element list
+    ``[contours, indexes]`` is put on ``queue_of_all_params``.  ``indexes``
+    holds the matching entries of ``indexes_r_con_per_pro`` so the consumer
+    can tell which input contour each result belongs to.
+
+    Nothing is returned; the queue is the only output channel.
+    """
+    cnts_org_per_each_subprocess = []
+    index_by_text_region_contours = []
+    for mv in range(len(contours_per_process)):
+        index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
+        # The per-contour work is identical to loop_contour_image, so reuse
+        # it instead of keeping a second copy of the same statements.
+        cnts_org_per_each_subprocess.append(loop_contour_image(mv, contours_per_process, img, slope_first))
+
+    queue_of_all_params.put([cnts_org_per_each_subprocess, index_by_text_region_contours])
+
+
+
+
+def get_textregion_contours_in_org_image_multi(cnts, img, slope_first):
+    """Transform all contours in parallel, one worker process per CPU core.
+
+    ``cnts`` is cut into ``cpu_count()`` consecutive slices.  Each slice is
+    handled by ``do_work_of_contours_in_image`` in its own ``Process``, which
+    reports ``[contours, indexes]`` through a shared ``Queue``.
+
+    Returns a list with one transformed contour per entry of ``cnts``, in the
+    same order as ``cnts``.
+    """
+    num_cores = cpu_count()
+    queue_of_all_params = Queue()
+
+    nh = np.linspace(0, len(cnts), num_cores + 1)
+    indexes_by_text_con = np.arange(len(cnts))
+
+    processes = []
+    for i in range(num_cores):
+        start, stop = int(nh[i]), int(nh[i + 1])
+        contours_per_process = cnts[start:stop]
+        indexes_text_con_per_process = indexes_by_text_con[start:stop]
+
+        processes.append(Process(target=do_work_of_contours_in_image, args=(queue_of_all_params, contours_per_process, indexes_text_con_per_process, img, slope_first)))
+    for process in processes:
+        process.start()
+
+    # Workers finish in arbitrary order, so results arrive on the queue in
+    # arbitrary order too.  Place each contour at the position of the input
+    # contour it was computed from, so the output lines up with ``cnts``.
+    cnts_org = [None] * len(cnts)
+    for _ in processes:
+        contours_for_sub_process, indexes_for_sub_process = queue_of_all_params.get(True)
+        for index, contour in zip(indexes_for_sub_process, contours_for_sub_process):
+            cnts_org[index] = contour
+
+    # Join only after the queue has been drained: a process that still has
+    # items buffered for the queue does not terminate until they are consumed.
+    for process in processes:
+        process.join()
+
+    return cnts_org
+def loop_contour_image(index_l, cnts,img, slope_first):
+    """Transform the single contour ``cnts[index_l]``.
+
+    Steps:
+      1. fill the contour with ones on a zero canvas shaped like ``img``;
+      2. rotate that canvas by ``-slope_first`` with ``rotation_image_new``;
+      3. binarise the result and extract its contours again;
+      4. shift the first extracted contour in x and y by the absolute size
+         difference between the rotated canvas and ``img``.
+
+    Returns the first contour reported by ``cv2.findContours`` after the
+    shift has been applied to it in place.
+    """
+    canvas = cv2.fillPoly(np.zeros(img.shape), pts=[cnts[index_l]], color=(1, 1, 1))
+    rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)
+
+    gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 0, 255, 0)
+    found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+
+    # Offsets compensate for the canvas changing size during rotation.
+    x_shift = np.abs(rotated.shape[1] - img.shape[1])
+    y_shift = np.abs(rotated.shape[0] - img.shape[0])
+
+    first = found[0]
+    first[:, 0, 0] = first[:, 0, 0] + x_shift
+    first[:, 0, 1] = first[:, 0, 1] + y_shift
+    return first
+
+
+def get_textregion_contours_in_org_image_multi2(cnts, img, slope_first):
+    """Transform all contours in parallel using a ``multiprocessing.Pool``.
+
+    Each contour is handled by ``loop_contour_image`` in a pool of
+    ``cpu_count()`` workers.  ``Pool.starmap`` returns results in task order,
+    so the returned list lines up with ``cnts``.
+
+    Returns a list with one transformed contour per entry of ``cnts``; an
+    empty input yields an empty list without starting any worker.
+    """
+    if len(cnts) == 0:
+        return []
+
+    # Give every task only its own contour (as a one-element list at index 0)
+    # rather than the complete ``cnts`` list, so the data pickled per task no
+    # longer grows with the number of contours.
+    tasks = [(0, [contour], img, slope_first) for contour in cnts]
+    with Pool(cpu_count()) as p:
+        cnts_org = p.starmap(loop_contour_image, tasks)
+
+    return cnts_org
+
 def get_textregion_contours_in_org_image(cnts, img, slope_first):

    cnts_org = []
@ -175,11 +266,43 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first):
        # print(np.shape(cont_int[0]))
        cnts_org.append(cont_int[0])

-    # print(cnts_org,'cnts_org')
+    return cnts_org
+
+def get_textregion_contours_in_org_image_light(cnts, img, slope_first):
+    """Transform contours on a 3x downscaled copy of ``img``.
+
+    ``img`` is resized to one third of its width and height (nearest
+    neighbour) and every contour is divided by 3 and cast to int32.  Each
+    downscaled contour is then filled on a zero canvas, rotated by
+    ``-slope_first`` with ``rotation_image_new``, re-extracted, and shifted
+    by the absolute size difference between the rotated canvas and the
+    downscaled image.  The result is multiplied by 3 before being collected.
+
+    Returns a list with one contour per entry of ``cnts``.
+    """
+    small_size = (int(img.shape[1]/3.), int(img.shape[0]/3.))
+    img = cv2.resize(img, small_size, interpolation=cv2.INTER_NEAREST)
+    cnts = [(contour/ 3).astype(np.int32) for contour in cnts]
+
+    cnts_org = []
+    for small_contour in cnts:
+        canvas = cv2.fillPoly(np.zeros(img.shape), pts=[small_contour], color=(1, 1, 1))
+        rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)
+
+        gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
+        _, binary = cv2.threshold(gray, 0, 255, 0)
+        found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+
+        first = found[0]
+        first[:, 0, 0] = first[:, 0, 0] + np.abs(rotated.shape[1] - img.shape[1])
+        first[:, 0, 1] = first[:, 0, 1] + np.abs(rotated.shape[0] - img.shape[0])
+        # Scale the coordinates back up to the resolution of the input.
+        cnts_org.append(first*3)

-    # sys.exit()
-    # self.y_shift = np.abs(img_copy.shape[0] - img.shape[0])
-    # self.x_shift = np.abs(img_copy.shape[1] - img.shape[1])
    return cnts_org

 def return_contours_of_interested_textline(region_pre_p, pixel):