flow from directory

2025-12-20 18:14:14 +01:00 · 2022-03-29 06:55:19 -04:00 · 2022-03-29 06:55:19 -04:00 · c606391c31
commit c606391c31
parent cf5ef8f5ae
4 changed files with 767 additions and 437 deletions
--- a/qurator/eynollah/cli.py
+++ b/qurator/eynollah/cli.py
@ -10,7 +10,6 @@ from qurator.eynollah.eynollah import Eynollah
    "-i",
    help="image filename",
    type=click.Path(exists=True, dir_okay=False),
    required=True,
 )
@click.option(
    "--out",
@ -19,6 +18,12 @@ from qurator.eynollah.eynollah import Eynollah
    type=click.Path(exists=True, file_okay=False),
    required=True,
 )
@click.option(
    "--dir_in",
    "-di",
    help="directory of images",
    type=click.Path(exists=True, file_okay=False),
 )
@click.option(
    "--model",
    "-m",
@ -112,6 +117,7 @@ from qurator.eynollah.eynollah import Eynollah
 def main(
    image,
    out,
    dir_in,
    model,
    save_images,
    save_layout,
@ -140,6 +146,7 @@ def main(
    eynollah = Eynollah(
        image_filename=image,
        dir_out=out,
        dir_in=dir_in,
        dir_models=model,
        dir_of_cropped_images=save_images,
        dir_of_layout=save_layout,
@ -155,8 +162,9 @@ def main(
        headers_off=headers_off,
        light_version=light_version,
    )
-    pcgts = eynollah.run()
+    eynollah.run()
-    eynollah.writer.write_pagexml(pcgts)
+    #pcgts = eynollah.run()
    ##eynollah.writer.write_pagexml(pcgts)
 if __name__ == "__main__":
    main()
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
--- a/qurator/eynollah/utils/init.py
+++ b/qurator/eynollah/utils/init.py
@ -797,6 +797,76 @@ def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(layout_in_patch):
    return layout_in_patch
 def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered):
    """Split text-region contours into a "main" group and a "head" group.

    Each contour is filled onto a blank canvas with the height and width of
    ``regions_model_1``. Filled pixels whose channel 0 in
    ``regions_model_full`` equals 2 are counted as header pixels; all other
    filled pixels are counted as main pixels. A contour goes to the head
    group when header pixels are at least as many as main pixels AND the
    ratio ``(x_max - x_min) / (y_max - y_min)`` reported by
    ``find_new_features_of_contours`` is >= 1.3. Otherwise it goes to the
    main group.

    ``regions_model_1`` is modified in place: pixels inside a head contour
    that currently hold 1 are rewritten to 2.

    The per-contour entries of ``all_box_coord``, ``slopes`` and
    ``all_found_texline_polygons`` follow their contour into the same
    group. ``contours_only_text_parent_d_ordered`` is split the same way
    when it is not None; otherwise both ``*_d`` result lists stay empty.

    Returns an 11-tuple: ``regions_model_1``, main contours, head contours,
    main boxes, head boxes, main textline polygons, head textline polygons,
    main slopes, head slopes, main ``_d`` contours, head ``_d`` contours.
    """
    # Only the x/y extents are used; the remaining features are discarded.
    _, _, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(contours_only_text_parent)

    length_con = x_max_main - x_min_main
    height_con = y_max_main - y_min_main

    contours_only_text_parent_main, contours_only_text_parent_head = [], []
    contours_only_text_parent_main_d, contours_only_text_parent_head_d = [], []
    all_box_coord_main, all_box_coord_head = [], []
    slopes_main, slopes_head = [], []
    all_found_texline_polygons_main, all_found_texline_polygons_head = [], []

    for idx, contour in enumerate(contours_only_text_parent):
        canvas = np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3))
        canvas = cv2.fillPoly(canvas, pts=[contour], color=(255, 255, 255))
        inside = canvas[:, :, 0] == 255

        all_pixels = (inside * 1).sum()
        pixels_header = ((inside & (regions_model_full[:, :, 0] == 2)) * 1).sum()
        pixels_main = all_pixels - pixels_header

        is_header = (pixels_header >= pixels_main) and ((length_con[idx] / float(height_con[idx])) >= 1.3)

        # Only pixels currently equal to 1 are touched, so the non-header
        # case rewrites 1 -> 1 and leaves the array values unchanged.
        new_label = 2 if is_header else 1
        regions_model_1[:, :][(regions_model_1[:, :] == 1) & inside] = new_label

        if is_header:
            contour_group = contours_only_text_parent_head
            ordered_group = contours_only_text_parent_head_d
            box_group = all_box_coord_head
            slope_group = slopes_head
            polygon_group = all_found_texline_polygons_head
        else:
            contour_group = contours_only_text_parent_main
            ordered_group = contours_only_text_parent_main_d
            box_group = all_box_coord_main
            slope_group = slopes_main
            polygon_group = all_found_texline_polygons_main

        contour_group.append(contour)
        if contours_only_text_parent_d_ordered is not None:
            ordered_group.append(contours_only_text_parent_d_ordered[idx])
        box_group.append(all_box_coord[idx])
        slope_group.append(slopes[idx])
        polygon_group.append(all_found_texline_polygons[idx])

    return (
        regions_model_1,
        contours_only_text_parent_main,
        contours_only_text_parent_head,
        all_box_coord_main,
        all_box_coord_head,
        all_found_texline_polygons_main,
        all_found_texline_polygons_head,
        slopes_main,
        slopes_head,
        contours_only_text_parent_main_d,
        contours_only_text_parent_head_d,
    )
 def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered):
    ### to make it faster
    h_o = regions_model_1.shape[0]
    w_o = regions_model_1.shape[1]
    regions_model_1 = cv2.resize(regions_model_1, (int(regions_model_1.shape[1]/3.), int(regions_model_1.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
    regions_model_full = cv2.resize(regions_model_full, (int(regions_model_full.shape[1]/3.), int(regions_model_full.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
    contours_only_text_parent = [ (i/3.).astype(np.int32) for i in  contours_only_text_parent]
    ###
    cx_main,cy_main ,x_min_main , x_max_main, y_min_main ,y_max_main,y_corr_x_min_from_argmin=find_new_features_of_contours(contours_only_text_parent)
    length_con=x_max_main-x_min_main
@ -853,8 +923,14 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions
-        #plt.imshow(img[:,:,0])
+    ### to make it faster
-        #plt.show()
+    
    regions_model_1 = cv2.resize(regions_model_1, (w_o, h_o), interpolation=cv2.INTER_NEAREST)
    #regions_model_full = cv2.resize(img, (int(regions_model_full.shape[1]/3.), int(regions_model_full.shape[0]/3.)), interpolation=cv2.INTER_NEAREST)
    contours_only_text_parent_head = [ (i*3.).astype(np.int32) for i in  contours_only_text_parent_head]
    contours_only_text_parent_main = [ (i*3.).astype(np.int32) for i in  contours_only_text_parent_main]
    ###
    return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d
 def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col):
--- a/qurator/eynollah/utils/contour.py
+++ b/qurator/eynollah/utils/contour.py
@ -3,7 +3,8 @@ import numpy as np
 from shapely import geometry
 from .rotate import rotate_image, rotation_image_new
-
+from multiprocessing import Process, Queue, cpu_count
 from multiprocessing import Pool
 def contours_in_same_horizon(cy_main_hor):
    X1 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
    X2 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
@ -147,6 +148,96 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002):
    return contours_imgs
 def do_work_of_contours_in_image(queue_of_all_params, contours_per_process, indexes_r_con_per_pro, img, slope_first):
    """Worker: re-trace a batch of contours after rotating them by ``-slope_first``.

    For every contour in ``contours_per_process`` the contour is filled onto
    a zero canvas shaped like ``img``, the canvas is rotated with
    ``rotation_image_new``, and the first contour found in the rotated
    canvas is shifted by the absolute size difference between the rotated
    canvas and ``img``.

    Nothing is returned. A single two-element list
    ``[contours, indexes]`` is put on ``queue_of_all_params``, where
    ``indexes`` are the matching entries of ``indexes_r_con_per_pro``.
    """
    # assumes img is 3-channel, since the canvas is converted with
    # COLOR_BGR2GRAY below — TODO confirm against callers
    traced_contours = []
    region_indexes = []

    for pos, contour in enumerate(contours_per_process):
        region_indexes.append(indexes_r_con_per_pro[pos])

        canvas = cv2.fillPoly(np.zeros(img.shape), pts=[contour], color=(1, 1, 1))
        rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)

        gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, 0)
        found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        # Shift x/y by how much the rotated canvas differs in size from img.
        first = found[0]
        first[:, 0, 0] = first[:, 0, 0] + np.abs(rotated.shape[1] - img.shape[1])
        first[:, 0, 1] = first[:, 0, 1] + np.abs(rotated.shape[0] - img.shape[0])
        traced_contours.append(first)

    queue_of_all_params.put([traced_contours, region_indexes])
 def get_textregion_contours_in_org_image_multi(cnts, img, slope_first):
    """Re-trace ``cnts`` in parallel processes and return them in input order.

    ``cnts`` is split into contiguous slices, one per worker process. Each
    worker runs ``do_work_of_contours_in_image`` on its slice and puts
    ``[contours, indexes]`` on a shared queue.

    Workers finish in arbitrary order, so the batches arrive on the queue
    in arbitrary order too. The indexes reported by each worker are used to
    put the contours back into the order of ``cnts``, so the i-th result
    corresponds to ``cnts[i]``.

    Returns a list with one re-traced contour per entry of ``cnts``; an
    empty list when ``cnts`` is empty (no process is started in that case).
    """
    total = len(cnts)
    if total == 0:
        return []

    # Never start more workers than there are contours to process.
    num_workers = min(cpu_count(), total)
    queue_of_all_params = Queue()

    bounds = np.linspace(0, total, num_workers + 1)
    indexes_by_text_con = np.arange(total)

    processes = []
    for i in range(num_workers):
        start, stop = int(bounds[i]), int(bounds[i + 1])
        processes.append(
            Process(
                target=do_work_of_contours_in_image,
                args=(queue_of_all_params, cnts[start:stop], indexes_by_text_con[start:stop], img, slope_first),
            )
        )

    for process in processes:
        process.start()

    # Drain the queue BEFORE joining: a child that still has buffered queue
    # data does not terminate until that data has been consumed.
    # NOTE(review): a worker that dies without calling put() leaves this
    # get() blocked forever — same as before this change.
    indexed_contours = []
    for _ in range(num_workers):
        contours_for_sub_process, indexes_for_sub_process = queue_of_all_params.get(True)
        indexed_contours.extend(zip(indexes_for_sub_process, contours_for_sub_process))

    for process in processes:
        process.join()

    # Restore the original ordering of cnts.
    indexed_contours.sort(key=lambda pair: pair[0])
    return [contour for _, contour in indexed_contours]
 def loop_contour_image(index_l, cnts,img, slope_first):
    """Re-trace the single contour ``cnts[index_l]`` after rotating by ``-slope_first``.

    The contour is filled onto a zero canvas shaped like ``img``, the
    canvas is rotated with ``rotation_image_new``, and contours are
    extracted from the rotated canvas. The first extracted contour is
    shifted in x and y by the absolute size difference between the rotated
    canvas and ``img`` and returned.
    """
    # assumes img is 3-channel, since the canvas is converted with
    # COLOR_BGR2GRAY below — TODO confirm against callers
    canvas = cv2.fillPoly(np.zeros(img.shape), pts=[cnts[index_l]], color=(1, 1, 1))
    rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)

    gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, 0)
    found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    x_shift = np.abs(rotated.shape[1] - img.shape[1])
    y_shift = np.abs(rotated.shape[0] - img.shape[0])

    first = found[0]
    first[:, 0, 0] = first[:, 0, 0] + x_shift
    first[:, 0, 1] = first[:, 0, 1] + y_shift
    return first
 def get_textregion_contours_in_org_image_multi2(cnts, img, slope_first):
    """Re-trace every contour in ``cnts`` using a process pool.

    Each contour is handed to ``loop_contour_image`` in a worker process.
    ``Pool.starmap`` returns results in the order of the submitted tasks,
    so the i-th returned contour corresponds to ``cnts[i]``.

    Returns a list of re-traced contours; an empty list when ``cnts`` is
    empty (no pool is created in that case).
    """
    if len(cnts) == 0:
        return []

    # Each task carries only its own contour, wrapped in a one-element list
    # and addressed with index 0. Sending the full cnts list with every task
    # would serialise the whole list once per contour.
    tasks = [(0, [contour], img, slope_first) for contour in cnts]

    with Pool(cpu_count()) as p:
        return p.starmap(loop_contour_image, tasks)
 def get_textregion_contours_in_org_image(cnts, img, slope_first):
    cnts_org = []
@ -175,11 +266,43 @@ def get_textregion_contours_in_org_image(cnts, img, slope_first):
        # print(np.shape(cont_int[0]))
        cnts_org.append(cont_int[0])
-    # print(cnts_org,'cnts_org')
+    return cnts_org
 def get_textregion_contours_in_org_image_light(cnts, img, slope_first):
    """Re-trace ``cnts`` after rotation, working at one third of the resolution.

    ``img`` is resized to a third of its width and height (nearest
    neighbour) and every contour is divided by 3 and cast to int32. Each
    scaled contour is then filled onto a zero canvas shaped like the
    resized image, rotated by ``-slope_first`` with ``rotation_image_new``,
    and re-extracted. The first extracted contour is shifted by the
    absolute size difference between the rotated canvas and the resized
    image, multiplied by 3, and appended to the result.

    Returns a list with one contour per entry of ``cnts``.
    """
    # assumes img is 3-channel, since the canvas is converted with
    # COLOR_BGR2GRAY below — TODO confirm against callers
    reduced_size = (int(img.shape[1] / 3.), int(img.shape[0] / 3.))
    img = cv2.resize(img, reduced_size, interpolation=cv2.INTER_NEAREST)

    reduced_contours = [(contour / 3).astype(np.int32) for contour in cnts]

    cnts_org = []
    for contour in reduced_contours:
        canvas = cv2.fillPoly(np.zeros(img.shape), pts=[contour], color=(1, 1, 1))
        rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)

        gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, 0)
        found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        first = found[0]
        first[:, 0, 0] = first[:, 0, 0] + np.abs(rotated.shape[1] - img.shape[1])
        first[:, 0, 1] = first[:, 0, 1] + np.abs(rotated.shape[0] - img.shape[0])

        # Scale back up to the coordinate system of the full-size image.
        cnts_org.append(first * 3)

    return cnts_org
 def return_contours_of_interested_textline(region_pre_p, pixel):