diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index a408b42..1a5705d 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -72,7 +72,8 @@ from .utils import ( small_textlines_to_parent_adherence2, order_of_regions, find_number_of_columns_in_document, - return_boxes_of_images_by_order_of_reading_new) + return_boxes_of_images_by_order_of_reading_new, + return_boxes_of_images_by_order_of_reading_new_right2left) from .utils.pil_cv2 import check_dpi, pil2cv from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter @@ -2069,6 +2070,7 @@ class Eynollah: arg_text_con = [] for ii in range(len(cx_text_only)): for jj in range(len(boxes)): + print(cx_text_only[ii],cy_text_only[ii],'markaz') if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) break @@ -2104,6 +2106,9 @@ class Eynollah: ref_point += len(id_of_texts) order_of_texts_tot = [] + print(len(contours_only_text_parent),'contours_only_text_parent') + print(len(order_by_con_main),'order_by_con_main') + for tj1 in range(len(contours_only_text_parent)): order_of_texts_tot.append(int(order_by_con_main[tj1])) @@ -2618,7 +2623,7 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -2628,7 +2633,7 @@ class Eynollah: img_revised_tab2 = self.add_tables_heuristic_to_layout(text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables , num_col_classifier , 0.000005, pixel_line) img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(img_revised_tab2,table_prediction, 10, num_col_classifier) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -2713,7 +2718,7 @@ class Eynollah: pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 pixel_line = 3 @@ -2722,7 +2727,7 @@ class Eynollah: img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(img_revised_tab2, table_prediction, 10, num_col_classifier) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) text_regions_p_tables = np.copy(text_regions_p_1_n) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[:,:][(text_regions_p_tables[:,:]!=3) & (table_prediction_n[:,:]==1)] = 10 @@ -3065,9 +3070,10 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index e9f872c..c59d508 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -1774,7 +1774,6 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho reading_order_type,x_starting,x_ending,y_type_2,y_diff_type_2,y_lines_without_mother,x_start_without_mother,x_end_without_mother,there_is_sep_with_child,y_lines_with_child_without_mother,x_start_with_child_without_mother,x_end_with_child_without_mother,new_main_sep_y=return_x_start_end_mothers_childs_and_type_of_reading_order(x_min_hor_some,x_max_hor_some,cy_hor_some,peaks_neg_tot,cy_hor_diff) - if (reading_order_type==1) or (reading_order_type==0 and (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1)): @@ -2281,7 +2280,6 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho ind_args=np.array(range(len(y_type_2))) #ind_args=np.array(ind_args) - #print(ind_args,'ind_args') for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] @@ -2338,3 +2336,253 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho #else: #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) return boxes, peaks_neg_tot_tables + + + + + + + + + + + + + + + + + + + + +def return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, tables): + boxes=[] + peaks_neg_tot_tables = [] + + for i in range(len(splitter_y_new)-1): + #print(splitter_y_new[i],splitter_y_new[i+1]) + matrix_new=matrix_of_lines_ch[:,:][ (matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] ) ] + #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) + + #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') + + # check to see is there any vertical separator to find holes. + if 1>0:#len( matrix_new[:,9][matrix_new[:,9]==1] )>0 and np.max(matrix_new[:,8][matrix_new[:,9]==1])>=0.1*(np.abs(splitter_y_new[i+1]-splitter_y_new[i] )): + + try: + if erosion_hurts: + num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], num_col_classifier, tables, multiplier=6.) + else: + num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:],num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin=[] + num_col = 0 + + + try: + peaks_neg_fin_org=np.copy(peaks_neg_fin) + if (len(peaks_neg_fin)+1)=len(peaks_neg_fin2): + peaks_neg_fin=list(np.copy(peaks_neg_fin1)) + else: + peaks_neg_fin=list(np.copy(peaks_neg_fin2)) + + + + peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) + + if i_n!=(len(peaks_neg_fin_early)-2): + peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin + + + + + + if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): + peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) + num_col=len(peaks_neg_fin) + else: + peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) + num_col=len(peaks_neg_fin) + + #print(peaks_neg_fin,'peaks_neg_fin') + except: + pass + #num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:],multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] + + + + + + peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) + + peaks_neg_tot_tables.append(peaks_neg_tot) + + reading_order_type,x_starting,x_ending,y_type_2,y_diff_type_2,y_lines_without_mother,x_start_without_mother,x_end_without_mother,there_is_sep_with_child,y_lines_with_child_without_mother,x_start_with_child_without_mother,x_end_with_child_without_mother,new_main_sep_y=return_x_start_end_mothers_childs_and_type_of_reading_order(x_min_hor_some,x_max_hor_some,cy_hor_some,peaks_neg_tot,cy_hor_diff) + + + y_lines_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + all_columns = np.array(range(len(peaks_neg_tot)-1)) + columns_covered_by_lines_covered_more_than_2col=[] + + for dj in range(len(x_starting)): + if set( list(np.array(range(x_starting[dj],x_ending[dj])) ) ) == set(all_columns): + pass + else: + columns_covered_by_lines_covered_more_than_2col=columns_covered_by_lines_covered_more_than_2col+list(np.array(range(x_starting[dj],x_ending[dj])) ) + columns_covered_by_lines_covered_more_than_2col=list(set(columns_covered_by_lines_covered_more_than_2col)) + + + + columns_not_covered=list( set(all_columns)-set(columns_covered_by_lines_covered_more_than_2col) ) + + + y_type_2=list(y_type_2) + x_starting=list(x_starting) + x_ending=list(x_ending) + + for lj in columns_not_covered: + y_type_2.append(int(splitter_y_new[i])) + x_starting.append(lj) + x_ending.append(lj+1) + ##y_lines_by_order.append(int(splitter_y_new[i])) + ##x_start_by_order.append(0) + + #y_type_2.append(int(splitter_y_new[i])) + #x_starting.append(x_starting[0]) + #x_ending.append(x_ending[0]) + + if len(new_main_sep_y)>0: + y_type_2.append(int(splitter_y_new[i])) + x_starting.append(0) + x_ending.append(len(peaks_neg_tot)-1) + else: + y_type_2.append(int(splitter_y_new[i])) + x_starting.append(x_starting[0]) + x_ending.append(x_ending[0]) + + + y_type_2=np.array(y_type_2) + x_starting=np.array(x_starting) + x_ending=np.array(x_ending) + else: + all_columns=np.array(range(len(peaks_neg_tot)-1)) + columns_not_covered=list( set(all_columns) ) + + + y_type_2=list(y_type_2) + x_starting=list(x_starting) + x_ending=list(x_ending) + + for lj in columns_not_covered: + y_type_2.append(int(splitter_y_new[i])) + x_starting.append(lj) + x_ending.append(lj+1) + ##y_lines_by_order.append(int(splitter_y_new[i])) + ##x_start_by_order.append(0) + + + + y_type_2=np.array(y_type_2) + x_starting=np.array(x_starting) + x_ending=np.array(x_ending) + + ind_args=np.array(range(len(y_type_2))) + #ind_args=np.array(ind_args) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1,0,-1): + #print(column,'column') + ind_args_in_col=ind_args[x_ending==column] + ind_args_in_col=np.array(ind_args_in_col) + #print(len(y_type_2)) + y_column=y_type_2[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted=np.argsort(y_column) + y_col_sort=y_column[ind_args_col_sorted] + x_start_column_sort=x_start_column[ind_args_col_sorted] + x_end_column_sort=x_end_column[ind_args_col_sorted] + #print('babali4') + for ii in range(len(y_col_sort)): + #print('babali5') + y_lines_by_order.append(y_col_sort[ii]) + x_start_by_order.append(x_start_column_sort[ii]) + x_end_by_order.append(x_end_column_sort[ii]-1) + + for il in range(len(y_lines_by_order)): + + + y_copy=list( np.copy(y_lines_by_order) ) + x_start_copy=list( np.copy(x_start_by_order) ) + x_end_copy=list ( np.copy(x_end_by_order) ) + + #print(y_copy,'y_copy') + y_itself=y_copy.pop(il) + x_start_itself=x_start_copy.pop(il) + x_end_itself=x_end_copy.pop(il) + + #print(y_copy,'y_copy2') + + for column in range(x_end_itself+1-1,x_start_itself-1,-1): + #print(column,'cols') + y_in_cols=[] + for yic in range(len(y_copy)): + #print('burda') + if y_copy[yic]>y_itself and column>=x_start_copy[yic] and column<=x_end_copy[yic]: + y_in_cols.append(y_copy[yic]) + #print('burda2') + #print(y_in_cols,'y_in_cols') + if len(y_in_cols)>0: + y_down=np.min(y_in_cols) + else: + y_down=[int(splitter_y_new[i+1])][0] + #print(y_itself,'y_itself') + boxes.append([peaks_neg_tot[column],peaks_neg_tot[column+1],y_itself,y_down]) + + + + #else: + #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + return boxes, peaks_neg_tot_tables +