diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index 4bbd3f2..a2a2ad0 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -97,6 +97,12 @@ from qurator.eynollah.eynollah import Eynollah is_flag=True, help="if this parameter set to true, this tool will try to detect tables.", ) +@click.option( + "--right2left/--left2right", + "-r2l/-l2r", + is_flag=True, + help="if this parameter set to true, this tool will extract right-to-left reading order.", +) @click.option( "--input_binary/--input-RGB", "-ib/-irgb", @@ -149,6 +155,7 @@ def main( textline_light, full_layout, tables, + right2left, input_binary, allow_scaling, headers_off, @@ -184,6 +191,7 @@ def main( textline_light=textline_light, full_layout=full_layout, tables=tables, + right2left=right2left, input_binary=input_binary, allow_scaling=allow_scaling, headers_off=headers_off, diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 1a5705d..ad3f312 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -72,8 +72,7 @@ from .utils import ( small_textlines_to_parent_adherence2, order_of_regions, find_number_of_columns_in_document, - return_boxes_of_images_by_order_of_reading_new, - return_boxes_of_images_by_order_of_reading_new_right2left) + return_boxes_of_images_by_order_of_reading_new) from .utils.pil_cv2 import check_dpi, pil2cv from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter @@ -159,6 +158,7 @@ class Eynollah: textline_light=False, full_layout=False, tables=False, + right2left=False, input_binary=False, allow_scaling=False, headers_off=False, @@ -190,6 +190,7 @@ class Eynollah: self.textline_light = textline_light self.full_layout = full_layout self.tables = tables + self.right2left = right2left self.input_binary = input_binary self.allow_scaling = allow_scaling self.headers_off = headers_off @@ -2623,7 +2624,7 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -2633,7 +2634,7 @@ class Eynollah: img_revised_tab2 = self.add_tables_heuristic_to_layout(text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables , num_col_classifier , 0.000005, pixel_line) img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(img_revised_tab2,table_prediction, 10, num_col_classifier) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -2718,7 +2719,7 @@ class Eynollah: pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 pixel_line = 3 @@ -2727,7 +2728,7 @@ class Eynollah: img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(img_revised_tab2, table_prediction, 10, num_col_classifier) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p_1_n) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[:,:][(text_regions_p_tables[:,:]!=3) & (table_prediction_n[:,:]==1)] = 10 @@ -3070,11 +3071,17 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables) + boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables) + boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - + #print(boxes_d,'boxes_d') + #img_once = np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1])) + #for box_i in boxes_d: + #img_once[int(box_i[2]):int(box_i[3]),int(box_i[0]):int(box_i[1]) ] =1 + #plt.imshow(img_once) + #plt.show() + #print(np.unique(img_once),'img_once') if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) t_order = time.time() diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index c59d508..b85abdf 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -1672,7 +1672,9 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n -def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, tables): +def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, tables, right2left_readingorder): + if right2left_readingorder: + regions_without_separators = cv2.flip(regions_without_separators,1) boxes=[] peaks_neg_tot_tables = [] @@ -1763,6 +1765,13 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] + if right2left_readingorder: + x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some + x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some + + x_min_hor_some =list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) + @@ -2026,6 +2035,7 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho columns_not_covered_child_no_mother=np.sort(columns_not_covered_child_no_mother) + ind_args=np.array(range(len(y_type_2))) @@ -2335,254 +2345,21 @@ def return_boxes_of_images_by_order_of_reading_new(splitter_y_new, regions_witho #else: #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) - return boxes, peaks_neg_tot_tables - - - - - - - - - - - - - - - - - - - - -def return_boxes_of_images_by_order_of_reading_new_right2left(splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, tables): - boxes=[] - peaks_neg_tot_tables = [] - - for i in range(len(splitter_y_new)-1): - #print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new=matrix_of_lines_ch[:,:][ (matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] ) ] - #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - - #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - - # check to see is there any vertical separator to find holes. - if 1>0:#len( matrix_new[:,9][matrix_new[:,9]==1] )>0 and np.max(matrix_new[:,8][matrix_new[:,9]==1])>=0.1*(np.abs(splitter_y_new[i+1]-splitter_y_new[i] )): - - try: - if erosion_hurts: - num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], num_col_classifier, tables, multiplier=6.) - else: - num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:],num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin=[] - num_col = 0 - - try: - peaks_neg_fin_org=np.copy(peaks_neg_fin) - if (len(peaks_neg_fin)+1)=len(peaks_neg_fin2): - peaks_neg_fin=list(np.copy(peaks_neg_fin1)) - else: - peaks_neg_fin=list(np.copy(peaks_neg_fin2)) - - - - peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - - if i_n!=(len(peaks_neg_fin_early)-2): - peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) - #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin - - - - - - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) - else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) + if right2left_readingorder: + peaks_neg_tot_tables_new = [] + if len(peaks_neg_tot_tables)>=1: + for peaks_tab_ind in peaks_neg_tot_tables: + peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) + peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) - #print(peaks_neg_fin,'peaks_neg_fin') - except: - pass - #num_col, peaks_neg_fin=find_num_col(regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:],multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] - arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] - - - - - - peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) - - peaks_neg_tot_tables.append(peaks_neg_tot) - - reading_order_type,x_starting,x_ending,y_type_2,y_diff_type_2,y_lines_without_mother,x_start_without_mother,x_end_without_mother,there_is_sep_with_child,y_lines_with_child_without_mother,x_start_with_child_without_mother,x_end_with_child_without_mother,new_main_sep_y=return_x_start_end_mothers_childs_and_type_of_reading_order(x_min_hor_some,x_max_hor_some,cy_hor_some,peaks_neg_tot,cy_hor_diff) - - - y_lines_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - all_columns = np.array(range(len(peaks_neg_tot)-1)) - columns_covered_by_lines_covered_more_than_2col=[] - - for dj in range(len(x_starting)): - if set( list(np.array(range(x_starting[dj],x_ending[dj])) ) ) == set(all_columns): - pass - else: - columns_covered_by_lines_covered_more_than_2col=columns_covered_by_lines_covered_more_than_2col+list(np.array(range(x_starting[dj],x_ending[dj])) ) - columns_covered_by_lines_covered_more_than_2col=list(set(columns_covered_by_lines_covered_more_than_2col)) - - - - columns_not_covered=list( set(all_columns)-set(columns_covered_by_lines_covered_more_than_2col) ) - - - y_type_2=list(y_type_2) - x_starting=list(x_starting) - x_ending=list(x_ending) - - for lj in columns_not_covered: - y_type_2.append(int(splitter_y_new[i])) - x_starting.append(lj) - x_ending.append(lj+1) - ##y_lines_by_order.append(int(splitter_y_new[i])) - ##x_start_by_order.append(0) - - #y_type_2.append(int(splitter_y_new[i])) - #x_starting.append(x_starting[0]) - #x_ending.append(x_ending[0]) - - if len(new_main_sep_y)>0: - y_type_2.append(int(splitter_y_new[i])) - x_starting.append(0) - x_ending.append(len(peaks_neg_tot)-1) - else: - y_type_2.append(int(splitter_y_new[i])) - x_starting.append(x_starting[0]) - x_ending.append(x_ending[0]) - - - y_type_2=np.array(y_type_2) - x_starting=np.array(x_starting) - x_ending=np.array(x_ending) - else: - all_columns=np.array(range(len(peaks_neg_tot)-1)) - columns_not_covered=list( set(all_columns) ) - - - y_type_2=list(y_type_2) - x_starting=list(x_starting) - x_ending=list(x_ending) - - for lj in columns_not_covered: - y_type_2.append(int(splitter_y_new[i])) - x_starting.append(lj) - x_ending.append(lj+1) - ##y_lines_by_order.append(int(splitter_y_new[i])) - ##x_start_by_order.append(0) - - - - y_type_2=np.array(y_type_2) - x_starting=np.array(x_starting) - x_ending=np.array(x_ending) - - ind_args=np.array(range(len(y_type_2))) - #ind_args=np.array(ind_args) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1,0,-1): - #print(column,'column') - ind_args_in_col=ind_args[x_ending==column] - ind_args_in_col=np.array(ind_args_in_col) - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) - - for il in range(len(y_lines_by_order)): - - - y_copy=list( np.copy(y_lines_by_order) ) - x_start_copy=list( np.copy(x_start_by_order) ) - x_end_copy=list ( np.copy(x_end_by_order) ) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') - - for column in range(x_end_itself+1-1,x_start_itself-1,-1): - #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if y_copy[yic]>y_itself and column>=x_start_copy[yic] and column<=x_end_copy[yic]: - y_in_cols.append(y_copy[yic]) - #print('burda2') - #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=[int(splitter_y_new[i+1])][0] - #print(y_itself,'y_itself') - boxes.append([peaks_neg_tot[column],peaks_neg_tot[column+1],y_itself,y_down]) - - - - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) - return boxes, peaks_neg_tot_tables - + + for i in range(len(boxes)): + x_start_new = regions_without_separators.shape[1] - boxes[i][1] + x_end_new = regions_without_separators.shape[1] - boxes[i][0] + boxes[i][0] = x_start_new + boxes[i][1] = x_end_new + return boxes, peaks_neg_tot_tables_new + else: + return boxes, peaks_neg_tot_tables