diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 2017cea..f30d55e 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1641,241 +1641,204 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - if True: - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) - except: - peaks_neg_fin=[] - num_col = 0 - try: - if ((len(peaks_neg_fin) + 1 < num_col_classifier or - num_col_classifier == 6) and - # we do not expect to get all columns in small parts (headings etc.): - bot - top >= big_part): - # found too few columns here - #print('burda') - peaks_neg_fin_org = np.copy(peaks_neg_fin) - #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) - #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] + try: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) + except: + peaks_neg_fin=[] + num_col = 0 + try: + if ((len(peaks_neg_fin) + 1 < num_col_classifier or + num_col_classifier == 6) and + # we do not expect to get all columns in small parts (headings etc.): + bot - top >= big_part): + # found too few columns here + #print('burda') + peaks_neg_fin_org = np.copy(peaks_neg_fin) + #print("peaks_neg_fin_org", peaks_neg_fin_org) + if len(peaks_neg_fin)==0: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + num_col_classifier, tables, multiplier=3.) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] - #print(peaks_neg_fin_early,'burda2') - peaks_neg_fin_rev=[] - for left, right in pairwise(peaks_neg_fin_early): - # print("%d:%d" % (left, right), 'i_n') - # dbg_plt([left, right, top, bot], - # "image cut for y split %d:%d / x gap %d:%d" % ( - # top, bot, left, right)) - # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) - # plt.title("vertical projection (sum over y)") - # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) - except: - peaks_neg_fin2 = [] - if len(peaks_neg_fin1) >= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 - else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') - - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) + except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev + else: + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - all_columns = set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) + if right2left_readingorder: + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some + x_min_hor_some =list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? - main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() + if (reading_order_type == 1 or + len(y_mid_without_mother) >= 2 or + there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with + try: + y_grenze = top + 300 + up = (y_mid > top) & (y_mid <= y_grenze) - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] + args_early_ys=np.arange(len(y_mid)) + #print(args_early_ys,'args_early_ys') + #print(y_mid,'y_mid') - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? + main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): + args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) + #print(args_to_be_kept,'args_to_be_kept') + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + #print(top,'top') + y_mid = y_mid[args_to_be_kept] + x_starting = x_starting[args_to_be_kept] + x_ending = x_ending[args_to_be_kept] + y_max = y_max[args_to_be_kept] - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + #print('galdiha') + y_grenze = top + 200 + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] + + #int(top) + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) + x_start_by_order.append(0) + x_end_by_order.append(len(peaks_neg_tot)-2) + else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1895,212 +1858,170 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + ind_args=np.arange(len(y_mid)) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') + columns_covered_by_mothers = set() + for dj in range(len(x_start_without_mother)): + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * top) ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + x_starting = np.append(x_starting, x_start_without_mother) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, x_end_without_mother) - ind_args = np.arange(len(y_mid)) + columns_covered_by_mothers_with_child = set() + for dj in range(len(x_end_with_child_without_mother)): + columns_covered_by_mothers_with_child.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) + #indexes_to_be_spanned=[] + for i_s in range(len(x_end_with_child_without_mother)): + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: + if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") + #print("i_s_nc", i_s_nc) + x_end_biggest_column = \ + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): + #print("i_c", i_c) + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] + columns_covered_by_mothers = set() + for dj in range(len(ind_all_seps_between_nm_wc)): + columns_covered_by_mothers.update( + range(x_starting_all_between_nm_wc[dj], + x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + child_columns = set(range(i_s_nc, x_end_biggest_column)) + columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + if len(ind_all_seps_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] + x_starting_all_between_nm_wc = \ + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] + x_ending_all_between_nm_wc = \ + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] + + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) + else: + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + + if len(columns_not_covered): + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + + ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) + for column in range(int(i_s_nc), int(x_end_biggest_column)): + ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] + x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + else: + #print(i_s_nc,'column not covered by mothers with child') + ind_args_in_col=ind_args[x_starting==i_s_nc] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) # create single-column boxes from multi-column separators y_mid_by_order = np.array(y_mid_by_order) @@ -2109,23 +2030,101 @@ def return_boxes_of_images_by_order_of_reading_new( for il in range(len(y_mid_by_order)): #print(il, "il") y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') + #print('burda') #print('burda2') y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & (column >= x_start_by_order) & (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') y_mid_next = y_mid_next.min(initial=bot) #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_mid_itself, y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + except: + logger.exception("cannot assign boxes") + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + top, bot]) + # dbg_plt(boxes[-1], "fallback box") + else: + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + columns_covered_by_seps_covered_more_than_2col = set() + for dj in range(len(x_starting)): + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_seps_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + if len(new_main_sep_y) > 0: + x_starting = np.append(x_starting, 0) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + else: + x_starting = np.append(x_starting, x_starting[0]) + x_ending = np.append(x_ending, x_ending[0]) + else: + columns_not_covered = list(all_columns) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + + ind_args = np.arange(len(y_mid)) + + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) + for il in range(len(y_mid_by_order)): + #print(il, "il") + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] + for column in range(x_start_itself, x_end_itself+1): + #print(column,'cols') + #print('burda2') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[column+1], + y_mid_itself, + y_mid_next]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) if right2left_readingorder: peaks_neg_tot_tables_new = []