From 86c809b58f85f5907876cadf5c1220e1c074fafc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 24 Nov 2020 15:11:25 +0100 Subject: [PATCH] more extraction of util functions --- sbb_newspapers_org_image/eynollah.py | 558 +++------------------------ sbb_newspapers_org_image/utils.py | 448 +++++++++++++++++++++ 2 files changed, 508 insertions(+), 498 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 5db06cb..e9b4bc0 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -41,6 +41,9 @@ from .utils import ( filter_contours_area_of_image_interiors, rotatedRectWithMaxArea, rotate_image, + rotate_max_area, + rotation_not_90_func, + rotation_not_90_func_full_layout, rotate_max_area_new, rotation_image_new, crop_image_inside_box, @@ -60,9 +63,14 @@ from .utils import ( seperate_lines, seperate_lines_new_inside_teils2, filter_small_drop_capitals_from_no_patch_layout, - find_num_col_deskew, return_hor_spliter_by_index_for_without_verticals, find_new_features_of_contoures, + find_num_col, + find_num_col_deskew, + find_num_col_only_image, + find_num_col_by_vertical_lines, + find_contours_mean_y_diff, + contours_in_same_horizon, ) @@ -1303,7 +1311,7 @@ class eynollah: try: textline_con, hierachy = return_contours_of_image(img_int_p) textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierachy, max_area=1, min_area=0.0008) - y_diff_mean = self.find_contours_mean_y_diff(textline_con_fil) + y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = int(y_diff_mean * (4.0 / 40.0)) @@ -1460,7 +1468,7 @@ class eynollah: textline_con, hierachy = return_contours_of_image(img_int_p) textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierachy, max_area=1, min_area=0.00008) - y_diff_mean = self.find_contours_mean_y_diff(textline_con_fil) + y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = int(y_diff_mean * (4.0 / 40.0)) @@ -2260,11 +2268,6 @@ class eynollah: return slope - def find_contours_mean_y_diff(self, contours_main): - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - return np.mean(np.diff(np.sort(np.array(cy_main)))) - def return_deskew_slop(self, img_patch_org, sigma_des, main_page=False): @@ -2826,7 +2829,7 @@ class eynollah: try: textline_con, hierachy = return_contours_of_image(crop_img) textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierachy, max_area=1, min_area=0.0008) - y_diff_mean = self.find_contours_mean_y_diff(textline_con_fil) + y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = int(y_diff_mean * (4.0 / 40.0)) @@ -4100,400 +4103,6 @@ class eynollah: image_back_zero[:, :][image_back_zero[:, :] == -255] = 255 return image_back_zero - def find_num_col_only_image(self, regions_without_seperators, multiplier=3.8): - regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) - - ##plt.plot(regions_without_seperators_0) - ##plt.show() - - sigma_ = 15 - - meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] - - first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) - last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) - - last_nonzero = len(regions_without_seperators_0) - last_nonzero - - y = regions_without_seperators_0 # [first_nonzero:last_nonzero] - - y_help = np.zeros(len(y) + 20) - - y_help[10 : len(y) + 10] = y - - x = np.array(range(len(y))) - - zneg_rev = -y_help + np.max(y_help) - - zneg = np.zeros(len(zneg_rev) + 20) - - zneg[10 : len(zneg_rev) + 10] = zneg_rev - - z = gaussian_filter1d(y, sigma_) - zneg = gaussian_filter1d(zneg, sigma_) - - peaks_neg, _ = find_peaks(zneg, height=0) - peaks, _ = find_peaks(z, height=0) - - peaks_neg = peaks_neg - 10 - 10 - - peaks_neg_org = np.copy(peaks_neg) - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - - peaks = peaks[(peaks > 0.09 * regions_without_seperators.shape[1]) & (peaks < 0.91 * regions_without_seperators.shape[1])] - - peaks_neg = peaks_neg[(peaks_neg > 500) & (peaks_neg < (regions_without_seperators.shape[1] - 500))] - # print(peaks) - interest_pos = z[peaks] - - interest_pos = interest_pos[interest_pos > 10] - - interest_neg = z[peaks_neg] - min_peaks_pos = np.mean(interest_pos) # np.min(interest_pos) - min_peaks_neg = 0 # np.min(interest_neg) - - # $print(min_peaks_pos) - dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier - # print(interest_pos) - grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 - - interest_neg_fin = interest_neg[(interest_neg < grenze)] - peaks_neg_fin = peaks_neg[(interest_neg < grenze)] - - num_col = (len(interest_neg_fin)) + 1 - - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 3.0) - p_g_u = len(y) - int(len(y) / 3.0) - - if num_col == 3: - if (peaks_neg_fin[0] > p_g_u and peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] < p_g_l) or (peaks_neg_fin[0] < p_m and peaks_neg_fin[1] < p_m) or (peaks_neg_fin[0] > p_m and peaks_neg_fin[1] > p_m): - num_col = 1 - else: - pass - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u) or (peaks_neg_fin[0] < p_g_l): - num_col = 1 - else: - pass - - diff_peaks = np.abs(np.diff(peaks_neg_fin)) - - cut_off = 400 - peaks_neg_true = [] - forest = [] - - for i in range(len(peaks_neg_fin)): - if i == 0: - forest.append(peaks_neg_fin[i]) - if i < (len(peaks_neg_fin) - 1): - if diff_peaks[i] <= cut_off: - forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: - # print(forest[np.argmin(z[forest]) ] ) - if not isNaN(forest[np.argmin(z[forest])]): - peaks_neg_true.append(forest[np.argmin(z[forest])]) - forest = [] - forest.append(peaks_neg_fin[i + 1]) - if i == (len(peaks_neg_fin) - 1): - # print(print(forest[np.argmin(z[forest]) ] )) - if not isNaN(forest[np.argmin(z[forest])]): - peaks_neg_true.append(forest[np.argmin(z[forest])]) - - num_col = (len(peaks_neg_true)) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 4.0) - p_g_l = int(len(y) / 3.0) - p_g_u = len(y) - int(len(y) / 3.0) - - p_u_quarter = len(y) - p_quarter - - if num_col == 3: - if (peaks_neg_true[0] > p_g_u and peaks_neg_true[1] > p_g_u) or (peaks_neg_true[0] < p_g_l and peaks_neg_true[1] < p_g_l) or (peaks_neg_true[0] < p_m and peaks_neg_true[1] < p_m) or (peaks_neg_true[0] > p_m and peaks_neg_true[1] > p_m): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and peaks_neg_true[0] > p_g_l) and (peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and peaks_neg_true[1] > p_g_l) and (peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] - else: - pass - - if num_col == 2: - if (peaks_neg_true[0] > p_g_u) or (peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] - - if num_col == 4: - if len(np.array(peaks_neg_true)[np.array(peaks_neg_true) < p_g_l]) == 2 or len(np.array(peaks_neg_true)[np.array(peaks_neg_true) > (len(y) - p_g_l)]) == 2: - num_col = 1 - peaks_neg_true = [] - else: - pass - - # no deeper hill around found hills - - peaks_fin_true = [] - for i in range(len(peaks_neg_true)): - hill_main = peaks_neg_true[i] - # deep_depth=z[peaks_neg] - hills_around = peaks_neg_org[((peaks_neg_org > hill_main) & (peaks_neg_org <= hill_main + 400)) | ((peaks_neg_org < hill_main) & (peaks_neg_org >= hill_main - 400))] - deep_depth_around = z[hills_around] - - # print(hill_main,z[hill_main],hills_around,deep_depth_around,'manoooo') - try: - if np.min(deep_depth_around) < z[hill_main]: - pass - else: - peaks_fin_true.append(hill_main) - except: - pass - - diff_peaks_annormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_annormal) > 0: - arg_help = np.array(range(len(diff_peaks))) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 1) in arg_help_ann: - pass - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin - - # sometime pages with one columns gives also some negative peaks. delete those peaks - param = z[peaks_neg_true] / float(min_peaks_pos) * 100 - - if len(param[param <= 41]) == 0: - peaks_neg_true = [] - - return len(peaks_fin_true), peaks_fin_true - - def find_num_col_by_vertical_lines(self, regions_without_seperators, multiplier=3.8): - regions_without_seperators_0 = regions_without_seperators[:, :, 0].sum(axis=0) - - ##plt.plot(regions_without_seperators_0) - ##plt.show() - - sigma_ = 35 # 70#35 - - z = gaussian_filter1d(regions_without_seperators_0, sigma_) - - peaks, _ = find_peaks(z, height=0) - - # print(peaks,'peaksnew') - return peaks - - def find_num_col(self, regions_without_seperators, multiplier=3.8): - regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) - - ##plt.plot(regions_without_seperators_0) - ##plt.show() - - sigma_ = 35 # 70#35 - - meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] - - first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) - last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) - - # print(last_nonzero) - # print(isNaN(last_nonzero)) - # last_nonzero=0#halalikh - last_nonzero = len(regions_without_seperators_0) - last_nonzero - - y = regions_without_seperators_0 # [first_nonzero:last_nonzero] - - y_help = np.zeros(len(y) + 20) - - y_help[10 : len(y) + 10] = y - - x = np.array(range(len(y))) - - zneg_rev = -y_help + np.max(y_help) - - zneg = np.zeros(len(zneg_rev) + 20) - - zneg[10 : len(zneg_rev) + 10] = zneg_rev - - z = gaussian_filter1d(y, sigma_) - zneg = gaussian_filter1d(zneg, sigma_) - - peaks_neg, _ = find_peaks(zneg, height=0) - peaks, _ = find_peaks(z, height=0) - - peaks_neg = peaks_neg - 10 - 10 - - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - - peaks = peaks[(peaks > 0.06 * regions_without_seperators.shape[1]) & (peaks < 0.94 * regions_without_seperators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & (peaks_neg < (regions_without_seperators.shape[1] - 370))] - - # print(peaks) - interest_pos = z[peaks] - - interest_pos = interest_pos[interest_pos > 10] - - # plt.plot(z) - # plt.show() - interest_neg = z[peaks_neg] - - min_peaks_pos = np.min(interest_pos) - max_peaks_pos = np.max(interest_pos) - - if max_peaks_pos / min_peaks_pos >= 35: - min_peaks_pos = np.mean(interest_pos) - - min_peaks_neg = 0 # np.min(interest_neg) - - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') - # $print(min_peaks_pos) - dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier - # print(interest_pos) - grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 - - # print(interest_neg,'interest_neg') - # print(grenze,'grenze') - # print(min_peaks_pos,'min_peaks_pos') - # print(dis_talaei,'dis_talaei') - # print(peaks_neg,'peaks_neg') - - interest_neg_fin = interest_neg[(interest_neg < grenze)] - peaks_neg_fin = peaks_neg[(interest_neg < grenze)] - # interest_neg_fin=interest_neg[(interest_neg p_g_u and peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] < p_g_l) or ((peaks_neg_fin[0] + 200) < p_m and peaks_neg_fin[1] < p_m) or ((peaks_neg_fin[0] - 200) > p_m and peaks_neg_fin[1] > p_m): - num_col = 1 - peaks_neg_fin = [] - else: - pass - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u) or (peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] - else: - pass - - ##print(len(peaks_neg_fin)) - - diff_peaks = np.abs(np.diff(peaks_neg_fin)) - - cut_off = 400 - peaks_neg_true = [] - forest = [] - - # print(len(peaks_neg_fin),'len_') - - for i in range(len(peaks_neg_fin)): - if i == 0: - forest.append(peaks_neg_fin[i]) - if i < (len(peaks_neg_fin) - 1): - if diff_peaks[i] <= cut_off: - forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: - # print(forest[np.argmin(z[forest]) ] ) - if not isNaN(forest[np.argmin(z[forest])]): - peaks_neg_true.append(forest[np.argmin(z[forest])]) - forest = [] - forest.append(peaks_neg_fin[i + 1]) - if i == (len(peaks_neg_fin) - 1): - # print(print(forest[np.argmin(z[forest]) ] )) - if not isNaN(forest[np.argmin(z[forest])]): - peaks_neg_true.append(forest[np.argmin(z[forest])]) - - num_col = (len(peaks_neg_true)) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - - ##print(num_col,'early') - if num_col == 3: - if (peaks_neg_true[0] > p_g_u and peaks_neg_true[1] > p_g_u) or (peaks_neg_true[0] < p_g_l and peaks_neg_true[1] < p_g_l) or (peaks_neg_true[0] < p_m and (peaks_neg_true[1] + 200) < p_m) or ((peaks_neg_true[0] - 200) > p_m and peaks_neg_true[1] > p_m): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and peaks_neg_true[0] > p_g_l) and (peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and peaks_neg_true[1] > p_g_l) and (peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] - else: - pass - - if num_col == 2: - if (peaks_neg_true[0] > p_g_u) or (peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] - else: - pass - - diff_peaks_annormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_annormal) > 0: - arg_help = np.array(range(len(diff_peaks))) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 1) in arg_help_ann: - pass - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin - - # plt.plot(gaussian_filter1d(y, sigma_)) - # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') - # plt.plot([0,len(y)], [grenze,grenze]) - # plt.show() - - ##print(len(peaks_neg_true)) - return len(peaks_neg_true), peaks_neg_true - - def return_points_with_boundies(self, peaks_neg_fin, first_point, last_point): peaks_neg_tot = [] peaks_neg_tot.append(first_point) @@ -4502,23 +4111,6 @@ class eynollah: peaks_neg_tot.append(last_point) return peaks_neg_tot - def contours_in_same_horizon(self, cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(all_args) - def return_boxes_of_images_by_order_of_reading_without_seperators(self, spliter_y_new, image_p_rev, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): boxes = [] @@ -4542,7 +4134,7 @@ class eynollah: ##plt.show() ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) - num_col, peaks_neg_fin = self.find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) + num_col, peaks_neg_fin = find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) # num_col, peaks_neg_fin=find_num_col(regions_without_seperators[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:],multiplier=7.0) x_min_hor_some = matrix_new[:, 2][(matrix_new[:, 9] == 0)] @@ -4664,7 +4256,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): # num_col_sub, peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - num_col_sub, peaks_neg_fin_sub = self.find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.4) + num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.4) else: peaks_neg_fin_sub = [] @@ -4699,7 +4291,7 @@ class eynollah: if len(cy_child_in) > 0: ###num_col_ch, peaks_neg_ch=find_num_col( regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - num_col_ch, peaks_neg_ch = self.find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) + num_col_ch, peaks_neg_ch = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) peaks_neg_ch = peaks_neg_ch[:] + newest_peaks[j] @@ -4731,7 +4323,7 @@ class eynollah: if 1 > 0: # len( matrix_new_new2[:,9][matrix_new_new2[:,9]==1] )>0 and np.max(matrix_new_new2[:,8][matrix_new_new2[:,9]==1])>=0.2*(np.abs(newest_y_spliter_h[nd+1]-newest_y_spliter_h[nd] )): # num_col_sub_ch, peaks_neg_fin_sub_ch=find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]):int(newest_y_spliter_h[nd+1]),nst_p_ch[jn]:nst_p_ch[jn+1]],multiplier=2.3) - num_col_sub_ch, peaks_neg_fin_sub_ch = self.find_num_col_only_image(image_p_rev[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=2.3) + num_col_sub_ch, peaks_neg_fin_sub_ch = find_num_col_only_image(image_p_rev[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=2.3) # print(peaks_neg_fin_sub_ch,'gada kutullllllll') else: peaks_neg_fin_sub_ch = [] @@ -4755,7 +4347,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): ###num_col_sub, peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - num_col_sub, peaks_neg_fin_sub = self.find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) + num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) else: peaks_neg_fin_sub = [] @@ -4784,7 +4376,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): ###num_col_sub, peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=5.0) - num_col_sub, peaks_neg_fin_sub = self.find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) + num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) else: peaks_neg_fin_sub = [] @@ -4829,7 +4421,7 @@ class eynollah: ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) try: - num_col, peaks_neg_fin = self.find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) + num_col, peaks_neg_fin = find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) except: peaks_neg_fin = [] num_col = 0 @@ -4921,7 +4513,7 @@ class eynollah: for jv in range(len(args_hor)): img_p_in_hor = cv2.fillPoly(img_in_hor, pts=[contours_lines[args_hor[jv]]], color=(1, 1, 1)) - all_args_uniq = self.contours_in_same_horizon(cy_main_hor) + all_args_uniq = contours_in_same_horizon(cy_main_hor) # print(all_args_uniq,'all_args_uniq') if len(all_args_uniq) > 0: if type(all_args_uniq[0]) is list: @@ -5083,7 +4675,7 @@ class eynollah: slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, slope_lines_org_hor, y_min_main_hor, y_max_main_hor, cx_main_hor = find_features_of_lines(contours_lines_hor) args_hor = np.array(range(len(slope_lines_hor))) - all_args_uniq = self.contours_in_same_horizon(cy_main_hor) + all_args_uniq = contours_in_same_horizon(cy_main_hor) # print(all_args_uniq,'all_args_uniq') if len(all_args_uniq) > 0: if type(all_args_uniq[0]) is list: @@ -5173,7 +4765,7 @@ class eynollah: ##plt.show() ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) - num_col, peaks_neg_fin = self.find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) + num_col, peaks_neg_fin = find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) # num_col, peaks_neg_fin=find_num_col(regions_without_seperators[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:],multiplier=7.0) x_min_hor_some = matrix_new[:, 2][(matrix_new[:, 9] == 0)] @@ -5302,7 +4894,7 @@ class eynollah: matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) else: peaks_neg_fin_sub = [] @@ -5337,7 +4929,7 @@ class eynollah: cy_child_in = cy_hor_some_sort_child[(cy_hor_some_sort_child > newest_y_spliter[n]) & (cy_hor_some_sort_child < newest_y_spliter[n + 1])] if len(cy_child_in) > 0: - num_col_ch, peaks_neg_ch = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) + num_col_ch, peaks_neg_ch = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) # print(peaks_neg_ch,'mizzzz') # peaks_neg_ch=[] # for djh in range(len(peaks_neg_ch)): @@ -5369,7 +4961,7 @@ class eynollah: matrix_new_new2 = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter_h[nd]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter_h[nd + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < nst_p_ch[jn + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > nst_p_ch[jn])] # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if len(matrix_new_new2[:, 9][matrix_new_new2[:, 9] == 1]) > 0 and np.max(matrix_new_new2[:, 8][matrix_new_new2[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter_h[nd + 1] - newest_y_spliter_h[nd])): - num_col_sub_ch, peaks_neg_fin_sub_ch = self.find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=5.0) + num_col_sub_ch, peaks_neg_fin_sub_ch = find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=5.0) else: peaks_neg_fin_sub_ch = [] @@ -5392,7 +4984,7 @@ class eynollah: matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) else: peaks_neg_fin_sub = [] @@ -5417,7 +5009,7 @@ class eynollah: matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) else: peaks_neg_fin_sub = [] @@ -5466,7 +5058,7 @@ class eynollah: # plt.imshow(regions_without_seperators[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:]) # plt.show() try: - num_col, peaks_neg_fin = self.find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) + num_col, peaks_neg_fin = find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) except: peaks_neg_fin = [] @@ -5597,7 +5189,7 @@ class eynollah: if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): # print( int(newest_y_spliter[n]),int(newest_y_spliter[n+1]),newest_peaks[j],newest_peaks[j+1] ) try: - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) except: peaks_neg_fin_sub = [] else: @@ -5634,7 +5226,7 @@ class eynollah: if len(cy_child_in) > 0: try: - num_col_ch, peaks_neg_ch = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) + num_col_ch, peaks_neg_ch = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) except: peaks_neg_ch = [] # print(peaks_neg_ch,'mizzzz') @@ -5669,7 +5261,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new2[:,9][matrix_new_new2[:,9]==1] )>0 and np.max(matrix_new_new2[:,8][matrix_new_new2[:,9]==1])>=0.2*(np.abs(newest_y_spliter_h[nd+1]-newest_y_spliter_h[nd] )): try: - num_col_sub_ch, peaks_neg_fin_sub_ch = self.find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=7.0) + num_col_sub_ch, peaks_neg_fin_sub_ch = find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=7.0) except: peaks_neg_fin_sub_ch = [] @@ -5695,7 +5287,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): try: - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=7.0) except: peaks_neg_fin_sub = [] else: @@ -5723,7 +5315,7 @@ class eynollah: # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): try: - num_col_sub, peaks_neg_fin_sub = self.find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) + num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) except: peaks_neg_fin_sub = [] else: @@ -5770,7 +5362,7 @@ class eynollah: ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) try: - num_col, peaks_neg_fin = self.find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) + num_col, peaks_neg_fin = find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) except: peaks_neg_fin = [] @@ -6004,6 +5596,21 @@ class eynollah: return y_min_main, y_max_main + def find_features_of_contours(self, contours_main): + + areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) + M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] + cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + + return y_min_main, y_max_main, areas_main + + def add_tables_heuristic_to_layout(self, image_regions_eraly_p, boxes, slope_mean_hor, spliter_y, peaks_neg_tot, image_revised): image_revised_1 = self.delete_seperator_around(spliter_y, peaks_neg_tot, image_revised) @@ -6119,20 +5726,6 @@ class eynollah: ##plt.show() return image_revised_last - def find_features_of_contours(self, contours_main): - - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - - return y_min_main, y_max_main, areas_main - def remove_headers_and_mains_intersection(self, seperators_closeup_n, img_revised_tab, boxes): for ind in range(len(boxes)): asp = np.zeros((img_revised_tab[:, :, 0].shape[0], seperators_closeup_n[:, :, 0].shape[1])) @@ -6815,11 +6408,11 @@ class eynollah: ##plt.imshow(regions_without_seperators_teil) ##plt.show() - # num_col, peaks_neg_fin=self.find_num_col(regions_without_seperators_teil,multiplier=6.0) + # num_col, peaks_neg_fin=find_num_col(regions_without_seperators_teil,multiplier=6.0) # regions_without_seperators_teil=cv2.erode(regions_without_seperators_teil,kernel,iterations = 3) # - num_col, peaks_neg_fin = self.find_num_col(regions_without_seperators_teil, multiplier=7.0) + num_col, peaks_neg_fin = find_num_col(regions_without_seperators_teil, multiplier=7.0) if num_col > num_col_fin: num_col_fin = num_col @@ -6827,19 +6420,19 @@ class eynollah: """ #print(length_y_vertical_lines,length_y_threshold,'x_center_of_ver_linesx_center_of_ver_linesx_center_of_ver_lines') if len(cx_main_ver)>0 and len( dist_y_ver[dist_y_ver>=length_y_threshold] ) >=1: - num_col, peaks_neg_fin=self.find_num_col(regions_without_seperators_teil,multiplier=6.0) + num_col, peaks_neg_fin=find_num_col(regions_without_seperators_teil,multiplier=6.0) else: #plt.imshow(image_page_background_zero_teil) #plt.show() - #num_col, peaks_neg_fin=self.find_num_col_only_image(image_page_background_zero,multiplier=2.4)#2.3) - num_col, peaks_neg_fin=self.find_num_col_only_image(image_page_background_zero_teil,multiplier=3.4)#2.3) + #num_col, peaks_neg_fin=find_num_col_only_image(image_page_background_zero,multiplier=2.4)#2.3) + num_col, peaks_neg_fin=find_num_col_only_image(image_page_background_zero_teil,multiplier=3.4)#2.3) print(num_col,'birda') if num_col>0: pass elif num_col==0: print(num_col,'birda2222') - num_col_regions, peaks_neg_fin_regions=self.find_num_col(regions_without_seperators_teil,multiplier=10.0) + num_col_regions, peaks_neg_fin_regions=find_num_col(regions_without_seperators_teil,multiplier=10.0) if num_col_regions==0: pass else: @@ -6851,7 +6444,7 @@ class eynollah: # print(num_col+1,'num colmsssssssss') if len(args_big_parts) == 1 and (len(peaks_neg_fin_fin) + 1) < num_col_classifier: - peaks_neg_fin = self.find_num_col_by_vertical_lines(vertical) + peaks_neg_fin = find_num_col_by_vertical_lines(vertical) peaks_neg_fin = peaks_neg_fin[peaks_neg_fin >= 500] peaks_neg_fin = peaks_neg_fin[peaks_neg_fin <= (vertical.shape[1] - 500)] peaks_neg_fin_fin = peaks_neg_fin[:] @@ -7285,37 +6878,6 @@ class eynollah: ##plt.show() return text_region2_1st_channel - def rotation_not_90_func(self, img, textline, text_regions_p_1, thetha): - rotated = imutils.rotate(img, thetha) - rotated_textline = imutils.rotate(textline, thetha) - rotated_layout = imutils.rotate(text_regions_p_1, thetha) - return self.rotate_max_area(img, rotated, rotated_textline, rotated_layout, thetha) - - def rotate_max_area(self, image, rotated, rotated_textline, rotated_layout, angle): - wr, hr = rotatedRectWithMaxArea(image.shape[1], image.shape[0], math.radians(angle)) - h, w, _ = rotated.shape - y1 = h // 2 - int(hr / 2) - y2 = y1 + int(hr) - x1 = w // 2 - int(wr / 2) - x2 = x1 + int(wr) - return rotated[y1:y2, x1:x2], rotated_textline[y1:y2, x1:x2], rotated_layout[y1:y2, x1:x2] - - def rotation_not_90_func_full_layout(self, img, textline, text_regions_p_1, text_regions_p_fully, thetha): - rotated = imutils.rotate(img, thetha) - rotated_textline = imutils.rotate(textline, thetha) - rotated_layout = imutils.rotate(text_regions_p_1, thetha) - rotated_layout_full = imutils.rotate(text_regions_p_fully, thetha) - return self.rotate_max_area_full_layout(img, rotated, rotated_textline, rotated_layout, rotated_layout_full, thetha) - - def rotate_max_area_full_layout(self, image, rotated, rotated_textline, rotated_layout, rotated_layout_full, angle): - wr, hr = rotatedRectWithMaxArea(image.shape[1], image.shape[0], math.radians(angle)) - h, w, _ = rotated.shape - y1 = h // 2 - int(hr / 2) - y2 = y1 + int(hr) - x1 = w // 2 - int(wr / 2) - x2 = x1 + int(wr) - return rotated[y1:y2, x1:x2], rotated_textline[y1:y2, x1:x2], rotated_layout[y1:y2, x1:x2], rotated_layout_full[y1:y2, x1:x2] - def get_regions_from_xy_2models_ens(self, img): img_org = np.copy(img) @@ -9087,7 +8649,7 @@ class eynollah: img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) try: - num_col, peaks_neg_fin = self.find_num_col(img_only_regions, multiplier=6.0) + num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) if not num_column_is_classified: num_col_classifier = num_col + 1 except: @@ -9157,7 +8719,7 @@ class eynollah: ##sys.exit() print("deskewing: " + str(time.time() - t1)) - image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] # self.rotation_not_90_func(image_page,textline_mask_tot_ea,slope_first) + image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] # rotation_not_90_func(image_page,textline_mask_tot_ea,slope_first) textline_mask_tot[mask_images[:, :] == 1] = 0 pixel_img = 1 @@ -9201,7 +8763,7 @@ class eynollah: if not self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = self.rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -9332,7 +8894,7 @@ class eynollah: # plt.show() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = self.rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) diff --git a/sbb_newspapers_org_image/utils.py b/sbb_newspapers_org_image/utils.py index 714c472..aaa49bb 100644 --- a/sbb_newspapers_org_image/utils.py +++ b/sbb_newspapers_org_image/utils.py @@ -107,6 +107,38 @@ def rotyate_image_different( img, slope): img_rotation = cv2.warpAffine(img, rotation_matrix, (num_cols, num_rows)) return img_rotation +def rotate_max_area(image, rotated, rotated_textline, rotated_layout, angle): + wr, hr = rotatedRectWithMaxArea(image.shape[1], image.shape[0], math.radians(angle)) + h, w, _ = rotated.shape + y1 = h // 2 - int(hr / 2) + y2 = y1 + int(hr) + x1 = w // 2 - int(wr / 2) + x2 = x1 + int(wr) + return rotated[y1:y2, x1:x2], rotated_textline[y1:y2, x1:x2], rotated_layout[y1:y2, x1:x2] + +def rotation_not_90_func(img, textline, text_regions_p_1, thetha): + rotated = imutils.rotate(img, thetha) + rotated_textline = imutils.rotate(textline, thetha) + rotated_layout = imutils.rotate(text_regions_p_1, thetha) + return rotate_max_area(img, rotated, rotated_textline, rotated_layout, thetha) + +def rotation_not_90_func_full_layout(img, textline, text_regions_p_1, text_regions_p_fully, thetha): + rotated = imutils.rotate(img, thetha) + rotated_textline = imutils.rotate(textline, thetha) + rotated_layout = imutils.rotate(text_regions_p_1, thetha) + rotated_layout_full = imutils.rotate(text_regions_p_fully, thetha) + return rotate_max_area_full_layout(img, rotated, rotated_textline, rotated_layout, rotated_layout_full, thetha) + +def rotate_max_area_full_layout(image, rotated, rotated_textline, rotated_layout, rotated_layout_full, angle): + wr, hr = rotatedRectWithMaxArea(image.shape[1], image.shape[0], math.radians(angle)) + h, w, _ = rotated.shape + y1 = h // 2 - int(hr / 2) + y2 = y1 + int(hr) + x1 = w // 2 - int(wr / 2) + x2 = x1 + int(wr) + return rotated[y1:y2, x1:x2], rotated_textline[y1:y2, x1:x2], rotated_layout[y1:y2, x1:x2], rotated_layout_full[y1:y2, x1:x2] + + def crop_image_inside_box(box, img_org_copy): image_box = img_org_copy[box[1] : box[1] + box[3], box[0] : box[0] + box[2]] return image_box, [box[1], box[1] + box[3], box[0], box[0] + box[2]] @@ -1669,3 +1701,419 @@ def find_new_features_of_contoures(contours_main): # dis_x=np.abs(x_max_main-x_min_main) return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin + +def find_num_col(regions_without_seperators, multiplier=3.8): + regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) + + ##plt.plot(regions_without_seperators_0) + ##plt.show() + + sigma_ = 35 # 70#35 + + meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] + + first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) + last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) + + # print(last_nonzero) + # print(isNaN(last_nonzero)) + # last_nonzero=0#halalikh + last_nonzero = len(regions_without_seperators_0) - last_nonzero + + y = regions_without_seperators_0 # [first_nonzero:last_nonzero] + + y_help = np.zeros(len(y) + 20) + + y_help[10 : len(y) + 10] = y + + x = np.array(range(len(y))) + + zneg_rev = -y_help + np.max(y_help) + + zneg = np.zeros(len(zneg_rev) + 20) + + zneg[10 : len(zneg_rev) + 10] = zneg_rev + + z = gaussian_filter1d(y, sigma_) + zneg = gaussian_filter1d(zneg, sigma_) + + peaks_neg, _ = find_peaks(zneg, height=0) + peaks, _ = find_peaks(z, height=0) + + peaks_neg = peaks_neg - 10 - 10 + + last_nonzero = last_nonzero - 100 + first_nonzero = first_nonzero + 200 + + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] + + peaks = peaks[(peaks > 0.06 * regions_without_seperators.shape[1]) & (peaks < 0.94 * regions_without_seperators.shape[1])] + peaks_neg = peaks_neg[(peaks_neg > 370) & (peaks_neg < (regions_without_seperators.shape[1] - 370))] + + # print(peaks) + interest_pos = z[peaks] + + interest_pos = interest_pos[interest_pos > 10] + + # plt.plot(z) + # plt.show() + interest_neg = z[peaks_neg] + + min_peaks_pos = np.min(interest_pos) + max_peaks_pos = np.max(interest_pos) + + if max_peaks_pos / min_peaks_pos >= 35: + min_peaks_pos = np.mean(interest_pos) + + min_peaks_neg = 0 # np.min(interest_neg) + + # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') + # $print(min_peaks_pos) + dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier + # print(interest_pos) + grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + + # print(interest_neg,'interest_neg') + # print(grenze,'grenze') + # print(min_peaks_pos,'min_peaks_pos') + # print(dis_talaei,'dis_talaei') + # print(peaks_neg,'peaks_neg') + + interest_neg_fin = interest_neg[(interest_neg < grenze)] + peaks_neg_fin = peaks_neg[(interest_neg < grenze)] + # interest_neg_fin=interest_neg[(interest_neg p_g_u and peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] < p_g_l) or ((peaks_neg_fin[0] + 200) < p_m and peaks_neg_fin[1] < p_m) or ((peaks_neg_fin[0] - 200) > p_m and peaks_neg_fin[1] > p_m): + num_col = 1 + peaks_neg_fin = [] + else: + pass + + if num_col == 2: + if (peaks_neg_fin[0] > p_g_u) or (peaks_neg_fin[0] < p_g_l): + num_col = 1 + peaks_neg_fin = [] + else: + pass + + ##print(len(peaks_neg_fin)) + + diff_peaks = np.abs(np.diff(peaks_neg_fin)) + + cut_off = 400 + peaks_neg_true = [] + forest = [] + + # print(len(peaks_neg_fin),'len_') + + for i in range(len(peaks_neg_fin)): + if i == 0: + forest.append(peaks_neg_fin[i]) + if i < (len(peaks_neg_fin) - 1): + if diff_peaks[i] <= cut_off: + forest.append(peaks_neg_fin[i + 1]) + if diff_peaks[i] > cut_off: + # print(forest[np.argmin(z[forest]) ] ) + if not isNaN(forest[np.argmin(z[forest])]): + peaks_neg_true.append(forest[np.argmin(z[forest])]) + forest = [] + forest.append(peaks_neg_fin[i + 1]) + if i == (len(peaks_neg_fin) - 1): + # print(print(forest[np.argmin(z[forest]) ] )) + if not isNaN(forest[np.argmin(z[forest])]): + peaks_neg_true.append(forest[np.argmin(z[forest])]) + + num_col = (len(peaks_neg_true)) + 1 + p_l = 0 + p_u = len(y) - 1 + p_m = int(len(y) / 2.0) + p_quarter = int(len(y) / 5.0) + p_g_l = int(len(y) / 4.0) + p_g_u = len(y) - int(len(y) / 4.0) + + p_u_quarter = len(y) - p_quarter + + ##print(num_col,'early') + if num_col == 3: + if (peaks_neg_true[0] > p_g_u and peaks_neg_true[1] > p_g_u) or (peaks_neg_true[0] < p_g_l and peaks_neg_true[1] < p_g_l) or (peaks_neg_true[0] < p_m and (peaks_neg_true[1] + 200) < p_m) or ((peaks_neg_true[0] - 200) > p_m and peaks_neg_true[1] > p_m): + num_col = 1 + peaks_neg_true = [] + elif (peaks_neg_true[0] < p_g_u and peaks_neg_true[0] > p_g_l) and (peaks_neg_true[1] > p_u_quarter): + peaks_neg_true = [peaks_neg_true[0]] + elif (peaks_neg_true[1] < p_g_u and peaks_neg_true[1] > p_g_l) and (peaks_neg_true[0] < p_quarter): + peaks_neg_true = [peaks_neg_true[1]] + else: + pass + + if num_col == 2: + if (peaks_neg_true[0] > p_g_u) or (peaks_neg_true[0] < p_g_l): + num_col = 1 + peaks_neg_true = [] + else: + pass + + diff_peaks_annormal = diff_peaks[diff_peaks < 360] + + if len(diff_peaks_annormal) > 0: + arg_help = np.array(range(len(diff_peaks))) + arg_help_ann = arg_help[diff_peaks < 360] + + peaks_neg_fin_new = [] + + for ii in range(len(peaks_neg_fin)): + if ii in arg_help_ann: + arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) + if arg_min == 0: + peaks_neg_fin_new.append(peaks_neg_fin[ii]) + else: + peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) + + elif (ii - 1) in arg_help_ann: + pass + else: + peaks_neg_fin_new.append(peaks_neg_fin[ii]) + else: + peaks_neg_fin_new = peaks_neg_fin + + # plt.plot(gaussian_filter1d(y, sigma_)) + # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') + # plt.plot([0,len(y)], [grenze,grenze]) + # plt.show() + + ##print(len(peaks_neg_true)) + return len(peaks_neg_true), peaks_neg_true + +def find_num_col_only_image(regions_without_seperators, multiplier=3.8): + regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) + + ##plt.plot(regions_without_seperators_0) + ##plt.show() + + sigma_ = 15 + + meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] + + first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) + last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) + + last_nonzero = len(regions_without_seperators_0) - last_nonzero + + y = regions_without_seperators_0 # [first_nonzero:last_nonzero] + + y_help = np.zeros(len(y) + 20) + + y_help[10 : len(y) + 10] = y + + x = np.array(range(len(y))) + + zneg_rev = -y_help + np.max(y_help) + + zneg = np.zeros(len(zneg_rev) + 20) + + zneg[10 : len(zneg_rev) + 10] = zneg_rev + + z = gaussian_filter1d(y, sigma_) + zneg = gaussian_filter1d(zneg, sigma_) + + peaks_neg, _ = find_peaks(zneg, height=0) + peaks, _ = find_peaks(z, height=0) + + peaks_neg = peaks_neg - 10 - 10 + + peaks_neg_org = np.copy(peaks_neg) + + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] + + peaks = peaks[(peaks > 0.09 * regions_without_seperators.shape[1]) & (peaks < 0.91 * regions_without_seperators.shape[1])] + + peaks_neg = peaks_neg[(peaks_neg > 500) & (peaks_neg < (regions_without_seperators.shape[1] - 500))] + # print(peaks) + interest_pos = z[peaks] + + interest_pos = interest_pos[interest_pos > 10] + + interest_neg = z[peaks_neg] + min_peaks_pos = np.mean(interest_pos) # np.min(interest_pos) + min_peaks_neg = 0 # np.min(interest_neg) + + # $print(min_peaks_pos) + dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier + # print(interest_pos) + grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + + interest_neg_fin = interest_neg[(interest_neg < grenze)] + peaks_neg_fin = peaks_neg[(interest_neg < grenze)] + + num_col = (len(interest_neg_fin)) + 1 + + p_l = 0 + p_u = len(y) - 1 + p_m = int(len(y) / 2.0) + p_g_l = int(len(y) / 3.0) + p_g_u = len(y) - int(len(y) / 3.0) + + if num_col == 3: + if (peaks_neg_fin[0] > p_g_u and peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] < p_g_l) or (peaks_neg_fin[0] < p_m and peaks_neg_fin[1] < p_m) or (peaks_neg_fin[0] > p_m and peaks_neg_fin[1] > p_m): + num_col = 1 + else: + pass + + if num_col == 2: + if (peaks_neg_fin[0] > p_g_u) or (peaks_neg_fin[0] < p_g_l): + num_col = 1 + else: + pass + + diff_peaks = np.abs(np.diff(peaks_neg_fin)) + + cut_off = 400 + peaks_neg_true = [] + forest = [] + + for i in range(len(peaks_neg_fin)): + if i == 0: + forest.append(peaks_neg_fin[i]) + if i < (len(peaks_neg_fin) - 1): + if diff_peaks[i] <= cut_off: + forest.append(peaks_neg_fin[i + 1]) + if diff_peaks[i] > cut_off: + # print(forest[np.argmin(z[forest]) ] ) + if not isNaN(forest[np.argmin(z[forest])]): + peaks_neg_true.append(forest[np.argmin(z[forest])]) + forest = [] + forest.append(peaks_neg_fin[i + 1]) + if i == (len(peaks_neg_fin) - 1): + # print(print(forest[np.argmin(z[forest]) ] )) + if not isNaN(forest[np.argmin(z[forest])]): + peaks_neg_true.append(forest[np.argmin(z[forest])]) + + num_col = (len(peaks_neg_true)) + 1 + p_l = 0 + p_u = len(y) - 1 + p_m = int(len(y) / 2.0) + p_quarter = int(len(y) / 4.0) + p_g_l = int(len(y) / 3.0) + p_g_u = len(y) - int(len(y) / 3.0) + + p_u_quarter = len(y) - p_quarter + + if num_col == 3: + if (peaks_neg_true[0] > p_g_u and peaks_neg_true[1] > p_g_u) or (peaks_neg_true[0] < p_g_l and peaks_neg_true[1] < p_g_l) or (peaks_neg_true[0] < p_m and peaks_neg_true[1] < p_m) or (peaks_neg_true[0] > p_m and peaks_neg_true[1] > p_m): + num_col = 1 + peaks_neg_true = [] + elif (peaks_neg_true[0] < p_g_u and peaks_neg_true[0] > p_g_l) and (peaks_neg_true[1] > p_u_quarter): + peaks_neg_true = [peaks_neg_true[0]] + elif (peaks_neg_true[1] < p_g_u and peaks_neg_true[1] > p_g_l) and (peaks_neg_true[0] < p_quarter): + peaks_neg_true = [peaks_neg_true[1]] + else: + pass + + if num_col == 2: + if (peaks_neg_true[0] > p_g_u) or (peaks_neg_true[0] < p_g_l): + num_col = 1 + peaks_neg_true = [] + + if num_col == 4: + if len(np.array(peaks_neg_true)[np.array(peaks_neg_true) < p_g_l]) == 2 or len(np.array(peaks_neg_true)[np.array(peaks_neg_true) > (len(y) - p_g_l)]) == 2: + num_col = 1 + peaks_neg_true = [] + else: + pass + + # no deeper hill around found hills + + peaks_fin_true = [] + for i in range(len(peaks_neg_true)): + hill_main = peaks_neg_true[i] + # deep_depth=z[peaks_neg] + hills_around = peaks_neg_org[((peaks_neg_org > hill_main) & (peaks_neg_org <= hill_main + 400)) | ((peaks_neg_org < hill_main) & (peaks_neg_org >= hill_main - 400))] + deep_depth_around = z[hills_around] + + # print(hill_main,z[hill_main],hills_around,deep_depth_around,'manoooo') + try: + if np.min(deep_depth_around) < z[hill_main]: + pass + else: + peaks_fin_true.append(hill_main) + except: + pass + + diff_peaks_annormal = diff_peaks[diff_peaks < 360] + + if len(diff_peaks_annormal) > 0: + arg_help = np.array(range(len(diff_peaks))) + arg_help_ann = arg_help[diff_peaks < 360] + + peaks_neg_fin_new = [] + + for ii in range(len(peaks_neg_fin)): + if ii in arg_help_ann: + arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) + if arg_min == 0: + peaks_neg_fin_new.append(peaks_neg_fin[ii]) + else: + peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) + + elif (ii - 1) in arg_help_ann: + pass + else: + peaks_neg_fin_new.append(peaks_neg_fin[ii]) + else: + peaks_neg_fin_new = peaks_neg_fin + + # sometime pages with one columns gives also some negative peaks. delete those peaks + param = z[peaks_neg_true] / float(min_peaks_pos) * 100 + + if len(param[param <= 41]) == 0: + peaks_neg_true = [] + + return len(peaks_fin_true), peaks_fin_true + +def find_num_col_by_vertical_lines(regions_without_seperators, multiplier=3.8): + regions_without_seperators_0 = regions_without_seperators[:, :, 0].sum(axis=0) + + ##plt.plot(regions_without_seperators_0) + ##plt.show() + + sigma_ = 35 # 70#35 + + z = gaussian_filter1d(regions_without_seperators_0, sigma_) + + peaks, _ = find_peaks(z, height=0) + + # print(peaks,'peaksnew') + return peaks + +def contours_in_same_horizon(cy_main_hor): + X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) + X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) + + X1[0::1, :] = cy_main_hor[:] + X2 = X1.T + + X_dif = np.abs(X2 - X1) + args_help = np.array(range(len(cy_main_hor))) + all_args = [] + for i in range(len(cy_main_hor)): + list_h = list(args_help[X_dif[i, :] <= 20]) + list_h.append(i) + if len(list_h) > 1: + all_args.append(list(set(list_h))) + return np.unique(all_args) + +def find_contours_mean_y_diff(contours_main): + M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] + cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + return np.mean(np.diff(np.sort(np.array(cy_main)))) +