From 3bb516bfb5c4f2dc284e8e3f5b820a7061d6613f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 24 Nov 2020 19:49:44 +0100 Subject: [PATCH] more extraction of util/unused functions --- sbb_newspapers_org_image/eynollah.py | 1149 +------------------------- sbb_newspapers_org_image/unused.py | 335 ++++++++ sbb_newspapers_org_image/utils.py | 746 +++++++++++++++++ 3 files changed, 1122 insertions(+), 1108 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index cb05881..79554c2 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -78,6 +78,17 @@ from .utils import ( delete_seperator_around, return_regions_without_seperators, return_deskew_slop, + put_drop_out_from_only_drop_model, + putt_bb_of_drop_capitals_of_model_in_patches_in_layout, + check_any_text_region_in_model_one_is_main_or_header, + small_textlines_to_parent_adherence2, + return_contours_of_interested_region_by_size, + order_and_id_of_texts, + order_of_regions, + implent_law_head_main_not_parallel, + return_hor_spliter_by_index, + combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new, + return_points_with_boundies, ) @@ -3397,22 +3408,6 @@ class eynollah: # cv2.imwrite(os.path.join(dir_of_image, self.f_name) + ".tif",self.image_org) - def image_change_background_pixels_to_zero(self, image_page): - image_back_zero = np.zeros((image_page.shape[0], image_page.shape[1])) - image_back_zero[:, :] = image_page[:, :, 0] - image_back_zero[:, :][image_back_zero[:, :] == 0] = -255 - image_back_zero[:, :][image_back_zero[:, :] == 255] = 0 - image_back_zero[:, :][image_back_zero[:, :] == -255] = 255 - return image_back_zero - - def return_points_with_boundies(self, peaks_neg_fin, first_point, last_point): - peaks_neg_tot = [] - peaks_neg_tot.append(first_point) - for ii in range(len(peaks_neg_fin)): - peaks_neg_tot.append(peaks_neg_fin[ii]) - peaks_neg_tot.append(last_point) - return 
peaks_neg_tot - def return_boxes_of_images_by_order_of_reading_without_seperators(self, spliter_y_new, image_p_rev, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): boxes = [] @@ -3444,7 +3439,7 @@ class eynollah: cy_hor_some = matrix_new[:, 5][(matrix_new[:, 9] == 0)] arg_org_hor_some = matrix_new[:, 0][(matrix_new[:, 9] == 0)] - peaks_neg_tot = self.return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) + peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) start_index_of_hor, newest_peaks, arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = return_hor_spliter_by_index_for_without_verticals(peaks_neg_tot, x_min_hor_some, x_max_hor_some) @@ -3597,7 +3592,7 @@ class eynollah: peaks_neg_ch = peaks_neg_ch[:] + newest_peaks[j] - peaks_neg_ch_tot = self.return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) + peaks_neg_ch_tot = return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = return_hor_spliter_by_index_for_without_verticals(peaks_neg_ch_tot, x_min_ch, x_max_ch) @@ -3728,7 +3723,7 @@ class eynollah: peaks_neg_fin = [] num_col = 0 - peaks_neg_tot = self.return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) + peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) for kh in range(len(peaks_neg_tot) - 1): boxes.append([peaks_neg_tot[kh], peaks_neg_tot[kh + 1], spliter_y_new[i], spliter_y_new[i + 1]]) @@ -3737,314 +3732,6 @@ class eynollah: return boxes - def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back(self, regions_pre_p): - seperators_closeup = ((regions_pre_p[:, :] == 6)) * 1 - - seperators_closeup = seperators_closeup.astype(np.uint8) - kernel = np.ones((5, 5), np.uint8) - - seperators_closeup = cv2.dilate(seperators_closeup, kernel, 
iterations=1) - seperators_closeup = cv2.erode(seperators_closeup, kernel, iterations=1) - - seperators_closeup = cv2.erode(seperators_closeup, kernel, iterations=1) - seperators_closeup = cv2.dilate(seperators_closeup, kernel, iterations=1) - - if len(seperators_closeup.shape) == 2: - seperators_closeup_n = np.zeros((seperators_closeup.shape[0], seperators_closeup.shape[1], 3)) - seperators_closeup_n[:, :, 0] = seperators_closeup - seperators_closeup_n[:, :, 1] = seperators_closeup - seperators_closeup_n[:, :, 2] = seperators_closeup - else: - seperators_closeup_n = seperators_closeup[:, :, :] - # seperators_closeup=seperators_closeup.astype(np.uint8) - seperators_closeup_n = seperators_closeup_n.astype(np.uint8) - imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) - - dist_y = np.abs(y_max_main - y_min_main) - - slope_lines_org_hor = slope_lines_org[slope_lines == 0] - args = np.array(range(len(slope_lines))) - len_x = seperators_closeup.shape[1] * 0 - len_y = seperators_closeup.shape[0] * 0.01 - - args_hor = args[slope_lines == 0] - dist_x_hor = dist_x[slope_lines == 0] - dist_y_hor = dist_y[slope_lines == 0] - x_min_main_hor = x_min_main[slope_lines == 0] - x_max_main_hor = x_max_main[slope_lines == 0] - cy_main_hor = cy_main[slope_lines == 0] - y_min_main_hor = y_min_main[slope_lines == 0] - y_max_main_hor = y_max_main[slope_lines == 0] - - args_hor = args_hor[dist_x_hor >= len_x] - x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] - x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] - cy_main_hor = cy_main_hor[dist_x_hor >= len_x] - y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] - y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] - 
slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x] - dist_y_hor = dist_y_hor[dist_x_hor >= len_x] - dist_x_hor = dist_x_hor[dist_x_hor >= len_x] - - args_ver = args[slope_lines == 1] - dist_y_ver = dist_y[slope_lines == 1] - dist_x_ver = dist_x[slope_lines == 1] - x_min_main_ver = x_min_main[slope_lines == 1] - x_max_main_ver = x_max_main[slope_lines == 1] - y_min_main_ver = y_min_main[slope_lines == 1] - y_max_main_ver = y_max_main[slope_lines == 1] - cx_main_ver = cx_main[slope_lines == 1] - - args_ver = args_ver[dist_y_ver >= len_y] - x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] - x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] - cx_main_ver = cx_main_ver[dist_y_ver >= len_y] - y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] - y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] - dist_x_ver = dist_x_ver[dist_y_ver >= len_y] - dist_y_ver = dist_y_ver[dist_y_ver >= len_y] - - img_p_in_ver = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(args_ver)): - img_p_in_ver = cv2.fillPoly(img_p_in_ver, pts=[contours_lines[args_ver[jv]]], color=(1, 1, 1)) - - img_in_hor = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(args_hor)): - img_p_in_hor = cv2.fillPoly(img_in_hor, pts=[contours_lines[args_hor[jv]]], color=(1, 1, 1)) - - all_args_uniq = contours_in_same_horizon(cy_main_hor) - # print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq) > 0: - if type(all_args_uniq[0]) is list: - contours_new = [] - for dd in range(len(all_args_uniq)): - merged_all = None - some_args = args_hor[all_args_uniq[dd]] - some_cy = cy_main_hor[all_args_uniq[dd]] - some_x_min = x_min_main_hor[all_args_uniq[dd]] - some_x_max = x_max_main_hor[all_args_uniq[dd]] - - img_in = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(some_args)): - - img_p_in = cv2.fillPoly(img_p_in_hor, pts=[contours_lines[some_args[jv]]], color=(1, 1, 1)) - img_p_in[int(np.mean(some_cy)) - 5 : int(np.mean(some_cy)) + 5, 
int(np.min(some_x_min)) : int(np.max(some_x_max))] = 1 - - else: - img_p_in = seperators_closeup - else: - img_p_in = seperators_closeup - - sep_ver_hor = img_p_in + img_p_in_ver - sep_ver_hor_cross = (sep_ver_hor == 2) * 1 - - sep_ver_hor_cross = np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) - sep_ver_hor_cross = sep_ver_hor_cross.astype(np.uint8) - imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_cross, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cx_cross, cy_cross, _, _, _, _, _ = find_new_features_of_contoures(contours_cross) - - for ii in range(len(cx_cross)): - sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) + 5 : int(cx_cross[ii]) + 40] = 0 - sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) - 40 : int(cx_cross[ii]) - 4] = 0 - - img_p_in[:, :] = sep_ver_hor[:, :] - - if len(img_p_in.shape) == 2: - seperators_closeup_n = np.zeros((img_p_in.shape[0], img_p_in.shape[1], 3)) - seperators_closeup_n[:, :, 0] = img_p_in - seperators_closeup_n[:, :, 1] = img_p_in - seperators_closeup_n[:, :, 2] = img_p_in - else: - seperators_closeup_n = img_p_in[:, :, :] - # seperators_closeup=seperators_closeup.astype(np.uint8) - seperators_closeup_n = seperators_closeup_n.astype(np.uint8) - imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) - - dist_y = np.abs(y_max_main - y_min_main) - - slope_lines_org_hor = slope_lines_org[slope_lines == 0] - args = np.array(range(len(slope_lines))) - len_x = seperators_closeup.shape[1] * 0.04 - len_y = seperators_closeup.shape[0] * 0.08 - - args_hor = args[slope_lines == 
0] - dist_x_hor = dist_x[slope_lines == 0] - dist_y_hor = dist_y[slope_lines == 0] - x_min_main_hor = x_min_main[slope_lines == 0] - x_max_main_hor = x_max_main[slope_lines == 0] - cy_main_hor = cy_main[slope_lines == 0] - y_min_main_hor = y_min_main[slope_lines == 0] - y_max_main_hor = y_max_main[slope_lines == 0] - - args_hor = args_hor[dist_x_hor >= len_x] - x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] - x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] - cy_main_hor = cy_main_hor[dist_x_hor >= len_x] - y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] - y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] - slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x] - dist_y_hor = dist_y_hor[dist_x_hor >= len_x] - dist_x_hor = dist_x_hor[dist_x_hor >= len_x] - - args_ver = args[slope_lines == 1] - dist_y_ver = dist_y[slope_lines == 1] - dist_x_ver = dist_x[slope_lines == 1] - x_min_main_ver = x_min_main[slope_lines == 1] - x_max_main_ver = x_max_main[slope_lines == 1] - y_min_main_ver = y_min_main[slope_lines == 1] - y_max_main_ver = y_max_main[slope_lines == 1] - cx_main_ver = cx_main[slope_lines == 1] - - args_ver = args_ver[dist_y_ver >= len_y] - x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] - x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] - cx_main_ver = cx_main_ver[dist_y_ver >= len_y] - y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] - y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] - dist_x_ver = dist_x_ver[dist_y_ver >= len_y] - dist_y_ver = dist_y_ver[dist_y_ver >= len_y] - - matrix_of_lines_ch = np.zeros((len(cy_main_hor) + len(cx_main_ver), 10)) - - matrix_of_lines_ch[: len(cy_main_hor), 0] = args_hor - matrix_of_lines_ch[len(cy_main_hor) :, 0] = args_ver - - matrix_of_lines_ch[len(cy_main_hor) :, 1] = cx_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 2] = x_min_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 2] = x_min_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 3] = x_max_main_hor - 
matrix_of_lines_ch[len(cy_main_hor) :, 3] = x_max_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 4] = dist_x_hor - matrix_of_lines_ch[len(cy_main_hor) :, 4] = dist_x_ver - - matrix_of_lines_ch[: len(cy_main_hor), 5] = cy_main_hor - - matrix_of_lines_ch[: len(cy_main_hor), 6] = y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 6] = y_min_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 7] = y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 7] = y_max_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 8] = dist_y_hor - matrix_of_lines_ch[len(cy_main_hor) :, 8] = dist_y_ver - - matrix_of_lines_ch[len(cy_main_hor) :, 9] = 1 - - return matrix_of_lines_ch, seperators_closeup_n - - def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(self, img_p_in_ver, img_in_hor): - - # plt.imshow(img_in_hor) - # plt.show() - - # img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) - img_p_in_ver = img_p_in_ver.astype(np.uint8) - img_p_in_ver = np.repeat(img_p_in_ver[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_p_in_ver, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines_ver, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - slope_lines_ver, dist_x_ver, x_min_main_ver, x_max_main_ver, cy_main_ver, slope_lines_org_ver, y_min_main_ver, y_max_main_ver, cx_main_ver = find_features_of_lines(contours_lines_ver) - - for i in range(len(x_min_main_ver)): - img_p_in_ver[int(y_min_main_ver[i]) : int(y_min_main_ver[i]) + 30, int(cx_main_ver[i]) - 25 : int(cx_main_ver[i]) + 25, 0] = 0 - img_p_in_ver[int(y_max_main_ver[i]) - 30 : int(y_max_main_ver[i]), int(cx_main_ver[i]) - 25 : int(cx_main_ver[i]) + 25, 0] = 0 - - # plt.imshow(img_p_in_ver[:,:,0]) - # plt.show() - img_in_hor = img_in_hor.astype(np.uint8) - img_in_hor = np.repeat(img_in_hor[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(img_in_hor, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 
0, 255, 0) - - contours_lines_hor, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, slope_lines_org_hor, y_min_main_hor, y_max_main_hor, cx_main_hor = find_features_of_lines(contours_lines_hor) - - args_hor = np.array(range(len(slope_lines_hor))) - all_args_uniq = contours_in_same_horizon(cy_main_hor) - # print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq) > 0: - if type(all_args_uniq[0]) is list: - special_seperators = [] - contours_new = [] - for dd in range(len(all_args_uniq)): - merged_all = None - some_args = args_hor[all_args_uniq[dd]] - some_cy = cy_main_hor[all_args_uniq[dd]] - some_x_min = x_min_main_hor[all_args_uniq[dd]] - some_x_max = x_max_main_hor[all_args_uniq[dd]] - - # img_in=np.zeros(seperators_closeup_n[:,:,2].shape) - for jv in range(len(some_args)): - - img_p_in = cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1, 1, 1)) - img_p_in[int(np.mean(some_cy)) - 5 : int(np.mean(some_cy)) + 5, int(np.min(some_x_min)) : int(np.max(some_x_max))] = 1 - - sum_dis = dist_x_hor[some_args].sum() - diff_max_min_uniques = np.max(x_max_main_hor[some_args]) - np.min(x_min_main_hor[some_args]) - - # print( sum_dis/float(diff_max_min_uniques) ,diff_max_min_uniques/float(img_p_in_ver.shape[1]),dist_x_hor[some_args].sum(),diff_max_min_uniques,np.mean( dist_x_hor[some_args]),np.std( dist_x_hor[some_args]) ) - - if diff_max_min_uniques > sum_dis and ((sum_dis / float(diff_max_min_uniques)) > 0.85) and ((diff_max_min_uniques / float(img_p_in_ver.shape[1])) > 0.85) and np.std(dist_x_hor[some_args]) < (0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args],dist_x_hor[some_args].sum(),np.min(x_min_main_hor[some_args]) ,np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ),np.std( dist_x_hor[some_args] ),np.var( dist_x_hor[some_args] ),'jalibdiha') - 
special_seperators.append(np.mean(cy_main_hor[some_args])) - - else: - img_p_in = img_in_hor - special_seperators = [] - else: - img_p_in = img_in_hor - special_seperators = [] - - img_p_in_ver[:, :, 0][img_p_in_ver[:, :, 0] == 255] = 1 - # print(img_p_in_ver.shape,np.unique(img_p_in_ver[:,:,0])) - - # plt.imshow(img_p_in[:,:,0]) - # plt.show() - - # plt.imshow(img_p_in_ver[:,:,0]) - # plt.show() - sep_ver_hor = img_p_in + img_p_in_ver - # print(sep_ver_hor.shape,np.unique(sep_ver_hor[:,:,0]),'sep_ver_horsep_ver_horsep_ver_hor') - # plt.imshow(sep_ver_hor[:,:,0]) - # plt.show() - - sep_ver_hor_cross = (sep_ver_hor[:, :, 0] == 2) * 1 - - sep_ver_hor_cross = np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) - sep_ver_hor_cross = sep_ver_hor_cross.astype(np.uint8) - imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_cross, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cx_cross, cy_cross, _, _, _, _, _ = find_new_features_of_contoures(contours_cross) - - for ii in range(len(cx_cross)): - img_p_in[int(cy_cross[ii]) - 30 : int(cy_cross[ii]) + 30, int(cx_cross[ii]) + 5 : int(cx_cross[ii]) + 40, 0] = 0 - img_p_in[int(cy_cross[ii]) - 30 : int(cy_cross[ii]) + 30, int(cx_cross[ii]) - 40 : int(cx_cross[ii]) - 4, 0] = 0 - - # plt.imshow(img_p_in[:,:,0]) - # plt.show() - - return img_p_in[:, :, 0], special_seperators - def return_boxes_of_images_by_order_of_reading(self, spliter_y_new, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): boxes = [] @@ -4075,9 +3762,9 @@ class eynollah: cy_hor_some = matrix_new[:, 5][(matrix_new[:, 9] == 0)] arg_org_hor_some = matrix_new[:, 0][(matrix_new[:, 9] == 0)] - peaks_neg_tot = self.return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) + peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) - start_index_of_hor, newest_peaks, 
arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = self.return_hor_spliter_by_index(peaks_neg_tot, x_min_hor_some, x_max_hor_some) + start_index_of_hor, newest_peaks, arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = return_hor_spliter_by_index(peaks_neg_tot, x_min_hor_some, x_max_hor_some) arg_org_hor_some_sort = arg_org_hor_some[arg_min_hor_sort] @@ -4237,9 +3924,9 @@ class eynollah: # for djh in range(len(peaks_neg_ch)): # peaks_neg_ch.append( peaks_neg_ch[djh]+newest_peaks[j] ) - peaks_neg_ch_tot = self.return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) + peaks_neg_ch_tot = return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) - ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = self.return_hor_spliter_by_index(peaks_neg_ch_tot, x_min_ch, x_max_ch) + ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = return_hor_spliter_by_index(peaks_neg_ch_tot, x_min_ch, x_max_ch) newest_y_spliter_ch_tot = [] @@ -4371,7 +4058,7 @@ class eynollah: cy_hor_some = matrix_new[:, 5][(matrix_new[:, 9] == 0)] arg_org_hor_some = matrix_new[:, 0][(matrix_new[:, 9] == 0)] - peaks_neg_tot = self.return_points_with_boundies(peaks_neg_fin, 0, regions_without_seperators[:, :].shape[1]) + peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, regions_without_seperators[:, :].shape[1]) start_index_of_hor, newest_peaks, arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = return_hor_spliter_by_index_for_without_verticals(peaks_neg_tot, x_min_hor_some, x_max_hor_some) @@ -4536,7 +4223,7 @@ class eynollah: # for djh in range(len(peaks_neg_ch)): # peaks_neg_ch.append( peaks_neg_ch[djh]+newest_peaks[j] ) - peaks_neg_ch_tot = self.return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) + peaks_neg_ch_tot = return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = 
return_hor_spliter_by_index_for_without_verticals(peaks_neg_ch_tot, x_min_ch, x_max_ch) @@ -4670,7 +4357,7 @@ class eynollah: peaks_neg_fin = [] num_col = 0 - peaks_neg_tot = self.return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) + peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) for kh in range(len(peaks_neg_tot) - 1): boxes.append([peaks_neg_tot[kh], peaks_neg_tot[kh + 1], spliter_y_new[i], spliter_y_new[i + 1]]) @@ -4680,73 +4367,6 @@ class eynollah: return boxes - def return_hor_spliter_by_index(self, peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): - - arg_min_hor_sort = np.argsort(x_min_hor_some) - x_min_hor_some_sort = np.sort(x_min_hor_some) - x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] - - arg_minmax = np.array(range(len(peaks_neg_fin_t))) - indexer_lines = [] - indexes_to_delete = [] - indexer_lines_deletions_len = [] - indexr_uniq_ind = [] - for i in range(len(x_min_hor_some_sort)): - min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] - max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] - - min_h[0] = min_h[0] # +20 - max_h[len(max_h) - 1] = max_h[len(max_h) - 1] ##-20 - - min_h_neg = arg_minmax[(min_h < 0) & (np.abs(min_h) < 360)] - max_h_neg = arg_minmax[(max_h >= 0) & (np.abs(max_h) < 360)] - - if len(min_h_neg) > 0 and len(max_h_neg) > 0: - deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) - unique_delets_int = [] - # print(deletions,len(deletions),'delii') - if len(deletions) > 0: - # print(deletions,len(deletions),'delii2') - - for j in range(len(deletions)): - indexes_to_delete.append(deletions[j]) - # print(deletions,indexes_to_delete,'badiii') - unique_delets = np.unique(indexes_to_delete) - # print(min_h_neg[0],unique_delets) - unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] - - indexer_lines_deletions_len.append(len(deletions)) - indexr_uniq_ind.append([deletions]) - - else: - indexer_lines_deletions_len.append(0) - 
indexr_uniq_ind.append(-999) - - index_line_true = min_h_neg[0] - len(unique_delets_int) - # print(index_line_true) - if index_line_true > 0 and min_h_neg[0] >= 2: - index_line_true = index_line_true - else: - index_line_true = min_h_neg[0] - - indexer_lines.append(index_line_true) - - if len(unique_delets_int) > 0: - for dd in range(len(unique_delets_int)): - indexes_to_delete.append(unique_delets_int[dd]) - else: - indexer_lines.append(-999) - indexer_lines_deletions_len.append(-999) - indexr_uniq_ind.append(-999) - - peaks_true = [] - for m in range(len(peaks_neg_fin_t)): - if m in indexes_to_delete: - pass - else: - peaks_true.append(peaks_neg_fin_t[m]) - return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind - def return_region_segmentation_after_implementing_not_head_maintext_parallel(self, image_regions_eraly_p, boxes): image_revised = np.zeros((image_regions_eraly_p.shape[0], image_regions_eraly_p.shape[1])) for i in range(len(boxes)): @@ -4757,113 +4377,13 @@ class eynollah: # plt.show() # print(int(boxes[i][2]),int(boxes[i][3]),int(boxes[i][0]),int(boxes[i][1]),'addaa') - image_box = self.implent_law_head_main_not_parallel(image_box) - image_box = self.implent_law_head_main_not_parallel(image_box) - image_box = self.implent_law_head_main_not_parallel(image_box) + image_box = implent_law_head_main_not_parallel(image_box) + image_box = implent_law_head_main_not_parallel(image_box) + image_box = implent_law_head_main_not_parallel(image_box) image_revised[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1])] = image_box[:, :] return image_revised - def tear_main_texts_on_the_boundaries_of_boxes(self, img_revised_tab, boxes): - for i in range(len(boxes)): - img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 0][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 0] == 1] = 0 - img_revised_tab[int(boxes[i][2]) : 
int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 1][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 1] == 1] = 0 - img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 2][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 2] == 1] = 0 - return img_revised_tab - - def implent_law_head_main_not_parallel(self, text_regions): - # print(text_regions.shape) - text_indexes = [1, 2] # 1: main text , 2: header , 3: comments - - for t_i in text_indexes: - textline_mask = text_regions[:, :] == t_i - textline_mask = textline_mask * 255.0 - - textline_mask = textline_mask.astype(np.uint8) - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) - kernel = np.ones((5, 5), np.uint8) - - # print(type(textline_mask),np.unique(textline_mask),textline_mask.shape) - imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - if t_i == 1: - contours_main, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_main)) - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - # print(contours_main[0],np.shape(contours_main[0]),contours_main[0][:,0,0]) - elif t_i 
== 2: - contours_header, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_header)) - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - - args = np.array(range(1, len(cy_header) + 1)) - args_main = np.array(range(1, len(cy_main) + 1)) - for jj in range(len(contours_main)): - headers_in_main = [(cy_header > y_min_main[jj]) & ((cy_header < y_max_main[jj]))] - mains_in_main = [(cy_main > y_min_main[jj]) & ((cy_main < y_max_main[jj]))] - args_log = args * headers_in_main - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_main = args_main * mains_in_main - res_main = args_log_main[args_log_main > 0] - res_true_main = res_main - 1 - - if len(res_true) > 0: - sum_header = np.sum(areas_header[res_true]) - sum_main = np.sum(areas_main[res_true_main]) - if sum_main > sum_header: - cnt_int = [contours_header[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true_main] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) - - for jj in range(len(contours_header)): - main_in_header = [(cy_main > y_min_header[jj]) & ((cy_main < y_max_header[jj]))] - 
header_in_header = [(cy_header > y_min_header[jj]) & ((cy_header < y_max_header[jj]))] - args_log = args_main * main_in_header - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_header = args * header_in_header - res_header = args_log_header[args_log_header > 0] - res_true_header = res_header - 1 - - if len(res_true) > 0: - - sum_header = np.sum(areas_header[res_true_header]) - sum_main = np.sum(areas_main[res_true]) - - if sum_main > sum_header: - - cnt_int = [contours_header[j] for j in res_true_header] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) - - return text_regions - - def add_tables_heuristic_to_layout(self, image_regions_eraly_p, boxes, slope_mean_hor, spliter_y, peaks_neg_tot, image_revised): image_revised_1 = delete_seperator_around(spliter_y, peaks_neg_tot, image_revised) @@ -4979,320 +4499,6 @@ class eynollah: ##plt.show() return image_revised_last - def remove_headers_and_mains_intersection(self, seperators_closeup_n, img_revised_tab, boxes): - for ind in range(len(boxes)): - asp = np.zeros((img_revised_tab[:, :, 0].shape[0], seperators_closeup_n[:, :, 0].shape[1])) - asp[int(boxes[ind][2]) : int(boxes[ind][3]), int(boxes[ind][0]) : int(boxes[ind][1])] = img_revised_tab[int(boxes[ind][2]) : int(boxes[ind][3]), int(boxes[ind][0]) : int(boxes[ind][1]), 0] - - head_patch_con = (asp[:, :] == 2) * 1 - main_patch_con = (asp[:, :] == 1) * 1 - # print(head_patch_con) - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, 
cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - y_patch_head_min, y_patch_head_max, _ = find_features_of_contours(contours_head_patch_con) - y_patch_main_min, y_patch_main_max, _ = find_features_of_contours(contours_main_patch_con) - - for i in range(len(y_patch_head_min)): - for j in range(len(y_patch_main_min)): - if y_patch_head_max[i] > y_patch_main_min[j] and y_patch_head_min[i] < y_patch_main_min[j]: - y_down = y_patch_head_max[i] - y_up = y_patch_main_min[j] - - patch_intersection = np.zeros(asp.shape) - patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] - - head_patch_con = (patch_intersection[:, :] == 2) * 1 - main_patch_con = (patch_intersection[:, :] == 1) * 1 - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - _, _, areas_head = find_features_of_contours(contours_head_patch_con) 
- _, _, areas_main = find_features_of_contours(contours_main_patch_con) - - if np.sum(areas_head) > np.sum(areas_main): - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 1] = 2 - else: - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 2] = 1 - - elif y_patch_head_min[i] < y_patch_main_max[j] and y_patch_head_max[i] > y_patch_main_max[j]: - y_down = y_patch_main_max[j] - y_up = y_patch_head_min[i] - - patch_intersection = np.zeros(asp.shape) - patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] - - head_patch_con = (patch_intersection[:, :] == 2) * 1 - main_patch_con = (patch_intersection[:, :] == 1) * 1 - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - _, _, areas_head = find_features_of_contours(contours_head_patch_con) - _, _, areas_main = find_features_of_contours(contours_main_patch_con) - - if np.sum(areas_head) > np.sum(areas_main): - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : 
int(boxes[ind][1]), 0] == 1] = 2 - else: - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 2] = 1 - - # print(np.unique(patch_intersection) ) - ##plt.figure(figsize=(20,20)) - ##plt.imshow(patch_intersection) - ##plt.show() - else: - pass - - return img_revised_tab - - def order_of_regions(self, textline_mask, contours_main, contours_header, y_ref): - - ##plt.imshow(textline_mask) - ##plt.show() - """ - print(len(contours_main),'contours_main') - mada_n=textline_mask.sum(axis=1) - y=mada_n[:] - - y_help=np.zeros(len(y)+40) - y_help[20:len(y)+20]=y - x=np.array( range(len(y)) ) - - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - - ##plt.imshow(textline_mask[:,:]) - ##plt.show() - - - sigma_gaus=8 - - z= gaussian_filter1d(y_help, sigma_gaus) - zneg_rev=-y_help+np.max(y_help) - - zneg=np.zeros(len(zneg_rev)+40) - zneg[20:len(zneg_rev)+20]=zneg_rev - zneg= gaussian_filter1d(zneg, sigma_gaus) - - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - - peaks_neg=peaks_neg-20-20 - peaks=peaks-20 - """ - - textline_sum_along_width = textline_mask.sum(axis=1) - - y = textline_sum_along_width[:] - y_padded = np.zeros(len(y) + 40) - y_padded[20 : len(y) + 20] = y - x = np.array(range(len(y))) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - - sigma_gaus = 8 - - z = gaussian_filter1d(y_padded, sigma_gaus) - zneg_rev = -y_padded + np.max(y_padded) - - zneg = np.zeros(len(zneg_rev) + 40) - zneg[20 : len(zneg_rev) + 20] = zneg_rev - zneg = gaussian_filter1d(zneg, sigma_gaus) - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - - peaks_neg = peaks_neg - 20 - 20 - peaks = peaks - 20 - - ##plt.plot(z) - ##plt.show() - - if contours_main != None: - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j 
in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - - if len(contours_header) != None: - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - # print(cy_main,'mainy') - - peaks_neg_new = [] - - peaks_neg_new.append(0 + y_ref) - for iii in range(len(peaks_neg)): - peaks_neg_new.append(peaks_neg[iii] + y_ref) - - peaks_neg_new.append(textline_mask.shape[0] + y_ref) - - if len(cy_main) > 0 and np.max(cy_main) > np.max(peaks_neg_new): - cy_main = np.array(cy_main) * (np.max(peaks_neg_new) / np.max(cy_main)) - 10 - - if contours_main != None: - indexer_main = np.array(range(len(contours_main))) - - if contours_main != None: - len_main = len(contours_main) - else: - len_main = 0 - - matrix_of_orders = 
np.zeros((len(contours_main) + len(contours_header), 5)) - - matrix_of_orders[:, 0] = np.array(range(len(contours_main) + len(contours_header))) - - matrix_of_orders[: len(contours_main), 1] = 1 - matrix_of_orders[len(contours_main) :, 1] = 2 - - matrix_of_orders[: len(contours_main), 2] = cx_main - matrix_of_orders[len(contours_main) :, 2] = cx_header - - matrix_of_orders[: len(contours_main), 3] = cy_main - matrix_of_orders[len(contours_main) :, 3] = cy_header - - matrix_of_orders[: len(contours_main), 4] = np.array(range(len(contours_main))) - matrix_of_orders[len(contours_main) :, 4] = np.array(range(len(contours_header))) - - # print(peaks_neg_new,'peaks_neg_new') - - # print(matrix_of_orders,'matrix_of_orders') - # print(peaks_neg_new,np.max(peaks_neg_new)) - final_indexers_sorted = [] - final_types = [] - final_index_type = [] - for i in range(len(peaks_neg_new) - 1): - top = peaks_neg_new[i] - down = peaks_neg_new[i + 1] - - # print(top,down,'topdown') - - indexes_in = matrix_of_orders[:, 0][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - cxs_in = matrix_of_orders[:, 2][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - cys_in = matrix_of_orders[:, 3][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - types_of_text = matrix_of_orders[:, 1][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - index_types_of_text = matrix_of_orders[:, 4][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - - # print(top,down) - # print(cys_in,'cyyyins') - # print(indexes_in,'indexes') - sorted_inside = np.argsort(cxs_in) - - ind_in_int = indexes_in[sorted_inside] - ind_in_type = types_of_text[sorted_inside] - ind_ind_type = index_types_of_text[sorted_inside] - - for j in range(len(ind_in_int)): - final_indexers_sorted.append(int(ind_in_int[j])) - final_types.append(int(ind_in_type[j])) - final_index_type.append(int(ind_ind_type[j])) - - 
##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - - # print(peaks_neg_new,'peaks') - # print(final_indexers_sorted,'indexsorted') - # print(final_types,'types') - # print(final_index_type,'final_index_type') - - return final_indexers_sorted, matrix_of_orders, final_types, final_index_type - - def order_and_id_of_texts(self, found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point): - indexes_sorted = np.array(indexes_sorted) - index_of_types = np.array(index_of_types) - kind_of_texts = np.array(kind_of_texts) - - id_of_texts = [] - order_of_texts = [] - - index_of_types_1 = index_of_types[kind_of_texts == 1] - indexes_sorted_1 = indexes_sorted[kind_of_texts == 1] - - index_of_types_2 = index_of_types[kind_of_texts == 2] - indexes_sorted_2 = indexes_sorted[kind_of_texts == 2] - - ##print(index_of_types,'index_of_types') - ##print(kind_of_texts,'kind_of_texts') - ##print(len(found_polygons_text_region),'found_polygons_text_region') - ##print(index_of_types_1,'index_of_types_1') - ##print(indexes_sorted_1,'indexes_sorted_1') - index_b = 0 + ref_point - for mm in range(len(found_polygons_text_region)): - - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_1[indexes_sorted_1 == index_of_types_1[mm]] - - if len(interest) > 0: - order_of_texts.append(interest[0]) - index_b += 1 - else: - pass - - for mm in range(len(found_polygons_text_region_h)): - id_of_texts.append("r" + str(index_b)) - interest = indexes_sorted_2[index_of_types_2[mm]] - order_of_texts.append(interest) - index_b += 1 - - return order_of_texts, id_of_texts - def return_teilwiese_deskewed_lines(self, text_regions_p, textline_rotated): kernel = np.ones((5, 5), np.uint8) @@ -5406,7 +4612,7 @@ class eynollah: vertical = cv2.dilate(vertical, kernel, iterations=1) # Show extracted vertical lines - horizontal, special_seperators = 
self.combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(vertical, horizontal) + horizontal, special_seperators = combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(vertical, horizontal) ##plt.imshow(vertical) ##plt.show() @@ -5672,25 +4878,6 @@ class eynollah: return num_col_fin, peaks_neg_fin_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n - def return_contours_of_interested_region_by_size(self, region_pre_p, pixel, min_area, max_area): - - # pixels of images are identified by 5 - if len(region_pre_p.shape) == 3: - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 - else: - cnts_images = (region_pre_p[:, :] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_imgs, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = return_parent_contours(contours_imgs, hiearchy) - contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hiearchy, max_area=max_area, min_area=min_area) - - img_ret = np.zeros((region_pre_p.shape[0], region_pre_p.shape[1], 3)) - img_ret = cv2.fillPoly(img_ret, pts=contours_imgs, color=(1, 1, 1)) - return img_ret[:, :, 0] def get_regions_from_xy_neu(self, img): img_org = np.copy(img) @@ -6723,260 +5910,6 @@ class eynollah: queue_of_all_params.put([index_polygons_per_process_per_process, polygons_per_par_process_per_process, textregions_cnt_tot_per_process, textlines_cnt_tot_per_process]) - def small_textlines_to_parent_adherence2(self, textlines_con, textline_iamge, num_col): - # print(textlines_con) - # textlines_con=textlines_con.astype(np.uint32) - - textlines_con_changed = [] - for m1 in range(len(textlines_con)): - - # textlines_tot=textlines_con[m1] - # textlines_tot=textlines_tot.astype() - textlines_tot = [] - textlines_tot_org_form = 
[] - # print(textlines_tot) - - for nn in range(len(textlines_con[m1])): - textlines_tot.append(np.array(textlines_con[m1][nn], dtype=np.int32)) - textlines_tot_org_form.append(textlines_con[m1][nn]) - - ##img_text_all=np.zeros((textline_iamge.shape[0],textline_iamge.shape[1])) - ##img_text_all=cv2.fillPoly(img_text_all, pts =textlines_tot , color=(1,1,1)) - - ##plt.imshow(img_text_all) - ##plt.show() - areas_cnt_text = np.array([cv2.contourArea(textlines_tot[j]) for j in range(len(textlines_tot))]) - areas_cnt_text = areas_cnt_text / float(textline_iamge.shape[0] * textline_iamge.shape[1]) - indexes_textlines = np.array(range(len(textlines_tot))) - - # print(areas_cnt_text,np.min(areas_cnt_text),np.max(areas_cnt_text)) - if num_col == 0: - min_area = 0.0004 - elif num_col == 1: - min_area = 0.0003 - else: - min_area = 0.0001 - indexes_textlines_small = indexes_textlines[areas_cnt_text < min_area] - - # print(indexes_textlines) - - textlines_small = [] - textlines_small_org_form = [] - for i in indexes_textlines_small: - textlines_small.append(textlines_tot[i]) - textlines_small_org_form.append(textlines_tot_org_form[i]) - - textlines_big = [] - textlines_big_org_form = [] - for i in list(set(indexes_textlines) - set(indexes_textlines_small)): - textlines_big.append(textlines_tot[i]) - textlines_big_org_form.append(textlines_tot_org_form[i]) - - img_textline_s = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_textline_s = cv2.fillPoly(img_textline_s, pts=textlines_small, color=(1, 1, 1)) - - img_textline_b = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_textline_b = cv2.fillPoly(img_textline_b, pts=textlines_big, color=(1, 1, 1)) - - sum_small_big_all = img_textline_s + img_textline_b - sum_small_big_all2 = (sum_small_big_all[:, :] == 2) * 1 - - sum_intersection_sb = sum_small_big_all2.sum(axis=1).sum() - - if sum_intersection_sb > 0: - - dis_small_from_bigs_tot = [] - for z1 in range(len(textlines_small)): - # 
print(len(textlines_small),'small') - intersections = [] - for z2 in range(len(textlines_big)): - img_text = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text = cv2.fillPoly(img_text, pts=[textlines_small[z1]], color=(1, 1, 1)) - - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1])) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z2]], color=(1, 1, 1)) - - sum_small_big = img_text2 + img_text - sum_small_big_2 = (sum_small_big[:, :] == 2) * 1 - - sum_intersection = sum_small_big_2.sum(axis=1).sum() - - # print(sum_intersection) - - intersections.append(sum_intersection) - - if len(np.array(intersections)[np.array(intersections) > 0]) == 0: - intersections = [] - - try: - dis_small_from_bigs_tot.append(np.argmax(intersections)) - except: - dis_small_from_bigs_tot.append(-1) - - smalls_list = np.array(dis_small_from_bigs_tot)[np.array(dis_small_from_bigs_tot) >= 0] - - # index_small_textlines_rest=list( set(indexes_textlines_small)-set(smalls_list) ) - - textlines_big_with_change = [] - textlines_big_with_change_con = [] - textlines_small_with_change = [] - - for z in list(set(smalls_list)): - index_small_textlines = list(np.where(np.array(dis_small_from_bigs_tot) == z)[0]) - # print(z,index_small_textlines) - - img_text2 = np.zeros((textline_iamge.shape[0], textline_iamge.shape[1], 3)) - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_big[z]], color=(255, 255, 255)) - - textlines_big_with_change.append(z) - - for k in index_small_textlines: - img_text2 = cv2.fillPoly(img_text2, pts=[textlines_small[k]], color=(255, 255, 255)) - textlines_small_with_change.append(k) - - img_text2 = img_text2.astype(np.uint8) - imgray = cv2.cvtColor(img_text2, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - cont, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - # print(cont[0],type(cont)) - - textlines_big_with_change_con.append(cont) - textlines_big_org_form[z] = cont[0] - - # 
plt.imshow(img_text2) - # plt.show() - - # print(textlines_big_with_change,'textlines_big_with_change') - # print(textlines_small_with_change,'textlines_small_with_change') - # print(textlines_big) - textlines_con_changed.append(textlines_big_org_form) - - else: - textlines_con_changed.append(textlines_big_org_form) - return textlines_con_changed - - def check_any_text_region_in_model_one_is_main_or_header(self, regions_model_1, regions_model_full, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered): - text_only = (regions_model_1[:, :] == 1) * 1 - contours_only_text, hir_on_text = return_contours_of_image(text_only) - - """ - contours_only_text_parent=return_parent_contours( contours_only_text,hir_on_text) - - areas_cnt_text=np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text=areas_cnt_text/float(text_only.shape[0]*text_only.shape[1]) - - ###areas_cnt_text_h=np.array([cv2.contourArea(contours_only_text_parent_h[j]) for j in range(len(contours_only_text_parent_h))]) - ###areas_cnt_text_h=areas_cnt_text_h/float(text_only_h.shape[0]*text_only_h.shape[1]) - - ###contours_only_text_parent=[contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz]>0.0002] - contours_only_text_parent=[contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz]>0.00001] - """ - - cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = find_new_features_of_contoures(contours_only_text_parent) - - length_con = x_max_main - x_min_main - height_con = y_max_main - y_min_main - - all_found_texline_polygons_main = [] - all_found_texline_polygons_head = [] - - all_box_coord_main = [] - all_box_coord_head = [] - - slopes_main = [] - slopes_head = [] - - contours_only_text_parent_main = [] - contours_only_text_parent_head = [] - - 
contours_only_text_parent_main_d = [] - contours_only_text_parent_head_d = [] - - for ii in range(len(contours_only_text_parent)): - con = contours_only_text_parent[ii] - img = np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3)) - img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) - - all_pixels = ((img[:, :, 0] == 255) * 1).sum() - - pixels_header = (((img[:, :, 0] == 255) & (regions_model_full[:, :, 0] == 2)) * 1).sum() - pixels_main = all_pixels - pixels_header - - if (pixels_header >= pixels_main) and ((length_con[ii] / float(height_con[ii])) >= 1.3): - regions_model_1[:, :][(regions_model_1[:, :] == 1) & (img[:, :, 0] == 255)] = 2 - contours_only_text_parent_head.append(con) - if contours_only_text_parent_d_ordered is not None: - contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) - all_box_coord_head.append(all_box_coord[ii]) - slopes_head.append(slopes[ii]) - all_found_texline_polygons_head.append(all_found_texline_polygons[ii]) - else: - regions_model_1[:, :][(regions_model_1[:, :] == 1) & (img[:, :, 0] == 255)] = 1 - contours_only_text_parent_main.append(con) - if contours_only_text_parent_d_ordered is not None: - contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) - all_box_coord_main.append(all_box_coord[ii]) - slopes_main.append(slopes[ii]) - all_found_texline_polygons_main.append(all_found_texline_polygons[ii]) - - # print(all_pixels,pixels_main,pixels_header) - - # plt.imshow(img[:,:,0]) - # plt.show() - return regions_model_1, contours_only_text_parent_main, contours_only_text_parent_head, all_box_coord_main, all_box_coord_head, all_found_texline_polygons_main, all_found_texline_polygons_head, slopes_main, slopes_head, contours_only_text_parent_main_d, contours_only_text_parent_head_d - - def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(self, layout_in_patch): - - drop_only = (layout_in_patch[:, :, 0] == 4) * 1 - contours_drop, hir_on_drop = 
return_contours_of_image(drop_only) - contours_drop_parent = return_parent_contours(contours_drop, hir_on_drop) - - areas_cnt_text = np.array([cv2.contourArea(contours_drop_parent[j]) for j in range(len(contours_drop_parent))]) - areas_cnt_text = areas_cnt_text / float(drop_only.shape[0] * drop_only.shape[1]) - - contours_drop_parent = [contours_drop_parent[jz] for jz in range(len(contours_drop_parent)) if areas_cnt_text[jz] > 0.00001] - - areas_cnt_text = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > 0.001] - - contours_drop_parent_final = [] - - for jj in range(len(contours_drop_parent)): - x, y, w, h = cv2.boundingRect(contours_drop_parent[jj]) - layout_in_patch[y : y + h, x : x + w, 0] = 4 - - return layout_in_patch - - def put_drop_out_from_only_drop_model(self, layout_no_patch, layout1): - - drop_only = (layout_no_patch[:, :, 0] == 4) * 1 - contours_drop, hir_on_drop = return_contours_of_image(drop_only) - contours_drop_parent = return_parent_contours(contours_drop, hir_on_drop) - - areas_cnt_text = np.array([cv2.contourArea(contours_drop_parent[j]) for j in range(len(contours_drop_parent))]) - areas_cnt_text = areas_cnt_text / float(drop_only.shape[0] * drop_only.shape[1]) - - contours_drop_parent = [contours_drop_parent[jz] for jz in range(len(contours_drop_parent)) if areas_cnt_text[jz] > 0.00001] - - areas_cnt_text = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > 0.00001] - - contours_drop_parent_final = [] - - for jj in range(len(contours_drop_parent)): - x, y, w, h = cv2.boundingRect(contours_drop_parent[jj]) - # boxes.append([int(x), int(y), int(w), int(h)]) - - map_of_drop_contour_bb = np.zeros((layout1.shape[0], layout1.shape[1])) - map_of_drop_contour_bb[y : y + h, x : x + w] = layout1[y : y + h, x : x + w] - - if (((map_of_drop_contour_bb == 1) * 1).sum() / float(((map_of_drop_contour_bb == 5) * 1).sum()) * 100) >= 15: - contours_drop_parent_final.append(contours_drop_parent[jj]) 
- - layout_no_patch[:, :, 0][layout_no_patch[:, :, 0] == 4] = 0 - - layout_no_patch = cv2.fillPoly(layout_no_patch, pts=contours_drop_parent_final, color=(4, 4, 4)) - - return layout_no_patch def do_order_of_regions(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -7022,9 +5955,9 @@ class eynollah: for i in range(len(args_contours_box_h)): con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = self.order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_of_texts, id_of_texts = self.order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] @@ -7105,9 +6038,9 @@ class eynollah: con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = self.order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : 
int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_of_texts, id_of_texts = self.order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] @@ -7175,9 +6108,9 @@ class eynollah: for i in range(len(args_contours_box)): con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = self.order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_of_texts, id_of_texts = self.order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] @@ -7229,9 +6162,9 @@ class eynollah: con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, 
index_by_kind_sorted = self.order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_of_texts, id_of_texts = self.order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] @@ -7944,7 +6877,7 @@ class eynollah: pixel_img = 1 min_area = 0.00001 max_area = 0.0006 - textline_mask_tot_small_size = self.return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) + textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) # text_regions_p_1[(textline_mask_tot[:,:]==1) & (text_regions_p_1[:,:]==2)]=1 @@ -8063,7 +6996,7 @@ class eynollah: # print(type(image_page)) regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - regions_fully_only_drop = self.put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) + regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 K.clear_session() gc.collect() @@ -8071,7 +7004,7 @@ class eynollah: # plt.imshow(regions_fully[:,:,0]) # plt.show() - regions_fully = 
self.putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) + regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) # plt.imshow(regions_fully[:,:,0]) # plt.show() @@ -8274,14 +7207,14 @@ class eynollah: all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) # all_found_texline_polygons,boxes_text,txt_con_org,contours_only_text_parent,all_box_coord=self.get_slopes_and_deskew_new_curved(txt_con_org,contours_only_text_parent,textline_mask_tot_ea,image_page_rotated,boxes_text,text_only,num_col,scale_param) - all_found_texline_polygons = self.small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) # slopes=list(np.zeros(len(contours_only_text_parent))) all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) # all_found_texline_polygons,boxes_text,txt_con_org,contours_only_text_parent,all_box_coord=self.get_slopes_and_deskew_new_curved(txt_con_org,contours_only_text_parent,textline_mask_tot_ea,image_page_rotated,boxes_text,text_only,num_col,scale_param) - all_found_texline_polygons_marginals = self.small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, 
num_col_classifier) + all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] @@ -8310,13 +7243,13 @@ class eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = self.check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) else: contours_only_text_parent_d_ordered = None - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = self.check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, 
all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - ###text_regions_p,contours_only_text_parent,contours_only_text_parent_h,all_box_coord,all_box_coord_h,all_found_texline_polygons,all_found_texline_polygons_h=self.check_any_text_region_in_model_one_is_main_or_header(text_regions_p,regions_fully,contours_only_text_parent,all_box_coord,all_found_texline_polygons) + ###text_regions_p,contours_only_text_parent,contours_only_text_parent_h,all_box_coord,all_box_coord_h,all_found_texline_polygons,all_found_texline_polygons_h=check_any_text_region_in_model_one_is_main_or_header(text_regions_p,regions_fully,contours_only_text_parent,all_box_coord,all_found_texline_polygons) # text_regions_p=self.return_region_segmentation_after_implementing_not_head_maintext_parallel(text_regions_p,boxes) # if you want to save the layout result just uncommet following plot diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 500f3fe..f4a9530 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -488,3 +488,338 @@ def order_of_regions_old(textline_mask, contours_main): final_indexers_sorted.append(int(ind_in_int[j])) return final_indexers_sorted, matrix_of_orders + +def remove_headers_and_mains_intersection(seperators_closeup_n, img_revised_tab, boxes): + for ind in range(len(boxes)): + asp = np.zeros((img_revised_tab[:, :, 0].shape[0], seperators_closeup_n[:, :, 0].shape[1])) + asp[int(boxes[ind][2]) : int(boxes[ind][3]), int(boxes[ind][0]) : int(boxes[ind][1])] = img_revised_tab[int(boxes[ind][2]) : int(boxes[ind][3]), int(boxes[ind][0]) : int(boxes[ind][1]), 0] + + head_patch_con = (asp[:, :] == 2) * 
1 + main_patch_con = (asp[:, :] == 1) * 1 + # print(head_patch_con) + head_patch_con = head_patch_con.astype(np.uint8) + main_patch_con = main_patch_con.astype(np.uint8) + + head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) + main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) + + imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) + + imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) + + y_patch_head_min, y_patch_head_max, _ = find_features_of_contours(contours_head_patch_con) + y_patch_main_min, y_patch_main_max, _ = find_features_of_contours(contours_main_patch_con) + + for i in range(len(y_patch_head_min)): + for j in range(len(y_patch_main_min)): + if y_patch_head_max[i] > y_patch_main_min[j] and y_patch_head_min[i] < y_patch_main_min[j]: + y_down = y_patch_head_max[i] + y_up = y_patch_main_min[j] + + patch_intersection = np.zeros(asp.shape) + patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] + + head_patch_con = (patch_intersection[:, :] == 2) * 1 + main_patch_con = (patch_intersection[:, :] == 1) * 1 + head_patch_con = head_patch_con.astype(np.uint8) + main_patch_con = main_patch_con.astype(np.uint8) + + head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) + main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) + + imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_head_patch_con, hiearchy = cv2.findContours(thresh, 
cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) + + imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) + + _, _, areas_head = find_features_of_contours(contours_head_patch_con) + _, _, areas_main = find_features_of_contours(contours_main_patch_con) + + if np.sum(areas_head) > np.sum(areas_main): + img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 1] = 2 + else: + img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 2] = 1 + + elif y_patch_head_min[i] < y_patch_main_max[j] and y_patch_head_max[i] > y_patch_main_max[j]: + y_down = y_patch_main_max[j] + y_up = y_patch_head_min[i] + + patch_intersection = np.zeros(asp.shape) + patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] + + head_patch_con = (patch_intersection[:, :] == 2) * 1 + main_patch_con = (patch_intersection[:, :] == 1) * 1 + head_patch_con = head_patch_con.astype(np.uint8) + main_patch_con = main_patch_con.astype(np.uint8) + + head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) + main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) + + imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) + + imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + 
def tear_main_texts_on_the_boundaries_of_boxes(img_revised_tab, boxes):
    """Zero out label ``1`` in a 10-pixel strip left of every box's right edge.

    For each box the rows ``box[2]:box[3]`` and the columns
    ``box[1] - 10 : box[1]`` are visited in channels 0, 1 and 2, and every
    pixel equal to ``1`` in that strip is set to ``0``.
    NOTE(review): label 1 presumably marks main text (per the function
    name) -- confirm against the caller.

    Args:
        img_revised_tab: array indexed as ``[row, column, channel]`` with at
            least three channels. It is modified in place.
        boxes: sequence of boxes; only indices 1 (right edge), 2 (top) and
            3 (bottom) of each box are read.

    Returns:
        The same ``img_revised_tab`` object, after modification.
    """
    strip_width = 10
    for box in boxes:
        y_start, y_end = int(box[2]), int(box[3])
        x_end = int(box[1])
        # Clamp at 0: a negative start would be interpreted by numpy as an
        # offset from the right border, which silently yields an empty (or
        # wrong) strip for boxes whose right edge is closer than 10 px to
        # the left image border.
        x_start = max(int(box[1] - strip_width), 0)
        for channel in range(3):
            # Basic slicing returns a view, so the masked assignment below
            # writes straight into ``img_revised_tab``.
            strip = img_revised_tab[y_start:y_end, x_start:x_end, channel]
            strip[strip == 1] = 0
    return img_revised_tab
cv2.erode(seperators_closeup, kernel, iterations=1) + + seperators_closeup = cv2.erode(seperators_closeup, kernel, iterations=1) + seperators_closeup = cv2.dilate(seperators_closeup, kernel, iterations=1) + + if len(seperators_closeup.shape) == 2: + seperators_closeup_n = np.zeros((seperators_closeup.shape[0], seperators_closeup.shape[1], 3)) + seperators_closeup_n[:, :, 0] = seperators_closeup + seperators_closeup_n[:, :, 1] = seperators_closeup + seperators_closeup_n[:, :, 2] = seperators_closeup + else: + seperators_closeup_n = seperators_closeup[:, :, :] + # seperators_closeup=seperators_closeup.astype(np.uint8) + seperators_closeup_n = seperators_closeup_n.astype(np.uint8) + imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) + + dist_y = np.abs(y_max_main - y_min_main) + + slope_lines_org_hor = slope_lines_org[slope_lines == 0] + args = np.array(range(len(slope_lines))) + len_x = seperators_closeup.shape[1] * 0 + len_y = seperators_closeup.shape[0] * 0.01 + + args_hor = args[slope_lines == 0] + dist_x_hor = dist_x[slope_lines == 0] + dist_y_hor = dist_y[slope_lines == 0] + x_min_main_hor = x_min_main[slope_lines == 0] + x_max_main_hor = x_max_main[slope_lines == 0] + cy_main_hor = cy_main[slope_lines == 0] + y_min_main_hor = y_min_main[slope_lines == 0] + y_max_main_hor = y_max_main[slope_lines == 0] + + args_hor = args_hor[dist_x_hor >= len_x] + x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] + x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] + cy_main_hor = cy_main_hor[dist_x_hor >= len_x] + y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] + y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] + slope_lines_org_hor = slope_lines_org_hor[dist_x_hor 
>= len_x] + dist_y_hor = dist_y_hor[dist_x_hor >= len_x] + dist_x_hor = dist_x_hor[dist_x_hor >= len_x] + + args_ver = args[slope_lines == 1] + dist_y_ver = dist_y[slope_lines == 1] + dist_x_ver = dist_x[slope_lines == 1] + x_min_main_ver = x_min_main[slope_lines == 1] + x_max_main_ver = x_max_main[slope_lines == 1] + y_min_main_ver = y_min_main[slope_lines == 1] + y_max_main_ver = y_max_main[slope_lines == 1] + cx_main_ver = cx_main[slope_lines == 1] + + args_ver = args_ver[dist_y_ver >= len_y] + x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] + x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] + cx_main_ver = cx_main_ver[dist_y_ver >= len_y] + y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] + y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] + dist_x_ver = dist_x_ver[dist_y_ver >= len_y] + dist_y_ver = dist_y_ver[dist_y_ver >= len_y] + + img_p_in_ver = np.zeros(seperators_closeup_n[:, :, 2].shape) + for jv in range(len(args_ver)): + img_p_in_ver = cv2.fillPoly(img_p_in_ver, pts=[contours_lines[args_ver[jv]]], color=(1, 1, 1)) + + img_in_hor = np.zeros(seperators_closeup_n[:, :, 2].shape) + for jv in range(len(args_hor)): + img_p_in_hor = cv2.fillPoly(img_in_hor, pts=[contours_lines[args_hor[jv]]], color=(1, 1, 1)) + + all_args_uniq = contours_in_same_horizon(cy_main_hor) + # print(all_args_uniq,'all_args_uniq') + if len(all_args_uniq) > 0: + if type(all_args_uniq[0]) is list: + contours_new = [] + for dd in range(len(all_args_uniq)): + merged_all = None + some_args = args_hor[all_args_uniq[dd]] + some_cy = cy_main_hor[all_args_uniq[dd]] + some_x_min = x_min_main_hor[all_args_uniq[dd]] + some_x_max = x_max_main_hor[all_args_uniq[dd]] + + img_in = np.zeros(seperators_closeup_n[:, :, 2].shape) + for jv in range(len(some_args)): + + img_p_in = cv2.fillPoly(img_p_in_hor, pts=[contours_lines[some_args[jv]]], color=(1, 1, 1)) + img_p_in[int(np.mean(some_cy)) - 5 : int(np.mean(some_cy)) + 5, int(np.min(some_x_min)) : int(np.max(some_x_max))] = 1 + + else: 
+ img_p_in = seperators_closeup + else: + img_p_in = seperators_closeup + + sep_ver_hor = img_p_in + img_p_in_ver + sep_ver_hor_cross = (sep_ver_hor == 2) * 1 + + sep_ver_hor_cross = np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) + sep_ver_hor_cross = sep_ver_hor_cross.astype(np.uint8) + imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + contours_cross, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + cx_cross, cy_cross, _, _, _, _, _ = find_new_features_of_contoures(contours_cross) + + for ii in range(len(cx_cross)): + sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) + 5 : int(cx_cross[ii]) + 40] = 0 + sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) - 40 : int(cx_cross[ii]) - 4] = 0 + + img_p_in[:, :] = sep_ver_hor[:, :] + + if len(img_p_in.shape) == 2: + seperators_closeup_n = np.zeros((img_p_in.shape[0], img_p_in.shape[1], 3)) + seperators_closeup_n[:, :, 0] = img_p_in + seperators_closeup_n[:, :, 1] = img_p_in + seperators_closeup_n[:, :, 2] = img_p_in + else: + seperators_closeup_n = img_p_in[:, :, :] + # seperators_closeup=seperators_closeup.astype(np.uint8) + seperators_closeup_n = seperators_closeup_n.astype(np.uint8) + imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) + + dist_y = np.abs(y_max_main - y_min_main) + + slope_lines_org_hor = slope_lines_org[slope_lines == 0] + args = np.array(range(len(slope_lines))) + len_x = seperators_closeup.shape[1] * 0.04 + len_y = seperators_closeup.shape[0] * 0.08 + + args_hor = args[slope_lines == 0] + dist_x_hor = dist_x[slope_lines == 0] + dist_y_hor = 
dist_y[slope_lines == 0] + x_min_main_hor = x_min_main[slope_lines == 0] + x_max_main_hor = x_max_main[slope_lines == 0] + cy_main_hor = cy_main[slope_lines == 0] + y_min_main_hor = y_min_main[slope_lines == 0] + y_max_main_hor = y_max_main[slope_lines == 0] + + args_hor = args_hor[dist_x_hor >= len_x] + x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] + x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] + cy_main_hor = cy_main_hor[dist_x_hor >= len_x] + y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] + y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] + slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x] + dist_y_hor = dist_y_hor[dist_x_hor >= len_x] + dist_x_hor = dist_x_hor[dist_x_hor >= len_x] + + args_ver = args[slope_lines == 1] + dist_y_ver = dist_y[slope_lines == 1] + dist_x_ver = dist_x[slope_lines == 1] + x_min_main_ver = x_min_main[slope_lines == 1] + x_max_main_ver = x_max_main[slope_lines == 1] + y_min_main_ver = y_min_main[slope_lines == 1] + y_max_main_ver = y_max_main[slope_lines == 1] + cx_main_ver = cx_main[slope_lines == 1] + + args_ver = args_ver[dist_y_ver >= len_y] + x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] + x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] + cx_main_ver = cx_main_ver[dist_y_ver >= len_y] + y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] + y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] + dist_x_ver = dist_x_ver[dist_y_ver >= len_y] + dist_y_ver = dist_y_ver[dist_y_ver >= len_y] + + matrix_of_lines_ch = np.zeros((len(cy_main_hor) + len(cx_main_ver), 10)) + + matrix_of_lines_ch[: len(cy_main_hor), 0] = args_hor + matrix_of_lines_ch[len(cy_main_hor) :, 0] = args_ver + + matrix_of_lines_ch[len(cy_main_hor) :, 1] = cx_main_ver + + matrix_of_lines_ch[: len(cy_main_hor), 2] = x_min_main_hor + matrix_of_lines_ch[len(cy_main_hor) :, 2] = x_min_main_ver + + matrix_of_lines_ch[: len(cy_main_hor), 3] = x_max_main_hor + matrix_of_lines_ch[len(cy_main_hor) :, 3] = x_max_main_ver + + 
def image_change_background_pixels_to_zero(self, image_page):
    """Return channel 0 of ``image_page`` with the values 0 and 255 swapped.

    Args:
        self: unused by the body; kept so the signature stays unchanged.
        image_page: array indexed as ``[row, column, channel]``; only
            channel 0 is read and the input is not modified.

    Returns:
        A new 2-D float array of shape ``image_page.shape[:2]`` in which
        pixels that were 0 become 255, pixels that were 255 become 0, and
        every other value is copied unchanged.
    """
    swapped = np.zeros((image_page.shape[0], image_page.shape[1]))
    swapped[...] = image_page[:, :, 0]

    # Both masks are taken before any write so the two replacements cannot
    # interfere with each other. -255 is included because it also ends up
    # as 255 (it was historically used as the intermediate marker value).
    becomes_white = (swapped == 0) | (swapped == -255)
    becomes_black = swapped == 255

    swapped[becomes_white] = 255
    swapped[becomes_black] = 0
    return swapped
def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(layout_in_patch):
    """Replace every label-4 region by its filled bounding rectangle.

    Contours of the pixels whose channel 0 equals ``4`` are extracted via
    ``return_contours_of_image`` / ``return_parent_contours``; contours
    whose area is at most 0.00001 of the image area are ignored, and for
    each remaining contour the whole bounding rectangle is written as ``4``
    into channel 0.
    NOTE(review): label 4 presumably marks drop capitals (per the function
    name) -- confirm against the model's label map.

    Args:
        layout_in_patch: array indexed as ``[row, column, channel]``. It is
            modified in place.

    Returns:
        The same ``layout_in_patch`` object, after modification.
    """
    drop_only = (layout_in_patch[:, :, 0] == 4) * 1
    contours_drop, hir_on_drop = return_contours_of_image(drop_only)
    contours_drop_parent = return_parent_contours(contours_drop, hir_on_drop)

    # Contour areas as a fraction of the whole image.
    image_area = float(drop_only.shape[0] * drop_only.shape[1])
    relative_areas = np.array([cv2.contourArea(contour) for contour in contours_drop_parent]) / image_area

    for contour, relative_area in zip(contours_drop_parent, relative_areas):
        if relative_area > 0.00001:
            x, y, w, h = cv2.boundingRect(contour)
            layout_in_patch[y : y + h, x : x + w, 0] = 4

    return layout_in_patch
= return_contours_of_image(text_only) + + """ + contours_only_text_parent=return_parent_contours( contours_only_text,hir_on_text) + + areas_cnt_text=np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) + areas_cnt_text=areas_cnt_text/float(text_only.shape[0]*text_only.shape[1]) + + ###areas_cnt_text_h=np.array([cv2.contourArea(contours_only_text_parent_h[j]) for j in range(len(contours_only_text_parent_h))]) + ###areas_cnt_text_h=areas_cnt_text_h/float(text_only_h.shape[0]*text_only_h.shape[1]) + + ###contours_only_text_parent=[contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz]>0.0002] + contours_only_text_parent=[contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz]>0.00001] + """ + + cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = find_new_features_of_contoures(contours_only_text_parent) + + length_con = x_max_main - x_min_main + height_con = y_max_main - y_min_main + + all_found_texline_polygons_main = [] + all_found_texline_polygons_head = [] + + all_box_coord_main = [] + all_box_coord_head = [] + + slopes_main = [] + slopes_head = [] + + contours_only_text_parent_main = [] + contours_only_text_parent_head = [] + + contours_only_text_parent_main_d = [] + contours_only_text_parent_head_d = [] + + for ii in range(len(contours_only_text_parent)): + con = contours_only_text_parent[ii] + img = np.zeros((regions_model_1.shape[0], regions_model_1.shape[1], 3)) + img = cv2.fillPoly(img, pts=[con], color=(255, 255, 255)) + + all_pixels = ((img[:, :, 0] == 255) * 1).sum() + + pixels_header = (((img[:, :, 0] == 255) & (regions_model_full[:, :, 0] == 2)) * 1).sum() + pixels_main = all_pixels - pixels_header + + if (pixels_header >= pixels_main) and ((length_con[ii] / float(height_con[ii])) >= 1.3): + regions_model_1[:, :][(regions_model_1[:, :] == 1) & (img[:, :, 0] == 255)] = 2 + 
def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col):
    """Merge small textline polygons into the big textline they overlap most.

    For every region (one entry of ``textlines_con``) the textlines are split
    into "small" and "big" by their area relative to the image area. Each
    small textline is assigned to the big textline with which it shares the
    most pixels; a big textline that received at least one small textline is
    replaced by the first contour found on the union of the filled polygons.
    Small textlines that overlap no big textline are not part of the result.

    Args:
        textlines_con: sequence of regions, each a sequence of textline
            contours (anything ``np.array(..., dtype=np.int32)`` accepts).
        textline_iamge: array whose first two dimensions give the canvas
            size; its pixel values are never read.
        num_col: selects the relative-area threshold below which a textline
            counts as small (0 -> 0.0004, 1 -> 0.0003, otherwise 0.0001).
            NOTE(review): presumably the detected number of columns --
            confirm against the caller.

    Returns:
        A list with one entry per region: the big textlines in their original
        form and order, with merged ones replaced by the new contour.
    """
    height, width = textline_iamge.shape[0], textline_iamge.shape[1]
    image_area = float(height * width)

    if num_col == 0:
        min_area = 0.0004
    elif num_col == 1:
        min_area = 0.0003
    else:
        min_area = 0.0001

    def filled_mask(polygons):
        # Rasterise ``polygons`` with value 1 on an empty canvas.
        canvas = np.zeros((height, width))
        return cv2.fillPoly(canvas, pts=polygons, color=(1, 1, 1))

    textlines_con_changed = []
    for region_textlines in textlines_con:
        textlines_tot = [np.array(textline, dtype=np.int32) for textline in region_textlines]
        relative_areas = np.array([cv2.contourArea(textline) for textline in textlines_tot]) / image_area
        is_small = relative_areas < min_area

        # Split by mask so that the order of the big textlines is the input
        # order by construction rather than by set iteration order.
        textlines_small = [line for line, small in zip(textlines_tot, is_small) if small]
        textlines_big = [line for line, small in zip(textlines_tot, is_small) if not small]
        textlines_big_org_form = [line for line, small in zip(region_textlines, is_small) if not small]

        # Cheap global test first: do small and big textlines touch at all?
        overlap_all = ((filled_mask(textlines_small) + filled_mask(textlines_big)) == 2).sum()

        if overlap_all > 0:
            # For each small textline: index of the best matching big
            # textline, or -1 when it overlaps none of them.
            best_big_for_small = []
            for small_line in textlines_small:
                small_mask = filled_mask([small_line])
                intersections = [((filled_mask([big_line]) + small_mask) == 2).sum() for big_line in textlines_big]
                if any(count > 0 for count in intersections):
                    best_big_for_small.append(np.argmax(intersections))
                else:
                    # Explicit "no partner" marker; previously this case was
                    # detected by letting np.argmax fail on an empty list
                    # inside a bare ``except``, which also hid real errors.
                    best_big_for_small.append(-1)

            best_big_for_small = np.array(best_big_for_small)
            for big_index in np.unique(best_big_for_small[best_big_for_small >= 0]):
                merged = np.zeros((height, width, 3))
                merged = cv2.fillPoly(merged, pts=[textlines_big[big_index]], color=(255, 255, 255))
                for small_index in np.where(best_big_for_small == big_index)[0]:
                    merged = cv2.fillPoly(merged, pts=[textlines_small[small_index]], color=(255, 255, 255))

                merged = merged.astype(np.uint8)
                imgray = cv2.cvtColor(merged, cv2.COLOR_BGR2GRAY)
                _, thresh = cv2.threshold(imgray, 0, 255, 0)
                cont, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

                # Only the first contour of the union is kept.
                textlines_big_org_form[big_index] = cont[0]

        textlines_con_changed.append(textlines_big_org_form)
    return textlines_con_changed
def order_and_id_of_texts(found_polygons_text_region, found_polygons_text_region_h, matrix_of_orders, indexes_sorted, index_of_types, kind_of_texts, ref_point):
    """Build the reading-order list and the ``r<N>`` id list for text regions.

    ``indexes_sorted``, ``index_of_types`` and ``kind_of_texts`` are parallel
    sequences; entries with kind ``1`` are used for
    ``found_polygons_text_region`` and entries with kind ``2`` for
    ``found_polygons_text_region_h``.
    NOTE(review): presumably kind 1 = main text and kind 2 = header (matching
    the ``_h`` suffix) -- confirm against the caller.

    Args:
        found_polygons_text_region: only its length is used.
        found_polygons_text_region_h: only its length is used.
        matrix_of_orders: not used by this function.
        indexes_sorted: sequence of order values.
        index_of_types: sequence of per-kind indices.
        kind_of_texts: sequence of kind markers (1 or 2).
        ref_point: number the id counter starts from.

    Returns:
        ``(order_of_texts, id_of_texts)``. An id is emitted for every polygon,
        but for kind-1 polygons the counter only advances (and an order entry
        is only added) when a matching order value exists, so ids can repeat.
    """
    sorted_values = np.array(indexes_sorted)
    type_indices = np.array(index_of_types)
    kinds = np.array(kind_of_texts)

    kind_one = kinds == 1
    kind_two = kinds == 2

    type_indices_one = type_indices[kind_one]
    sorted_values_one = sorted_values[kind_one]
    type_indices_two = type_indices[kind_two]
    sorted_values_two = sorted_values[kind_two]

    order_of_texts = []
    id_of_texts = []
    running_id = 0 + ref_point

    for position, _ in enumerate(found_polygons_text_region):
        id_of_texts.append(f"r{running_id}")
        # Order values equal to this polygon's per-kind index.
        matches = sorted_values_one[sorted_values_one == type_indices_one[position]]
        if len(matches) == 0:
            continue
        order_of_texts.append(matches[0])
        running_id += 1

    for position, _ in enumerate(found_polygons_text_region_h):
        id_of_texts.append(f"r{running_id}")
        # Here the per-kind index is used as a position into the order values.
        order_of_texts.append(sorted_values_two[type_indices_two[position]])
        running_id += 1

    return order_of_texts, id_of_texts
range(len(M_main))] + x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + + if len(contours_header) != None: + areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) + M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] + cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] + cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] + + x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) + x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) + + y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) + y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) + # print(cy_main,'mainy') + + peaks_neg_new = [] + + peaks_neg_new.append(0 + y_ref) + for iii in range(len(peaks_neg)): + peaks_neg_new.append(peaks_neg[iii] + y_ref) + + peaks_neg_new.append(textline_mask.shape[0] + y_ref) + + if len(cy_main) > 0 and np.max(cy_main) > np.max(peaks_neg_new): + cy_main = np.array(cy_main) * (np.max(peaks_neg_new) / np.max(cy_main)) - 10 + + if contours_main != None: + indexer_main = np.array(range(len(contours_main))) + + if contours_main != None: + len_main = len(contours_main) + else: + len_main = 0 + + matrix_of_orders = np.zeros((len(contours_main) + len(contours_header), 5)) + + matrix_of_orders[:, 0] = np.array(range(len(contours_main) + len(contours_header))) + + matrix_of_orders[: len(contours_main), 1] = 1 + 
matrix_of_orders[len(contours_main) :, 1] = 2 + + matrix_of_orders[: len(contours_main), 2] = cx_main + matrix_of_orders[len(contours_main) :, 2] = cx_header + + matrix_of_orders[: len(contours_main), 3] = cy_main + matrix_of_orders[len(contours_main) :, 3] = cy_header + + matrix_of_orders[: len(contours_main), 4] = np.array(range(len(contours_main))) + matrix_of_orders[len(contours_main) :, 4] = np.array(range(len(contours_header))) + + # print(peaks_neg_new,'peaks_neg_new') + + # print(matrix_of_orders,'matrix_of_orders') + # print(peaks_neg_new,np.max(peaks_neg_new)) + final_indexers_sorted = [] + final_types = [] + final_index_type = [] + for i in range(len(peaks_neg_new) - 1): + top = peaks_neg_new[i] + down = peaks_neg_new[i + 1] + + # print(top,down,'topdown') + + indexes_in = matrix_of_orders[:, 0][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] + cxs_in = matrix_of_orders[:, 2][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] + cys_in = matrix_of_orders[:, 3][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] + types_of_text = matrix_of_orders[:, 1][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] + index_types_of_text = matrix_of_orders[:, 4][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] + + # print(top,down) + # print(cys_in,'cyyyins') + # print(indexes_in,'indexes') + sorted_inside = np.argsort(cxs_in) + + ind_in_int = indexes_in[sorted_inside] + ind_in_type = types_of_text[sorted_inside] + ind_ind_type = index_types_of_text[sorted_inside] + + for j in range(len(ind_in_int)): + final_indexers_sorted.append(int(ind_in_int[j])) + final_types.append(int(ind_in_type[j])) + final_index_type.append(int(ind_ind_type[j])) + + ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] + + # print(peaks_neg_new,'peaks') + # print(final_indexers_sorted,'indexsorted') + # print(final_types,'types') + # print(final_index_type,'final_index_type') + + return 
final_indexers_sorted, matrix_of_orders, final_types, final_index_type  # tail of the preceding function's `return` (that function starts above this chunk); kept verbatim


def _contours_of_mask(mask):
    """Return every contour (``cv2.RETR_TREE``) around the non-zero pixels of a 2-D mask."""
    binary = np.where(np.asarray(mask) > 0, 255, 0).astype(np.uint8)
    # ``[-2]`` selects the contour list whether findContours returns
    # (image, contours, hierarchy) as in OpenCV 3 or (contours, hierarchy) as in OpenCV 4.
    return cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]


def _label_contour_stats(text_regions, label):
    """Collect (contours, areas, centroid_y, y_min, y_max) for the pixels equal to ``label``."""
    contours = _contours_of_mask(text_regions == label)
    areas = np.array([cv2.contourArea(cnt) for cnt in contours])
    moments = [cv2.moments(cnt) for cnt in contours]
    # The tiny epsilon keeps degenerate (zero-area) contours from dividing by zero.
    cy = np.array([m["m01"] / (m["m00"] + 1e-32) for m in moments])
    y_min = np.array([np.min(cnt[:, 0, 1]) for cnt in contours])
    y_max = np.array([np.max(cnt[:, 0, 1]) for cnt in contours])
    return contours, areas, cy, y_min, y_max


def _relabel_smaller_class_in_band(text_regions, main, header, y_top, y_bottom, guard_on_header):
    """Inside the band ``y_top < centroid_y < y_bottom`` repaint the class with the smaller total area."""
    contours_main, areas_main, cy_main, _, _ = main
    contours_header, areas_header, cy_header, _, _ = header

    main_idx = np.flatnonzero((cy_main > y_top) & (cy_main < y_bottom))
    header_idx = np.flatnonzero((cy_header > y_top) & (cy_header < y_bottom))

    # Nothing to resolve unless the *other* class is present in this band.
    if len(header_idx if guard_on_header else main_idx) == 0:
        return text_regions

    if np.sum(areas_main[main_idx]) > np.sum(areas_header[header_idx]):
        losers, label = [contours_header[j] for j in header_idx], 1
    else:
        # Ties also land here: main contours are repainted as header.
        losers, label = [contours_main[j] for j in main_idx], 2

    if losers:  # never hand cv2.fillPoly an empty polygon list
        text_regions = cv2.fillPoly(text_regions, pts=losers, color=(label, label, label))
    return text_regions


def implent_law_head_main_not_parallel(text_regions):
    """Make main text (label 1) and header (label 2) mutually exclusive per horizontal band.

    For every main-text contour and then every header contour, the band
    spanned by that contour's vertical extent is inspected.  When contours
    of the other class have their centroid inside the band, the class with
    the smaller summed contour area in the band is repainted with the other
    class's label (on a tie, main text becomes header).

    Contours are extracted once, before any repainting, so later bands are
    judged on the original layout.

    Args:
        text_regions: 2-D label image; presumably an integer dtype that
            ``cv2.fillPoly`` accepts (not verifiable from this chunk).

    Returns:
        The label image returned by ``cv2.fillPoly``; the input array is
        drawn into rather than copied.
    """
    main = _label_contour_stats(text_regions, 1)
    header = _label_contour_stats(text_regions, 2)

    for y_top, y_bottom in zip(main[3], main[4]):
        text_regions = _relabel_smaller_class_in_band(text_regions, main, header, y_top, y_bottom, guard_on_header=True)

    for y_top, y_bottom in zip(header[3], header[4]):
        text_regions = _relabel_smaller_class_in_band(text_regions, main, header, y_top, y_bottom, guard_on_header=False)

    return text_regions


def return_hor_spliter_by_index(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some):
    """Map horizontal lines onto column boundaries and drop the boundaries they span.

    Lines are processed in order of increasing ``x_min``.  For each line the
    first boundary lying less than 360 units to the left of its start and the
    first boundary lying less than 360 units to the right of (or at) its end
    are located; every boundary strictly between those two is marked for
    deletion.

    Args:
        peaks_neg_fin_t: boundary positions (list or 1-D array).
        x_min_hor_some: start x of each horizontal line.
        x_max_hor_some: end x of each horizontal line, same order.

    Returns:
        A 5-tuple ``(indexer_lines, peaks_true, arg_min_hor_sort,
        indexer_lines_deletions_len, indexr_uniq_ind)`` where the per-line
        lists follow the sorted order and hold ``-999`` where no boundary
        pair was found; ``peaks_true`` is ``peaks_neg_fin_t`` without the
        deleted boundaries; ``arg_min_hor_sort`` is the sorting permutation.
    """
    peaks = np.asarray(peaks_neg_fin_t)
    # asarray lets callers pass plain lists as well as arrays.
    x_min_hor_some = np.asarray(x_min_hor_some)
    x_max_hor_some = np.asarray(x_max_hor_some)

    arg_min_hor_sort = np.argsort(x_min_hor_some)
    x_min_sorted = x_min_hor_some[arg_min_hor_sort]
    x_max_sorted = x_max_hor_some[arg_min_hor_sort]

    peak_positions = np.arange(len(peaks))
    indexer_lines = []
    indexes_to_delete = []
    indexer_lines_deletions_len = []
    indexr_uniq_ind = []

    for x_start, x_end in zip(x_min_sorted, x_max_sorted):
        min_h = peaks - x_start
        max_h = peaks - x_end

        min_h_neg = peak_positions[(min_h < 0) & (np.abs(min_h) < 360)]
        max_h_neg = peak_positions[(max_h >= 0) & (np.abs(max_h) < 360)]

        if len(min_h_neg) == 0 or len(max_h_neg) == 0:
            indexer_lines.append(-999)
            indexer_lines_deletions_len.append(-999)
            indexr_uniq_ind.append(-999)
            continue

        first_left = min_h_neg[0]
        deletions = list(range(first_left + 1, max_h_neg[0]))
        unique_delets_int = []

        if deletions:
            indexes_to_delete.extend(deletions)
            # Boundaries already deleted to the left shift this line's index.
            unique_delets = np.unique(indexes_to_delete)
            unique_delets_int = unique_delets[unique_delets < first_left]
            indexer_lines_deletions_len.append(len(deletions))
            indexr_uniq_ind.append([deletions])  # nested list kept for existing callers
        else:
            indexer_lines_deletions_len.append(0)
            indexr_uniq_ind.append(-999)

        index_line_true = first_left - len(unique_delets_int)
        if not (index_line_true > 0 and first_left >= 2):
            index_line_true = first_left
        indexer_lines.append(index_line_true)

        indexes_to_delete.extend(unique_delets_int)

    removed = set(indexes_to_delete)
    peaks_true = [peak for m, peak in enumerate(peaks_neg_fin_t) if m not in removed]
    return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind


def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(img_p_in_ver, img_in_hor):
    """Merge collinear horizontal separators and cut them where they cross vertical ones.

    Steps, in order:
      1. trim both ends of every vertical separator contour;
      2. group horizontal separator contours via ``contours_in_same_horizon``
         and, per group, fill the contours and draw a 10-row band joining them;
      3. record a group's mean y as a "special" separator when the group
         covers more than 85% of the image width with evenly sized pieces;
      4. blank the horizontal image to the left and right of every pixel
         cluster where horizontal and vertical separators overlap.

    Args:
        img_p_in_ver: 2-D mask of vertical separators.
        img_in_hor: 2-D mask of horizontal separators.

    Returns:
        ``(horizontal_mask, special_seperators)``: the first channel of the
        processed horizontal image and the list of mean y positions from step 3.
    """
    img_p_in_ver = np.repeat(img_p_in_ver.astype(np.uint8)[:, :, np.newaxis], 3, axis=2)
    contours_lines_ver = _contours_of_mask(img_p_in_ver[:, :, 0])
    _, _, _, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = find_features_of_lines(contours_lines_ver)

    for y_min, y_max, cx in zip(y_min_main_ver, y_max_main_ver, cx_main_ver):
        y_min, y_max, cx = int(y_min), int(y_max), int(cx)
        # Clamp at 0: a negative slice start would wrap to the far edge
        # and the trim would silently miss its target.
        x_lo, x_hi = max(cx - 25, 0), cx + 25
        img_p_in_ver[y_min : y_min + 30, x_lo:x_hi, 0] = 0
        img_p_in_ver[max(y_max - 30, 0) : y_max, x_lo:x_hi, 0] = 0

    img_in_hor = np.repeat(img_in_hor.astype(np.uint8)[:, :, np.newaxis], 3, axis=2)
    contours_lines_hor = _contours_of_mask(img_in_hor[:, :, 0])
    slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = find_features_of_lines(contours_lines_hor)

    args_hor = np.arange(len(slope_lines_hor))
    all_args_uniq = contours_in_same_horizon(cy_main_hor)

    # Defaults cover "no groups" and guarantee img_p_in is always bound.
    img_p_in = img_in_hor
    special_seperators = []

    if len(all_args_uniq) > 0 and isinstance(all_args_uniq[0], list):
        page_width = float(img_p_in_ver.shape[1])
        for group in all_args_uniq:
            some_args = args_hor[group]
            if len(some_args) == 0:
                continue
            some_cy = cy_main_hor[group]
            some_x_min = x_min_main_hor[group]
            some_x_max = x_max_main_hor[group]

            band_cy = int(np.mean(some_cy))
            band_left = int(np.min(some_x_min))
            band_right = int(np.max(some_x_max))

            for arg in some_args:
                img_p_in = cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[arg]], color=(1, 1, 1))
                img_p_in[max(band_cy - 5, 0) : band_cy + 5, band_left:band_right] = 1

            group_lengths = dist_x_hor[some_args]
            sum_dis = group_lengths.sum()
            diff_max_min_uniques = np.max(x_max_main_hor[some_args]) - np.min(x_min_main_hor[some_args])

            if (
                diff_max_min_uniques > sum_dis
                and (sum_dis / float(diff_max_min_uniques)) > 0.85
                and (diff_max_min_uniques / page_width) > 0.85
                and np.std(group_lengths) < (0.55 * np.mean(group_lengths))
            ):
                special_seperators.append(np.mean(cy_main_hor[some_args]))

    img_p_in_ver[:, :, 0][img_p_in_ver[:, :, 0] == 255] = 1

    # With both masks at 0/1, a sum of 2 marks pixels covered by both kinds of separator.
    sep_ver_hor = img_p_in + img_p_in_ver
    contours_cross = _contours_of_mask(sep_ver_hor[:, :, 0] == 2)
    cx_cross, cy_cross, _, _, _, _, _ = find_new_features_of_contoures(contours_cross)

    for cx, cy in zip(cx_cross, cy_cross):
        cx, cy = int(cx), int(cy)
        y_lo, y_hi = max(cy - 30, 0), cy + 30
        img_p_in[y_lo:y_hi, cx + 5 : cx + 40, 0] = 0
        # Both bounds clamped: near the left edge the unclamped slice
        # (e.g. -38:-2) would erase pixels at the right edge instead.
        img_p_in[y_lo:y_hi, max(cx - 40, 0) : max(cx - 4, 0), 0] = 0

    return img_p_in[:, :, 0], special_seperators


def return_points_with_boundies(peaks_neg_fin, first_point, last_point):
    """Return ``peaks_neg_fin`` as a new list enclosed by ``first_point`` and ``last_point``."""
    return [first_point, *peaks_neg_fin, last_point]