From e60b0e5911e498046fbbae779492bd04ff5d50e8 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 12 Nov 2025 18:24:50 +0100 Subject: [PATCH] =?UTF-8?q?Revert=20to=20older=20deskew=20slope=20calculat?= =?UTF-8?q?ion=20=E2=80=94=20pairing=20between=20skewed=20and=20original?= =?UTF-8?q?=20contours=20was=20incorrect,=20so=20the=20original=20pairing?= =?UTF-8?q?=20logic=20has=20been=20restored.=20Also=20restored=20some=20or?= =?UTF-8?q?iginal=20functions=20to=20ensure=20correct=20reading=20order=20?= =?UTF-8?q?detection.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/eynollah.py | 354 +++--- src/eynollah/utils/__init__.py | 1532 ++++++++++++-------------- src/eynollah/utils/separate_lines.py | 117 +- 3 files changed, 941 insertions(+), 1062 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index eee3777..99d18dd 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2482,119 +2482,150 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions") - contours_only_text_parent = np.array(contours_only_text_parent) - contours_only_text_parent_h = np.array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side - c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), - 0.5 * boxes[:, 0:2].sum(axis=1))) - cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours( + cx_text_only, cy_text_only, x_min_text_only, x_max_text_only, _, _, y_cor_x_min_main = find_new_features_of_contours( contours_only_text_parent) - cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( + cx_text_only_h, cy_text_only_h, x_min_text_only_h, x_max_text_only_h, _, _, y_cor_x_min_main_h = find_new_features_of_contours( contours_only_text_parent_h) - + def match_boxes(only_centers: bool): - arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) - for ii in range(len(contours_only_text_parent)): + arg_text_con = [] + for ii in range(len(cx_text_only)): check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if ((cx_main[ii] >= box[0] and - cx_main[ii] < box[1] and - cy_main[ii] >= box[2] and - cy_main[ii] < box[3]) if only_centers else - (mx_main[ii] >= box[0] and - Mx_main[ii] < box[1] and - my_main[ii] >= box[2] and - My_main[ii] < box[3])): - arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) - break + for jj in range(len(boxes)): + if self.right2left: + if ((x_max_text_only[ii] - 80 >= boxes[jj][0] and + x_max_text_only[ii] - 80 < boxes[jj][1] and + y_cor_x_min_main[ii] >= boxes[jj][2] and + y_cor_x_min_main[ii] < boxes[jj][3]) if only_centers else + (cx_text_only[ii] >= boxes[jj][0] and + cx_text_only[ii] < boxes[jj][1] and + cy_text_only[ii] >= boxes[jj][2] and + cy_text_only[ii] < boxes[jj][3])): + arg_text_con.append(jj) + check_if_textregion_located_in_a_box = True + break + else: + if ((x_min_text_only[ii] + 80 >= boxes[jj][0] and + x_min_text_only[ii] + 80 < boxes[jj][1] and + y_cor_x_min_main[ii] >= boxes[jj][2] and + y_cor_x_min_main[ii] < boxes[jj][3]) if only_centers else + (cx_text_only[ii] >= boxes[jj][0] and + cx_text_only[ii] < boxes[jj][1] and + cy_text_only[ii] >= boxes[jj][2] and + cy_text_only[ii] < boxes[jj][3])): + arg_text_con.append(jj) + check_if_textregion_located_in_a_box = True + break if not check_if_textregion_located_in_a_box: - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_main[ii] = ind_min - #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) - args_contours_main = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros_like(arg_text_con_main) + dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + + (cy_text_only[ii] - boxes[jj][2]) ** 2) + for jj in range(len(boxes))] + ind_min = np.argmin(dists_tr_from_box) + arg_text_con.append(ind_min) + args_contours = np.array(range(len(arg_text_con))) + arg_text_con_h = [] + for ii in range(len(cx_text_only_h)): + check_if_textregion_located_in_a_box = False + for jj in range(len(boxes)): + if self.right2left: + if ((x_max_text_only_h[ii] - 80 >= boxes[jj][0] and + x_max_text_only_h[ii] - 80 < boxes[jj][1] and + y_cor_x_min_main_h[ii] >= boxes[jj][2] and + y_cor_x_min_main_h[ii] < boxes[jj][3]) if only_centers else + (cx_text_only_h[ii] >= boxes[jj][0] and + cx_text_only_h[ii] < boxes[jj][1] and + cy_text_only_h[ii] >= boxes[jj][2] and + cy_text_only_h[ii] < boxes[jj][3])): + arg_text_con_h.append(jj) + check_if_textregion_located_in_a_box = True + break + else: + if ((x_min_text_only_h[ii] + 80 >= boxes[jj][0] and + x_min_text_only_h[ii] + 80 < boxes[jj][1] and + y_cor_x_min_main_h[ii] >= boxes[jj][2] and + y_cor_x_min_main_h[ii] < boxes[jj][3]) if only_centers else + (cx_text_only_h[ii] >= boxes[jj][0] and + cx_text_only_h[ii] < boxes[jj][1] and + cy_text_only_h[ii] >= boxes[jj][2] and + cy_text_only_h[ii] < boxes[jj][3])): + arg_text_con_h.append(jj) + check_if_textregion_located_in_a_box = True + break + if not check_if_textregion_located_in_a_box: + dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + + (cy_text_only_h[ii] - boxes[jj][2]) ** 2) + for jj in range(len(boxes))] + ind_min = np.argmin(dists_tr_from_box) + arg_text_con_h.append(ind_min) + args_contours_h = np.array(range(len(arg_text_con_h))) - arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) - for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False - for jj, box in enumerate(boxes): - if ((cx_head[ii] >= box[0] and - cx_head[ii] < box[1] and - cy_head[ii] >= box[2] and - cy_head[ii] < box[3]) if only_centers else - (mx_head[ii] >= box[0] and - Mx_head[ii] < box[1] and - my_head[ii] >= box[2] and - My_head[ii] < box[3])): - arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) - break - if not check_if_textregion_located_in_a_box: - dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) - pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & - (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) - ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_head[ii] = ind_min - #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) - args_contours_head = np.arange(len(contours_only_text_parent_h)) - order_by_con_head = np.zeros_like(arg_text_con_head) + order_by_con_head = np.zeros(len(arg_text_con_h)) + order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij, box in enumerate(boxes): - ys = slice(*box[2:4]) - xs = slice(*box[0:2]) - args_contours_box_main = args_contours_main[arg_text_con_main == iij] - args_contours_box_head = args_contours_head[arg_text_con_head == iij] - con_inter_box = contours_only_text_parent[args_contours_box_main] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] + for iij in range(len(boxes)): + ys = slice(*boxes[iij][2:4]) + xs = slice(*boxes[iij][0:2]) + args_contours_box = args_contours[np.array(arg_text_con) == iij] + args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] + con_inter_box = [] + con_inter_box_h = [] + + for box in args_contours_box: + con_inter_box.append(contours_only_text_parent[box]) + + for box in args_contours_box_h: + con_inter_box_h.append(contours_only_text_parent_h[box]) indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, boxes[iij][2]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] + indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - for zahler, _ in enumerate(args_contours_box_main): + for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + np.where(indexes_sorted == arg_order_v)[0][0] + ref_point - for zahler, _ in enumerate(args_contours_box_head): + for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + np.where(indexes_sorted == arg_order_v)[0][0] + ref_point for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) + order_of_texts_tot = [] + for tj1 in range(len(contours_only_text_parent)): + order_of_texts_tot.append(int(order_by_con_main[tj1])) + + for tj1 in range(len(contours_only_text_parent_h)): + order_of_texts_tot.append(int(order_by_con_head[tj1])) + + order_text_new = [] + for iii in range(len(order_of_texts_tot)): + order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) + return order_text_new, id_of_texts_tot try: - results = match_boxes(False) + results = match_boxes(True) except Exception as why: self.logger.exception(why) - results = match_boxes(True) - - self.logger.debug("exit do_order_of_regions") + results = match_boxes(False) + self.logger.debug("exit do_order_of_regions_full_layout") return results def check_iou_of_bounding_box_and_contour_for_tables( @@ -3088,7 +3119,7 @@ class Eynollah: def run_deskew(self, textline_mask_tot_ea): #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, - map=self.executor.map, logger=self.logger, plotter=self.plotter) + logger=self.logger, plotter=self.plotter) if self.plotter: self.plotter.save_deskewed_image(slope_deskew) self.logger.info("slope_deskew: %.2f°", slope_deskew) @@ -4419,130 +4450,89 @@ class Eynollah: ###min_con_area = 0.000005 contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - contours_only_text_parent_d_ordered = [] - contours_only_text_parent_d = [] if len(contours_only_text_parent) > 0: - areas_tot_text = np.prod(text_only.shape) areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) - areas_cnt_text = areas_cnt_text / float(areas_tot_text) - #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] - areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] - + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) + if areas_cnt_text[jz] > MIN_AREA_REGION] + areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = contours_only_text_parent[index_con_parents] - areas_cnt_text_parent = areas_cnt_text_parent[index_con_parents] - centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + contours_only_text_parent = self.return_list_of_contours_with_desired_order( + contours_only_text_parent, index_con_parents) - center0 = centers[:, -1:] # [2, 1] + areas_cnt_text_parent = self.return_list_of_contours_with_desired_order( + areas_cnt_text_parent, index_con_parents) + + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - areas_tot_text_d = np.prod(text_only_d.shape) areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) - areas_cnt_text_d = areas_cnt_text_d / float(areas_tot_text_d) + areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] - areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] - - if len(contours_only_text_parent_d): + if len(areas_cnt_text_d)>0: + contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] - areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] + contours_only_text_parent_d = self.return_list_of_contours_with_desired_order( + contours_only_text_parent_d, index_con_parents_d) + + areas_cnt_text_d = self.return_list_of_contours_with_desired_order( + areas_cnt_text_d, index_con_parents_d) - centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_d) + try: + if len(cx_bigest_d) >= 5: + cx_bigest_d_last5 = cx_bigest_d[-5:] + cy_biggest_d_last5 = cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) + for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + else: + cx_bigest_d_last5 = cx_bigest_d[-len(cx_bigest_d):] + cy_biggest_d_last5 = cy_biggest_d[-len(cx_bigest_d):] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) + for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) - len(cx_bigest_d) + np.argmin(dists_d) - center0_d = centers_d[:, -1:].copy() # [2, 1] + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] + cy_biggest_d_big[0] = cy_biggest_d[ind_largest] + except Exception as why: + self.logger.error(str(why)) - # find the largest among the largest 5 deskewed contours - # that is also closest to the largest original contour - last5_centers_d = centers_d[:, -5:] - dists_d = np.linalg.norm(center0 - last5_centers_d, axis=0) - ind_largest = len(contours_only_text_parent_d) - last5_centers_d.shape[1] + np.argmin(dists_d) - center0_d[:, 0] = centers_d[:, ind_largest] - - # order new contours the same way as the undeskewed contours - # (by calculating the offset of the largest contours, respectively, - # of the new and undeskewed image; then for each contour, - # finding the closest new contour, with proximity calculated - # as distance of their centers modulo offset vector) (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) M_22 = np.array(M)[:2, :2] - center0 = np.dot(M_22, center0) # [2, 1] - offset = center0 - center0_d # [2, 1] + p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) + x_diff = p_big[0] - cx_bigest_d_big + y_diff = p_big[1] - cy_biggest_d_big - centers = np.dot(M_22, centers) - offset # [2,N] - # add dimension for area (so only contours of similar size will be considered close) - centers = np.append(centers, areas_cnt_text_parent[np.newaxis], axis=0) - centers_d = np.append(centers_d, areas_cnt_text_d[np.newaxis], axis=0) + contours_only_text_parent_d_ordered = [] + for i in range(len(contours_only_text_parent)): + p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) + p[0] = p[0] - x_diff[0] + p[1] = p[1] - y_diff[0] + dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + + (p[1] - cy_biggest_d[j]) ** 2) + for j in range(len(cx_bigest_d))] + contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) + else: + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] + contours_only_text_parent = [] - dists = np.zeros((len(contours_only_text_parent), len(contours_only_text_parent_d))) - for i in range(len(contours_only_text_parent)): - dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) - corresp = np.zeros(dists.shape, dtype=bool) - # keep searching next-closest until at least one correspondence on each side - while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): - idx = np.nanargmin(dists) - i, j = np.unravel_index(idx, dists.shape) - dists[i, j] = np.nan - corresp[i, j] = True - #print("original/deskewed adjacency", corresp.nonzero()) - contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) - contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] - # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - # for i in range(len(contours_only_text_parent)): - # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 1, title="direct corresp contours") - # plt.imshow(img1) - # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - # join deskewed regions mapping to single original ones - for i in range(len(contours_only_text_parent)): - if np.count_nonzero(corresp[i]) > 1: - indices = np.flatnonzero(corresp[i]) - #print("joining", indices) - polygons_d = [contour2polygon(contour) - for contour in contours_only_text_parent_d[indices]] - contour_d = polygon2contour(join_polygons(polygons_d)) - contours_only_text_parent_d_ordered[i] = contour_d - # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) - # plt.subplot(2, 2, 3, title="joined contours") - # plt.imshow(img2) - # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - # split deskewed regions mapping to multiple original ones - def deskew(polygon): - polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) - polygon = shapely.affinity.translate(polygon, *offset.squeeze()) - return polygon - for j in range(len(contours_only_text_parent_d)): - if np.count_nonzero(corresp[:, j]) > 1: - indices = np.flatnonzero(corresp[:, j]) - #print("splitting along", indices) - polygons = [deskew(contour2polygon(contour)) - for contour in contours_only_text_parent[indices]] - polygon_d = contour2polygon(contours_only_text_parent_d[j]) - polygons_d = [make_intersection(polygon_d, polygon) - for polygon in polygons] - # ignore where there is no actual overlap - indices = indices[np.flatnonzero(polygons_d)] - contours_d = [polygon2contour(polygon_d) - for polygon_d in polygons_d - if polygon_d] - contours_only_text_parent_d_ordered[indices] = contours_d - # cv2.fillPoly(img3, pts=contours_d, color=j + 1) - # plt.subplot(2, 2, 4, title="split contours") - # plt.imshow(img3) - # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) - # for i in range(len(contours_only_text_parent)): - # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 2, title="result contours") - # plt.imshow(img4) - # plt.show() + else: + contours_only_text_parent_d_ordered = [] + contours_only_text_parent_d = [] if not len(contours_only_text_parent): # stop early diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f3dbae2..d847bb3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,7 +33,7 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): + x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. @@ -82,236 +82,230 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_start=[] x_end=[] + kind=[] len_sep=[] - y_mid=[] - y_max=[] + y_sep=[] + y_diff=[] new_main_sep_y=[] + indexer=0 for i in range(len(x_min_hor_some)): - #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) - starting = x_min_hor_some[i] - peak_points - min_start = np.flatnonzero(starting >= 0)[-1] # last left-of - ending = x_max_hor_some[i] - peak_points - max_end = np.flatnonzero(ending < 0)[0] # first right-of - #print(indexer, "%d:%d" % (min_start, max_end)) + starting=x_min_hor_some[i]-peak_points + starting=starting[starting>=0] + min_start=np.argmin(starting) + ending=peak_points-x_max_hor_some[i] + len_ending_neg=len(ending[ending<=0]) + + ending=ending[ending>0] + max_end=np.argmin(ending)+len_ending_neg if (max_end-min_start)>=2: - # column range of separator spans more than one column candidate if (max_end-min_start)==(len(peak_points)-1): - # all columns (i.e. could be true new y splitter) new_main_sep_y.append(indexer) - #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_mid.append(cy_hor_some[i]) - y_max.append(y_max_hor_some[i]) + y_sep.append(cy_hor_some[i]) + y_diff.append(cy_hor_diff[i]) x_end.append(max_end) - x_start.append(min_start) + + x_start.append( min_start) + len_sep.append(max_end-min_start) + if max_end==min_start+1: + kind.append(0) + else: + kind.append(1) + indexer+=1 - #print(x_start,'x_start') - #print(x_end,'x_end') x_start_returned = np.array(x_start, dtype=int) x_end_returned = np.array(x_end, dtype=int) - y_mid_returned = np.array(y_mid, dtype=int) - y_max_returned = np.array(y_max, dtype=int) - #print(y_mid_returned,'y_mid_returned') - #print(x_start_returned,'x_start_returned') - #print(x_end_returned,'x_end_returned') + y_sep_returned = np.array(y_sep, dtype=int) + y_diff_returned = np.array(y_diff, dtype=int) - # join/elongate separators if follow-up x and similar y - sep_pairs = contours_in_same_horizon(y_mid_returned) - if len(sep_pairs): - #print('burda') - args_to_be_unified = set() - y_mid_unified = [] - y_max_unified = [] - x_start_unified = [] - x_end_unified = [] - for pair in sep_pairs: - if (not np.array_equal(*x_start_returned[pair]) and - not np.array_equal(*x_end_returned[pair]) and - # immediately adjacent columns? - np.diff(x_end_returned[pair] - - x_start_returned[pair])[0] in [1, -1]): + all_args_uniq = contours_in_same_horizon(y_sep_returned) + args_to_be_unified=[] + y_unified=[] + y_diff_unified=[] + x_s_unified=[] + x_e_unified=[] + if len(all_args_uniq)>0: + if type(all_args_uniq[0]) is list: + for dd in range(len(all_args_uniq)): + if len(all_args_uniq[dd])==2: + x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] + x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] + y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] + y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] + if (x_s_same_hor[0]==x_e_same_hor[1]-1 or + x_s_same_hor[1]==x_e_same_hor[0]-1 and + x_s_same_hor[0]!=x_s_same_hor[1] and + x_e_same_hor[0]!=x_e_same_hor[1]): + for arg_in in all_args_uniq[dd]: + args_to_be_unified.append(arg_in) + y_selected=np.min(y_sep_same_hor) + y_diff_selected=np.max(y_diff_same_hor) + x_s_selected=np.min(x_s_same_hor) + x_e_selected=np.max(x_e_same_hor) - args_to_be_unified.union(set(pair)) - y_mid_unified.append(np.min(y_mid_returned[pair])) - y_max_unified.append(np.max(y_max_returned[pair])) - x_start_unified.append(np.min(x_start_returned[pair])) - x_end_unified.append(np.max(x_end_returned[pair])) - #print(pair,'pair') - #print(x_start_returned[pair],'x_s_same_hor') - #print(x_end_returned[pair],'x_e_same_hor') - #print(y_mid_unified,'y_mid_unified') - #print(y_max_unified,'y_max_unified') - #print(x_start_unified,'x_s_unified') - #print(x_end_unified,'x_e_selected') - #print('#############################') + x_s_unified.append(x_s_selected) + x_e_unified.append(x_e_selected) + y_unified.append(y_selected) + y_diff_unified.append(y_diff_selected) - if len(y_mid_unified): - args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), - list(args_to_be_unified), assume_unique=True) - #print(args_lines_not_unified,'args_lines_not_unified') - x_start_returned = np.append(x_start_returned[args_lines_not_unified], - x_start_unified, axis=0) - x_end_returned = np.append(x_end_returned[args_lines_not_unified], - x_end_unified, axis=0) - y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], - y_mid_unified, axis=0) - y_max_returned = np.append(y_max_returned[args_lines_not_unified], - y_max_unified, axis=0) - #print(y_mid_returned,'y_mid_returned2') - #print(x_start_returned,'x_start_returned2') - #print(x_end_returned,'x_end_returned2') + args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(new_main_sep_y,'new_main_sep_y') - #print(x_start,'x_start') - #print(x_end,'x_end') - x_start = np.array(x_start) - x_end = np.array(x_end) - y_mid = np.array(y_mid) - if len(new_main_sep_y): - # some full-width multi-span separators exist, so - # restrict the y range of separators to search for - # mutual overlaps to only those within the largest - # y strip between adjacent multi-span separators - # that involve at least one such full-width seps. - # (does not affect the separators to be returned) - min_ys=np.min(y_mid) - max_ys=np.max(y_mid) - #print(min_ys,'min_ys') - #print(max_ys,'max_ys') + x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) + x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) + y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) + y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - y_mains0 = list(y_mid[new_main_sep_y]) - y_mains = [min_ys] + y_mains0 + [max_ys] + for dv in range(len(y_unified)): + y_sep_returned_not_unified.append(y_unified[dv]) + y_diff_returned_not_unified.append(y_diff_unified[dv]) + x_start_returned_not_unified.append(x_s_unified[dv]) + x_end_returned_not_unified.append(x_e_unified[dv]) - y_mains = np.sort(y_mains) - argm = np.argmax(np.diff(y_mains)) - y_mid_new = y_mains[argm] - y_mid_next_new = y_mains[argm + 1] + x_start_returned = np.array(x_start_returned_not_unified, dtype=int) + x_end_returned = np.array(x_end_returned_not_unified, dtype=int) + y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) + y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) + + if len(new_main_sep_y)>0: + + min_ys=np.min(y_sep) + max_ys=np.max(y_sep) + + y_mains=[] + y_mains.append(min_ys) + y_mains_sep_ohne_grenzen=[] + + for ii in range(len(new_main_sep_y)): + y_mains.append(y_sep[new_main_sep_y[ii]]) + y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]]) + + y_mains.append(max_ys) + + y_mains_sorted=np.sort(y_mains) + diff=np.diff(y_mains_sorted) + argm=np.argmax(diff) + + y_min_new=y_mains_sorted[argm] + y_max_new=y_mains_sorted[argm+1] - #print(y_mid_new,argm,'y_mid_new') - #print(y_mid_next_new,argm+1,'y_mid_next_new') - #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') x_start=np.array(x_start) x_end=np.array(x_end) - y_mid=np.array(y_mid) - # iff either boundary is itself not a full-width separator, - # then include it in the range of separators to be kept - if y_mid_new in y_mains0: - where = y_mid > y_mid_new + kind=np.array(kind) + y_sep=np.array(y_sep) + if (y_min_new in y_mains_sep_ohne_grenzen and + y_max_new in y_mains_sep_ohne_grenzen): + x_start=x_start[(y_sep>y_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sep<=y_max_new)] + x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)] + kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)] + y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)] + elif (y_min_new not in y_mains_sep_ohne_grenzen and + y_max_new in y_mains_sep_ohne_grenzen): + x_start=x_start[(y_sep>=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep= y_mid_new - if y_mid_next_new in y_mains0: - where &= y_mid < y_mid_next_new - else: - where &= y_mid <= y_mid_next_new - x_start = x_start[where] - x_end = x_end[where] - y_mid = y_mid[where] - #print(x_start,'x_start') - #print(x_end,'x_end') + x_start=x_start[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - # remove redundant separators that span the same columns - # (keeping only 1 representative each) - deleted = set() - for index_i in range(len(x_start) - 1): - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(nodes_i, "nodes_i") - for index_j in range(index_i + 1, len(x_start)): - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(nodes_j, "nodes_j") - if nodes_i == nodes_j: - deleted.add(index_j) - #print(deleted,"deleted") - remained_sep_indexes = set(range(len(x_start))) - deleted - #print(remained_sep_indexes,'remained_sep_indexes') + deleted=[] + for i in range(len(x_start)-1): + nodes_i=set(range(x_start[i],x_end[i]+1)) + for j in range(i+1,len(x_start)): + if nodes_i==set(range(x_start[j],x_end[j]+1)): + deleted.append(j) - # determine which separators span which columns - mother = [] # whether the respective separator has a mother separator - child = [] # whether the respective separator has a child separator + remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) + + mother=[] + child=[] for index_i in remained_sep_indexes: have_mother=0 have_child=0 - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) + nodes_ind=set(range(x_start[index_i],x_end[index_i]+1)) for index_j in remained_sep_indexes: - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - if nodes_i < nodes_j: + nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1)) + if nodes_ind nodes_j: + if nodes_ind>nodes_ind_j: have_child=1 mother.append(have_mother) child.append(have_child) - #print(mother, "mother") - #print(child, "child") + + y_lines_without_mother=[] + x_start_without_mother=[] + x_end_without_mother=[] + + y_lines_with_child_without_mother=[] + x_start_with_child_without_mother=[] + x_end_with_child_without_mother=[] mother = np.array(mother) child = np.array(child) - #print(mother,'mother') - #print(child,'child') - remained_sep_indexes = np.array(list(remained_sep_indexes)) - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - reading_order_type = 0 - if len(remained_sep_indexes): - #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') - #print(np.array(mother),'mother') + remained_sep_indexes = np.array(list(remained_sep_indexes)) + x_start = np.array(x_start) + x_end = np.array(x_end) + y_sep = np.array(y_sep) + + if len(remained_sep_indexes)>1: remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)] - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] + y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] + reading_orther_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] + y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): - index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(index_i, nodes_i, "nodes_i without mother") - for j in range(i + 1, len(remained_sep_indexes_without_mother)): - index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(index_j, nodes_j, "nodes_j without mother") - if nodes_i - nodes_j != nodes_i: - #print("type=1") - reading_order_type = 1 + nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]], + x_end[remained_sep_indexes_without_mother[i]] + # + 1 + )) + for j in range(i+1,len(remained_sep_indexes_without_mother)): + nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]], + x_end[remained_sep_indexes_without_mother[j]] + # + 1 + )) + set_diff = nodes_i - nodes_j + if set_diff != nodes_i: + reading_orther_type = 1 else: - y_mid_without_mother = np.zeros(0, int) - x_start_without_mother = np.zeros(0, int) - x_end_without_mother = np.zeros(0, int) - y_mid_with_child_without_mother = np.zeros(0, int) - x_start_with_child_without_mother = np.zeros(0, int) - x_end_with_child_without_mother = np.zeros(0, int) - - #print(reading_order_type,'reading_order_type') - #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') - #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') - #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') + reading_orther_type = 0 len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') + there_is_sep_with_child = 0 if len_sep_with_child >= 1: there_is_sep_with_child = 1 - return (reading_order_type, + return (reading_orther_type, x_start_returned, x_end_returned, - y_mid_returned, - y_max_returned, - y_mid_without_mother, + y_sep_returned, + y_diff_returned, + y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, - y_mid_with_child_without_mother, + y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, new_main_sep_y) @@ -1211,19 +1205,12 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): - ##plt.imshow(textline_mask) - ##plt.show() +def order_of_regions(textline_mask, contours_main, contours_head, y_ref): y = textline_mask.sum(axis=1) # horizontal projection profile y_padded = np.zeros(len(y) + 40) y_padded[20 : len(y) + 20] = y sigma_gaus = 8 - #z = gaussian_filter1d(y_padded, sigma_gaus) - #peaks, _ = find_peaks(z, height=0) - #peaks = peaks - 20 - ##plt.plot(z) - ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1242,8 +1229,6 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): cx_main, cy_main = find_center_of_contours(contours_main) cx_head, cy_head = find_center_of_contours(contours_head) - # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) - # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int) matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) @@ -1256,9 +1241,7 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): matrix_of_orders[: len(contours_main), 4] = np.arange(len(contours_main)) matrix_of_orders[len(contours_main) :, 4] = np.arange(len(contours_head)) - # print(peaks_neg_new,'peaks_neg_new') - # print(matrix_of_orders,'matrix_of_orders') - # print(peaks_neg_new,np.max(peaks_neg_new)) + final_indexers_sorted = [] final_types = [] final_index_type = [] @@ -1266,31 +1249,12 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T - # if indexes_in.size: - # img = textline_mask.copy() - # plt.imshow(img) - # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) - # xrange = np.arange(0, img.shape[1], 50) - # yrange = np.arange(0, img.shape[0], 50) - # plt.gca().set_xticks(xrange, xrange + x_ref) - # plt.gca().set_yticks(yrange, yrange + y_ref) - # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): - # cnt = (contours_main if type_ == 1 else contours_head)[idx] - # col = 'red' if type_ == 1 else 'blue' - # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') - # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) - # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) - # plt.show() sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) final_index_type.extend(typed_indexes_in[sorted_inside]) - ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - - # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) - # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) @@ -1386,168 +1350,204 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): - separators_closeup = 1 * (region_pre_p == label_seps) - separators_closeup[0:110] = 0 - separators_closeup[-150:] = 0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, pixel_lines, contours_h=None): + region_pre_p = np.repeat(region_pre_p[:, :, np.newaxis], 3, axis=2) + separators_closeup=( (region_pre_p[:,:,:]==pixel_lines))*1 + cv2.imwrite("separators_closeup.png", separators_closeup*255) + separators_closeup[0:110,:,:]=0 + separators_closeup[separators_closeup.shape[0]-150:,:,:]=0 kernel = np.ones((5,5),np.uint8) - separators_closeup = separators_closeup.astype(np.uint8) - separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) + separators_closeup=separators_closeup.astype(np.uint8) + separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) + separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) - separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned + separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) + separators_closeup_n=np.copy(separators_closeup) + separators_closeup_n=separators_closeup_n.astype(np.uint8) - separators_closeup_n_binary = separators_closeup_n.copy() + separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) + separators_closeup_n_binary[:,:]=separators_closeup_n[:,:,0] + separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 - # find horizontal lines by contour properties - contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnts_hor_e = [] - for cnt in contours_sep_e: - max_xe = cnt[:, 0, 0].max() - min_xe = cnt[:, 0, 0].min() - max_ye = cnt[:, 0, 1].max() - min_ye = cnt[:, 0, 1].min() - med_ye = int(np.median(cnt[:, 0, 1])) - dist_xe = max_xe - min_xe - dist_ye = max_ye - min_ye - if dist_ye <= 50 and dist_xe >= 3 * dist_ye: - cnts_hor_e.append(cnt) + gray_early=np.repeat(separators_closeup_n_binary[:, :, np.newaxis], 3, axis=2) + gray_early=gray_early.astype(np.uint8) + imgray_e = cv2.cvtColor(gray_early, cv2.COLOR_BGR2GRAY) + ret_e, thresh_e = cv2.threshold(imgray_e, 0, 255, 0) - # delete horizontal contours (leaving only the edges) - separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, - cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(edges) - vertical = np.copy(edges) + contours_line_e,hierarchy_e=cv2.findContours(thresh_e,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ + find_features_of_lines(contours_line_e) + dist_ye = y_max_main - y_min_main + args_e=np.arange(len(contours_line_e)) + args_hor_e=args_e[(dist_ye<=50) & + (dist_xe>=3*dist_ye)] + cnts_hor_e=[] + for ce in args_hor_e: + cnts_hor_e.append(contours_line_e[ce]) + figs_e=np.zeros(thresh_e.shape) + figs_e=cv2.fillPoly(figs_e,pts=cnts_hor_e,color=(1,1,1)) - horizontal_size = horizontal.shape[1] // 30 - # find horizontal lines by morphology + separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=(0,0,0)) + gray = cv2.bitwise_not(separators_closeup_n_binary) + gray=gray.astype(np.uint8) + + bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ + cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(bw) + vertical = np.copy(bw) + + cols = horizontal.shape[1] + horizontal_size = cols // 30 + # Create structure element for extracting horizontal lines through morphology operations horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) - horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) - # re-insert deleted horizontal contours - horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) + # Apply morphology operations + horizontal = cv2.erode(horizontal, horizontalStructure) + horizontal = cv2.dilate(horizontal, horizontalStructure) - vertical_size = vertical.shape[0] // 30 - # find vertical lines by morphology - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) - vertical = cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) - vertical = cv2.dilate(vertical, kernel, iterations=1) + kernel = np.ones((5,5),np.uint8) + horizontal = cv2.dilate(horizontal,kernel,iterations = 2) + horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=(255,255,255)) + + rows = vertical.shape[0] + verticalsize = rows // 30 + # Create structure element for extracting vertical lines through morphology operations + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) + # Apply morphology operations + vertical = cv2.erode(vertical, verticalStructure) + vertical = cv2.dilate(vertical, verticalStructure) + vertical = cv2.dilate(vertical,kernel,iterations = 1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_sep_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ - find_features_of_lines(contours_sep_vers) + separators_closeup_new[:,:][vertical[:,:]!=0]=1 + separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - args=np.arange(len(slope_seps)) - args_ver=args[slope_seps==1] - dist_x_ver=dist_x[slope_seps==1] - y_min_seps_ver=y_min_seps[slope_seps==1] - y_max_seps_ver=y_max_seps[slope_seps==1] - x_min_seps_ver=x_min_seps[slope_seps==1] - x_max_seps_ver=x_max_seps[slope_seps==1] - cx_seps_ver=cx_seps[slope_seps==1] - dist_y_ver=y_max_seps_ver-y_min_seps_ver + vertical=np.repeat(vertical[:, :, np.newaxis], 3, axis=2) + vertical=vertical.astype(np.uint8) + + imgray = cv2.cvtColor(vertical, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + contours_line_vers,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ + find_features_of_lines(contours_line_vers) + + args=np.arange(len(slope_lines)) + args_ver=args[slope_lines==1] + dist_x_ver=dist_x[slope_lines==1] + y_min_main_ver=y_min_main[slope_lines==1] + y_max_main_ver=y_max_main[slope_lines==1] + x_min_main_ver=x_min_main[slope_lines==1] + x_max_main_ver=x_max_main[slope_lines==1] + cx_main_ver=cx_main[slope_lines==1] + dist_y_ver=y_max_main_ver-y_min_main_ver len_y=separators_closeup.shape[0]/3.0 - _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ - find_features_of_lines(contours_sep_hors) + horizontal=np.repeat(horizontal[:, :, np.newaxis], 3, axis=2) + horizontal=horizontal.astype(np.uint8) + imgray = cv2.cvtColor(horizontal, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + contours_line_hors,hierarchy=cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) + slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ + find_features_of_lines(contours_line_hors) - slope_seps_org_hor=slope_seps_org[slope_seps==0] - args=np.arange(len(slope_seps)) + slope_lines_org_hor=slope_lines_org[slope_lines==0] + args=np.arange(len(slope_lines)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_seps-y_min_seps) + dist_y=np.abs(y_max_main-y_min_main) - args_hor=args[slope_seps==0] - dist_x_hor=dist_x[slope_seps==0] - y_min_seps_hor=y_min_seps[slope_seps==0] - y_max_seps_hor=y_max_seps[slope_seps==0] - x_min_seps_hor=x_min_seps[slope_seps==0] - x_max_seps_hor=x_max_seps[slope_seps==0] - dist_y_hor=dist_y[slope_seps==0] - cy_seps_hor=cy_seps[slope_seps==0] + args_hor=args[slope_lines==0] + dist_x_hor=dist_x[slope_lines==0] + y_min_main_hor=y_min_main[slope_lines==0] + y_max_main_hor=y_max_main[slope_lines==0] + x_min_main_hor=x_min_main[slope_lines==0] + x_max_main_hor=x_max_main[slope_lines==0] + dist_y_hor=dist_y[slope_lines==0] + cy_main_hor=cy_main[slope_lines==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] - x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] - cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] - y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] - y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] + x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] + x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] + cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] + y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] + y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] + slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int) - matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor - matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver - matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver - matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 - matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver - matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 - matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver - matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor - matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver - matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor - matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor - matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver - matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor - matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver - matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor - matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver - matrix_of_seps_ch[len(cy_seps_hor):,9]=1 + matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) + matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor + matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver + matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver + matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 + matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver + matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 + matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver + matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor + matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver + matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor + matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor + matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver + matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor + matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver + matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor + matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver + matrix_of_lines_ch[len(cy_main_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ + _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) - args_head = np.arange(len(cy_head)) - matrix_l_n[:, 0] = args_head - matrix_l_n[:, 2] = x_min_head - matrix_l_n[:, 3] = x_max_head - matrix_l_n[:, 4] = dist_x_head - matrix_l_n[:, 5] = cy_head - matrix_l_n[:, 6] = y_min_head - matrix_l_n[:, 7] = y_max_head - matrix_l_n[:, 8] = y_max_head - y_min_head - matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) - matrix_of_seps_ch = np.append( - matrix_of_seps_ch, matrix_l_n, axis=0) + matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) + matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) + args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) - cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & - (x_max_seps_hor>=.84*region_pre_p.shape[1])] - cy_seps_splitters = np.append(cy_seps_splitters, special_separators) + matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head + matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 + matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 + matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head + matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 + matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 + matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 + matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 + matrix_of_lines_ch=np.copy(matrix_l_n) + cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & + (x_max_main_hor>=.84*region_pre_p.shape[1])] + cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) if contours_h is not None: - y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) - cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head) + try: + cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & + (x_max_main_head>=.84*region_pre_p.shape[1])] + cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) + except: + pass + args_cy_splitter=np.argsort(cy_main_splitters) + cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] - cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) - splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] - big_part = 22 * region_pre_p.shape[0] // 100 # percent height + splitter_y_new=[] + splitter_y_new.append(0) + for i in range(len(cy_main_splitters_sort)): + splitter_y_new.append( cy_main_splitters_sort[i] ) + splitter_y_new.append(region_pre_p.shape[0]) + splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 + + args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] regions_without_separators=return_regions_without_separators(region_pre_p) + length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - num_big_parts = 0 - for top, bot in pairwise(splitter_y_new): - if bot - top < big_part: - continue - num_big_parts += 1 + for itiles in args_big_parts: + regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): + int(splitter_y_new[itiles+1]),:,0] try: - num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], + num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, num_col_classifier, tables, multiplier=7.0) - #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1555,630 +1555,512 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_col_fin=num_col peaks_neg_fin_fin=peaks_neg_fin - if num_big_parts == 1 and len(peaks_neg_fin_fin) + 1 < num_col_classifier: + if len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)=500] peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n + return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, - num_col_classifier, erosion_hurts, tables, - right2left_readingorder, - logger=None): + num_col_classifier, erosion_hurts, tables, right2left_readingorder, logger=None): if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) - if logger is None: - logger = getLogger(__package__) - logger.debug('enter return_boxes_of_images_by_order_of_reading_new') - - # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): - # minx, maxx, miny, maxy = box or (0, None, 0, None) - # img = regions_without_separators[miny:maxy, minx:maxx] - # plt.imshow(img) - # xrange = np.arange(0, img.shape[1], 100) - # yrange = np.arange(0, img.shape[0], 100) - # ax = plt.gca() - # ax.set_xticks(xrange) - # ax.set_yticks(yrange) - # ax.set_xticklabels(xrange + minx) - # ax.set_yticklabels(yrange + miny) - # def format_coord(x, y): - # return 'x={:g}, y={:g}'.format(x + minx, y + miny) - # ax.format_coord = format_coord - # if title: - # plt.title(title) - # if rectangles: - # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): - # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # if rectangles_showidx: - # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') - # plt.show() - # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") - boxes=[] peaks_neg_tot_tables = [] - splitter_y_new = np.array(splitter_y_new, dtype=int) - height_tot, width_tot = regions_without_separators.shape - big_part = 22 * height_tot // 100 # percent height - for top, bot in pairwise(splitter_y_new): - # print("%d:%d" % (top, bot), 'i') - # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) - matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & - (matrix_of_lines_ch[:,7] < bot)] - #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - # check to see is there any vertical separator to find holes. - #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and - # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= - # 0.1 * (np.abs(bot-top))): + for i in range(len(splitter_y_new)-1): + matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & + (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] + try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) + if erosion_hurts: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], + num_col_classifier, tables, multiplier=6.) + else: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[int(splitter_y_new[i]):int(splitter_y_new[i+1]),:], + num_col_classifier, tables, multiplier=7.) except: peaks_neg_fin=[] num_col = 0 try: - if ((len(peaks_neg_fin) + 1 < num_col_classifier or - num_col_classifier == 6) and - # we do not expect to get all columns in small parts (headings etc.): - bot - top >= big_part): - # found too few columns here + peaks_neg_fin_org=np.copy(peaks_neg_fin) + if (len(peaks_neg_fin)+1)= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 + peaks_neg_fin2=[] + + if len(peaks_neg_fin1)>=len(peaks_neg_fin2): + peaks_neg_fin=list(np.copy(peaks_neg_fin1)) else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin=list(np.copy(peaks_neg_fin2)) + peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') + if i_n!=(len(peaks_neg_fin_early)-2): + peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) + peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): + peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) + num_col=len(peaks_neg_fin) else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) - #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) + num_col=len(peaks_neg_fin) + except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) - - all_columns = set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) - - # elongate horizontal separators+headings as much as possible without overlap - args_nonver = matrix_new[:, 9] != 1 - regions_with_separators = np.copy(regions_without_separators[top:bot]) - for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: - regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 - # def dbg_imshow(box, title): - # xmin, xmax, ymin, ymax = box - # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) - # plt.show() - for i in np.flatnonzero(args_nonver): - xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] - cut = regions_with_separators[ymin - top: ymax - top] - # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) - starting = xmin - peaks_neg_tot - min_start = np.flatnonzero(starting >= 0)[-1] # last left-of - ending = xmax - peaks_neg_tot - max_end = np.flatnonzero(ending < 0)[0] # first right-of - # skip elongation unless this is already a multi-column separator/heading: - if not max_end - min_start > 1: - continue - # is there anything left of min_start? - for j in range(min_start): - # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) - if not np.any(cut[:, peaks_neg_tot[j]: xmin]): - # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) - matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column - break - # is there anything right of max_end? - for j in range(len(peaks_neg_tot) - 1, max_end, -1): - # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j)) - if not np.any(cut[:, xmax: peaks_neg_tot[j]]): - # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j]) - matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column - break - - args_hor = matrix_new[:, 9] == 0 - x_min_hor_some = matrix_new[:, 2][args_hor] - x_max_hor_some = matrix_new[:, 3][args_hor] - y_max_hor_some = matrix_new[:, 7][args_hor] - cy_hor_some = matrix_new[:, 5][args_hor] - - args_head = matrix_new[:, 9] == 2 - x_min_hor_head = matrix_new[:, 2][args_head] - x_max_hor_head = matrix_new[:, 3][args_head] - y_min_hor_head = matrix_new[:, 6][args_head] - y_max_hor_head = matrix_new[:, 7][args_head] - cy_hor_head = matrix_new[:, 5][args_head] - - # split headings at toplines (y_min_head) and baselines (y_max_head) - # instead of merely adding their center (cy_head) as horizontal separator - # (x +/- 30px to avoid crossing col peaks by accident) - x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) - x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) - y_max_hor_some = np.append(y_max_hor_some, # baselines - np.concatenate((y_min_hor_head + 2, - y_max_hor_head + 2))) - cy_hor_some = np.append(cy_hor_some, # toplines - np.concatenate((y_min_hor_head - 2, - y_max_hor_head - 2))) + pass + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] if right2left_readingorder: - x_max_hor_some = width_tot - x_min_hor_some - x_min_hor_some = width_tot - x_max_hor_some + x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some + x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some + x_min_hor_some =list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) + peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) + peaks_neg_tot_tables.append(peaks_neg_tot) - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ + y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ + y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) + x_starting = np.array(x_starting) + x_ending = np.array(x_ending) + y_type_2 = np.array(y_type_2) + y_diff_type_2 = np.array(y_diff_type_2) - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) - - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with + if ((reading_order_type==1) or + (reading_order_type==0 and + (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) + y_grenze=int(splitter_y_new[i])+300 + args_early_ys=np.arange(len(y_type_2)) - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? - main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): + x_starting_up = x_starting[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + x_ending_up = x_ending[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + y_type_2_up = y_type_2[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + y_diff_type_2_up = y_diff_type_2[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + args_up = args_early_ys[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + if len(y_type_2_up) > 0: + y_main_separator_up = y_type_2_up [(x_starting_up==0) & + (x_ending_up==(len(peaks_neg_tot)-1) )] + y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & + (x_ending_up==(len(peaks_neg_tot)-1) )] + args_main_to_deleted = args_up[(x_starting_up==0) & + (x_ending_up==(len(peaks_neg_tot)-1) )] + + if len(y_diff_main_separator_up) > 0: args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() - #print(top,'top') - y_mid = y_mid[args_to_be_kept] + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + int(splitter_y_new[i]), int( np.max(y_diff_main_separator_up))]) + splitter_y_new[i]=[ np.max(y_diff_main_separator_up) ][0] + + y_type_2 = y_type_2[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] + y_diff_type_2 = y_diff_type_2[args_to_be_kept] - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up + y_grenze=int(splitter_y_new[i])+200 + args_early_ys2=np.arange(len(y_type_2)) + y_type_2_up=y_type_2[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + x_starting_up=x_starting[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + x_ending_up=x_ending[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + y_diff_type_2_up=y_diff_type_2[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] + args_up2=args_early_ys2[(y_type_2 > int(splitter_y_new[i])) & + (y_type_2 <= y_grenze)] - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + nodes_in = [] + for ij in range(len(x_starting_up)): + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) + nodes_in = np.unique(nodes_in) - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + if set(nodes_in)==set(range(len(peaks_neg_tot)-1)): + pass + elif set(nodes_in)==set(range(1, len(peaks_neg_tot)-1)): + pass + else: + args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] + if len(args_to_be_kept2)>0: + y_type_2 = y_type_2[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_diff_type_2 = y_diff_type_2[args_to_be_kept2] + else: + pass + elif len(y_diff_main_separator_up)==0: + nodes_in = [] + for ij in range(len(x_starting_up)): + nodes_in = nodes_in + list(range(int(x_starting_up[ij]), + int(x_ending_up[ij]))) + nodes_in = np.unique(nodes_in) - #int(top) - # order multi-column separators - y_mid_by_order=[] + if set(nodes_in)==set(range(len(peaks_neg_tot)-1)): + pass + elif set(nodes_in)==set(range(1,len(peaks_neg_tot)-1)): + pass + else: + args_to_be_kept2=np.array(list( set(args_early_ys) - set(args_up) )) + + if len(args_to_be_kept2)>0: + y_type_2 = y_type_2[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_diff_type_2 = y_diff_type_2[args_to_be_kept2] + else: + pass + + y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) + if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: + if reading_order_type==1: + y_lines_by_order.append(int(splitter_y_new[i])) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() + columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + columns_covered_by_mothers = columns_covered_by_mothers + \ + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) + columns_covered_by_mothers = list(set(columns_covered_by_mothers)) + + all_columns=np.arange(len(peaks_neg_tot)-1) + columns_not_covered=list(set(all_columns) - set(columns_covered_by_mothers)) + y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + len(x_start_without_mother))) + + x_starting = np.append(x_starting, columns_not_covered) x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) x_ending = np.append(x_ending, x_end_without_mother) - ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') + ind_args=np.arange(len(y_type_2)) for column in range(len(peaks_neg_tot)-1): - #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] + ind_args_in_col=np.array(ind_args_in_col) + + y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + ind_args_col_sorted=np.argsort(y_column) + y_col_sort=y_column[ind_args_col_sorted] + x_start_column_sort=x_start_column[ind_args_col_sorted] + x_end_column_sort=x_end_column[ind_args_col_sorted] + + for ii in range(len(y_col_sort)): + y_lines_by_order.append(y_col_sort[ii]) + x_start_by_order.append(x_start_column_sort[ii]) + x_end_by_order.append(x_end_column_sort[ii]-1) else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() + columns_covered_by_mothers = [] for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + columns_covered_by_mothers = columns_covered_by_mothers + \ + list(range(int(x_start_without_mother[dj]), + int(x_end_without_mother[dj]))) + columns_covered_by_mothers = list(set(columns_covered_by_mothers)) + + all_columns=np.arange(len(peaks_neg_tot)-1) + columns_not_covered=list(set(all_columns) - set(columns_covered_by_mothers)) + y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + len(x_start_without_mother))) + x_starting = np.append(x_starting, columns_not_covered) x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() + columns_covered_by_with_child_no_mothers = [] for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] + columns_covered_by_with_child_no_mothers = columns_covered_by_with_child_no_mothers + \ + list(range(int(x_start_with_child_without_mother[dj]), + int(x_end_with_child_without_mother[dj]))) + columns_covered_by_with_child_no_mothers = list(set(columns_covered_by_with_child_no_mothers)) + + all_columns = np.arange(len(peaks_neg_tot)-1) + columns_not_covered_child_no_mother = list(set(all_columns) - set(columns_covered_by_with_child_no_mothers)) for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: + columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) + ind_args = np.arange(len(y_type_2)) + x_end_with_child_without_mother = np.array(x_end_with_child_without_mother) + x_start_with_child_without_mother = np.array(x_start_with_child_without_mother) + for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] + x_end_biggest_column = x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] + args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & + (x_ending==x_end_biggest_column)] + y_column_nc = y_type_2[args_all_biggest_lines] + x_start_column_nc = x_starting[args_all_biggest_lines] + x_end_column_nc = x_ending[args_all_biggest_lines] + y_column_nc = np.sort(y_column_nc) + for i_c in range(len(y_column_nc)): + if i_c==(len(y_column_nc)-1): + ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & + (y_type_2=i_s_nc) & + (x_ending<=x_end_biggest_column)] + else: + ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & + (y_type_2=i_s_nc) & + (x_ending<=x_end_biggest_column)] + y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") + x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc + if len(x_diff_all_between_nm_wc)>0: + biggest=np.argmax(x_diff_all_between_nm_wc) - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] + columns_covered_by_mothers = [] + for dj in range(len(x_starting_all_between_nm_wc)): + columns_covered_by_mothers = columns_covered_by_mothers + \ + list(range(int(x_starting_all_between_nm_wc[dj]), + int(x_ending_all_between_nm_wc[dj]))) + columns_covered_by_mothers = list(set(columns_covered_by_mothers)) - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + all_columns=np.arange(i_s_nc, x_end_biggest_column) + columns_not_covered = list(set(all_columns) - set(columns_covered_by_mothers)) - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + should_longest_line_be_extended=0 + if (len(x_diff_all_between_nm_wc) > 0 and + set(list(range(int(x_starting_all_between_nm_wc[biggest]), + int(x_ending_all_between_nm_wc[biggest]))) + + list(columns_not_covered)) != set(all_columns)): + should_longest_line_be_extended=1 + index_lines_so_close_to_top_separator = \ + np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & + (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] + if len(index_lines_so_close_to_top_separator) > 0: + indexes_remained_after_deleting_closed_lines= \ + np.array(list(set(list(range(len(y_all_between_nm_wc)))) - + set(list(index_lines_so_close_to_top_separator)))) + if len(indexes_remained_after_deleting_closed_lines) > 0: + y_all_between_nm_wc = y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc = x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_ending_all_between_nm_wc = x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + + if len(x_diff_all_between_nm_wc) > 0: + try: + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + except: + pass + + y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, columns_not_covered) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(int(i_s_nc), int(x_end_biggest_column)): ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + ind_args_in_col=np.array(ind_args_in_col) + + y_column=y_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + ind_args_col_sorted=np.argsort(y_column) + y_col_sort=y_column[ind_args_col_sorted] + x_start_column_sort=x_start_column[ind_args_col_sorted] + x_end_column_sort=x_end_column[ind_args_col_sorted] + + for ii in range(len(y_col_sort)): + y_lines_by_order.append(y_col_sort[ii]) + x_start_by_order.append(x_start_column_sort[ii]) + x_end_by_order.append(x_end_column_sort[ii]-1) else: - #print(i_s_nc,'column not covered by mothers with child') ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] + ind_args_in_col=np.array(ind_args_in_col) + + y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] + ind_args_col_sorted=np.argsort(y_column) + y_col_sort=y_column[ind_args_col_sorted] + x_start_column_sort=x_start_column[ind_args_col_sorted] + x_end_column_sort=x_end_column[ind_args_col_sorted] + + for ii in range(len(y_col_sort)): + y_lines_by_order.append(y_col_sort[ii]) + x_start_by_order.append(x_start_column_sort[ii]) + x_end_by_order.append(x_end_column_sort[ii]-1) + + for il in range(len(y_lines_by_order)): + y_copy = list(y_lines_by_order) + x_start_copy = list(x_start_by_order) + x_end_copy = list(x_end_by_order) + + y_itself=y_copy.pop(il) + x_start_itself=x_start_copy.pop(il) + x_end_itself=x_end_copy.pop(il) + for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') + y_in_cols=[] + for yic in range(len(y_copy)): + if (y_copy[yic]>y_itself and + column>=x_start_copy[yic] and + column<=x_end_copy[yic]): + y_in_cols.append(y_copy[yic]) + + if len(y_in_cols)>0: + y_down=np.min(y_in_cols) + else: + y_down=[int(splitter_y_new[i+1])][0] + boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + peaks_neg_tot[column+1], + y_itself, + y_down]) except: - logger.exception("cannot assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") + int(splitter_y_new[i]), int(splitter_y_new[i+1])]) else: - # order multi-column separators - y_mid_by_order=[] + y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() + all_columns = np.arange(len(peaks_neg_tot)-1) + columns_covered_by_lines_covered_more_than_2col = [] for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + if set(list(range(int(x_starting[dj]),int(x_ending[dj]) ))) == set(all_columns): + pass + else: + columns_covered_by_lines_covered_more_than_2col = columns_covered_by_lines_covered_more_than_2col + \ + list(range(int(x_starting[dj]),int(x_ending[dj]) )) + columns_covered_by_lines_covered_more_than_2col = list(set(columns_covered_by_lines_covered_more_than_2col)) + columns_not_covered = list(set(all_columns) - set(columns_covered_by_lines_covered_more_than_2col)) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * (len(columns_not_covered) + 1)) + x_starting = np.append(x_starting, columns_not_covered) + x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) if len(new_main_sep_y) > 0: x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + x_ending = np.append(x_ending, len(peaks_neg_tot)-1) else: x_starting = np.append(x_starting, x_starting[0]) x_ending = np.append(x_ending, x_ending[0]) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - - ind_args = np.arange(len(y_mid)) + all_columns = np.arange(len(peaks_neg_tot)-1) + columns_not_covered = list(set(all_columns)) + y_type_2 = np.append(y_type_2, [int(splitter_y_new[i])] * len(columns_not_covered)) + x_starting = np.append(x_starting, columns_not_covered) + x_ending = np.append(x_ending, np.array(columns_not_covered) + 1) + ind_args=np.array(range(len(y_type_2))) + for column in range(len(peaks_neg_tot)-1): - #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] + ind_args_in_col=np.array(ind_args_in_col) + + y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + ind_args_col_sorted=np.argsort(y_column) + y_col_sort=y_column[ind_args_col_sorted] + x_start_column_sort=x_start_column[ind_args_col_sorted] + x_end_column_sort=x_end_column[ind_args_col_sorted] - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): - #print(column,'cols') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') + for ii in range(len(y_col_sort)): + y_lines_by_order.append(y_col_sort[ii]) + x_start_by_order.append(x_start_column_sort[ii]) + x_end_by_order.append(x_end_column_sort[ii]-1) + + for il in range(len(y_lines_by_order)): + y_copy = list(y_lines_by_order) + x_start_copy = list(x_start_by_order) + x_end_copy = list(x_end_by_order) + + y_itself=y_copy.pop(il) + x_start_itself=x_start_copy.pop(il) + x_end_itself=x_end_copy.pop(il) + + for column in range(int(x_start_itself), int(x_end_itself)+1): + y_in_cols=[] + for yic in range(len(y_copy)): + if (y_copy[yic]>y_itself and + column>=x_start_copy[yic] and + column<=x_end_copy[yic]): + y_in_cols.append(y_copy[yic]) + if len(y_in_cols)>0: + y_down=np.min(y_in_cols) + else: + y_down=[int(splitter_y_new[i+1])][0] boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + peaks_neg_tot[column+1], + y_itself, + y_down]) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = width_tot - boxes[i][1] - x_end_new = width_tot - boxes[i][0] + x_start_new = regions_without_separators.shape[1] - boxes[i][1] + x_end_new = regions_without_separators.shape[1] - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new - peaks_neg_tot_tables = peaks_neg_tot_tables_new - - # show final xy-cut - # dbg_plt(None, "final XY-Cut", boxes, True) - - logger.debug('exit return_boxes_of_images_by_order_of_reading_new') - return boxes, peaks_neg_tot_tables + return boxes, peaks_neg_tot_tables_new + else: + return boxes, peaks_neg_tot_tables def is_image_filename(fname: str) -> bool: return fname.lower().endswith(('.jpg', diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 22ef00d..66eaca4 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1497,24 +1497,26 @@ def separate_lines_new2(img_crop, thetha, num_col, slope_region, logger=None, pl return img_patch_interest_revised -@wrap_ndarray_shared(kw='img') -def do_image_rotation(angle, img=None, sigma_des=1.0, logger=None): - if logger is None: - logger = getLogger(__package__) - img_rot = rotate_image(img, angle) - img_rot[img_rot!=0] = 1 - try: - var = find_num_col_deskew(img_rot, sigma_des, 20.3) - except: - logger.exception("cannot determine variance for angle %.2f°", angle) - var = 0 - return var +def do_image_rotation(queue_of_all_params,angles_per_process, img_resized, sigma_des): + vars_per_each_subprocess = [] + angles_per_each_subprocess = [] + for mv in range(len(angles_per_process)): + img_rot=rotate_image(img_resized,angles_per_process[mv]) + img_rot[img_rot!=0]=1 + try: + var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) + except: + var_spectrum=0 + vars_per_each_subprocess.append(var_spectrum) + angles_per_each_subprocess.append(angles_per_process[mv]) + + queue_of_all_params.put([vars_per_each_subprocess, angles_per_each_subprocess]) def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, - main_page=False, logger=None, plotter=None, map=None): + main_page=False, logger=None, plotter=None): if main_page and plotter: plotter.save_plot_of_textline_density(img_patch_org) - + img_int=np.zeros((img_patch_org.shape[0],img_patch_org.shape[1])) img_int[:,:]=img_patch_org[:,:]#img_patch_org[:,:,0] @@ -1524,71 +1526,76 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, onset_x=int((img_resized.shape[1]-img_int.shape[1])/2.) onset_y=int((img_resized.shape[0]-img_int.shape[0])/2.) - #img_resized=np.zeros((int( img_int.shape[0]*(1.8) ) , int( img_int.shape[1]*(2.6) ) )) - #img_resized[ int( img_int.shape[0]*(.4)):int( img_int.shape[0]*(.4))+img_int.shape[0], - # int( img_int.shape[1]*(.8)):int( img_int.shape[1]*(.8))+img_int.shape[1] ]=img_int[:,:] img_resized[ onset_y:onset_y+img_int.shape[0] , onset_x:onset_x+img_int.shape[1] ]=img_int[:,:] if main_page and img_patch_org.shape[1] > img_patch_org.shape[0]: angles = np.array([-45, 0, 45, 90,]) - angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) - angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) elif main_page: - #angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) - angles = np.concatenate((np.linspace(-12, -7, n_tot_angles // 4), - np.linspace(-6, 6, n_tot_angles // 2), - np.linspace(7, 12, n_tot_angles // 4))) - angle, var = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) early_slope_edge=11 if abs(angle) > early_slope_edge: if angle < 0: - angles2 = np.linspace(-90, -12, n_tot_angles) + angles = np.linspace(-90, -12, n_tot_angles) else: - angles2 = np.linspace(90, 12, n_tot_angles) - angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter) - if var2 > var: - angle = angle2 + angles = np.linspace(90, 12, n_tot_angles) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) else: angles = np.linspace(-25, 25, int(0.5 * n_tot_angles) + 10) - angle, var = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) early_slope_edge=22 if abs(angle) > early_slope_edge: if angle < 0: - angles2 = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) + angles = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) else: - angles2 = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) - angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter) - if var2 > var: - angle = angle2 + angles = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) + angle = get_smallest_skew(img_resized, sigma_des, angles, plotter=plotter) + return angle -def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): - if logger is None: - logger = getLogger(__package__) - if map is None: - results = [do_image_rotation.__wrapped__(angle, img=img, sigma_des=sigma_des, logger=logger) - for angle in angles] - else: - with share_ndarray(img) as img_shared: - results = list(map(partial(do_image_rotation, img=img_shared, sigma_des=sigma_des, logger=None), - angles)) +def get_smallest_skew(img_resized, sigma_des, angles, plotter=None): + num_cores = cpu_count() + + queue_of_all_params = Queue() + processes = [] + nh = np.linspace(0, len(angles), num_cores + 1) + + for i in range(num_cores): + angles_per_process = angles[int(nh[i]) : int(nh[i + 1])] + processes.append(Process(target=do_image_rotation, args=(queue_of_all_params, angles_per_process, img_resized, sigma_des))) + + for i in range(num_cores): + processes[i].start() + + var_res=[] + all_angles = [] + for i in range(num_cores): + list_all_par = queue_of_all_params.get(True) + vars_for_subprocess = list_all_par[0] + angles_sub_process = list_all_par[1] + for j in range(len(vars_for_subprocess)): + var_res.append(vars_for_subprocess[j]) + all_angles.append(angles_sub_process[j]) + + for i in range(num_cores): + processes[i].join() + if plotter: - plotter.save_plot_of_rotation_angle(angles, results) + plotter.save_plot_of_rotation_angle(all_angles, var_res) + + try: - var_res = np.array(results) - assert var_res.any() - idx = np.argmax(var_res) - angle = angles[idx] - var = var_res[idx] + var_res=np.array(var_res) + ang_int=all_angles[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: - logger.exception("cannot determine best angle among %s", str(angles)) - angle = 0 - var = 0 - return angle, var + ang_int=0 + return ang_int @wrap_ndarray_shared(kw='textline_mask_tot_ea') def do_work_of_slopes_new(