diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c9bad52..e4a24e4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level): type=click.Path(file_okay=True, dir_okay=True), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), help="Override log level globally to this", ) -def binarization(patches, model_dir, input_image, dir_in, output, log_level): +def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level): assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." binarizer = SbbBinarizer(model_dir) if log_level: - binarizer.log.setLevel(getLevelName(log_level)) - binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) + binarizer.logger.setLevel(getLevelName(log_level)) + binarizer.run(overwrite=overwrite, + use_patches=patches, + image_path=input_image, + output=output, + dir_in=dir_in) @main.command() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 13acba6..46a1704 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -88,12 +88,7 @@ from .utils.contour import ( join_polygons, make_intersection, ) -from .utils.rotate import ( - rotate_image, - rotation_not_90_func, - rotation_not_90_func_full_layout, - rotation_image_new -) +from .utils.rotate import rotate_image from .utils.utils_ocr import ( return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, @@ -139,7 +134,6 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new ) from .utils.pil_cv2 import check_dpi, pil2cv -from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -2091,19 +2085,19 @@ class Eynollah: prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1,1,1)) @@ -2282,7 +2276,7 @@ class Eynollah: img_bin = resize_image(img_bin, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_texts_only = mask_texts_only.astype('uint8') @@ -2293,7 +2287,7 @@ class Eynollah: mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) @@ -2307,7 +2301,7 @@ class Eynollah: #plt.show() polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) @@ -2318,10 +2312,10 @@ class Eynollah: polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2377,7 +2371,7 @@ class Eynollah: prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) - mask_lines2 = (prediction_regions_org2[:,:,0] == 3) + mask_seps2 = (prediction_regions_org2[:,:,0] == 3) text_sume_early = (prediction_regions_org[:,:] == 1).sum() prediction_regions_org_copy = np.copy(prediction_regions_org) prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 @@ -2388,8 +2382,8 @@ class Eynollah: if not(is_image_enhanced and rate_two_models < RATIO_OF_TWO_MODEL_THRESHOLD): prediction_regions_org = np.copy(prediction_regions_org_copy) - prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + prediction_regions_org[(mask_seps2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 prediction_regions_org = cv2.erode(prediction_regions_org[:,:], KERNEL, iterations=2) prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], KERNEL, iterations=2) @@ -2411,20 +2405,20 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only, 1, 0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_lines, color=(3, 3, 3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_seps, color=(3, 3, 3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) @@ -2449,7 +2443,7 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - #mask_lines_only=(prediction_regions_org[:,:]==3)*1 + #mask_seps_only=(prediction_regions_org[:,:]==3)*1 #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) #prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -2457,19 +2451,19 @@ class Eynollah: #prediction_regions_org = prediction_regions_org[:,:,0] #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 - mask_lines_only = (prediction_regions_org == 3)*1 + mask_seps_only = (prediction_regions_org == 3)*1 mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2491,11 +2485,15 @@ class Eynollah: contours_only_text_parent) cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) + cx_main = np.array(cx_main, dtype=int) + cy_main = np.array(cy_main, dtype=int) + cx_head = np.array(cx_head, dtype=int) + cy_head = np.array(cy_head, dtype=int) def match_boxes(only_centers: bool): arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_main[ii] >= box[0] and cx_main[ii] < box[1] and @@ -2506,20 +2504,23 @@ class Eynollah: my_main[ii] >= box[2] and My_main[ii] < box[3])): arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True + box_found = True + # print("main/matched ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min + # print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_head[ii] >= box[0] and cx_head[ii] < box[1] and @@ -2530,20 +2531,21 @@ class Eynollah: my_head[ii] >= box[2] and My_head[ii] < box[3])): arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True + box_found = True + # print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min + # print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] + idx = 0 for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) @@ -2552,42 +2554,30 @@ class Eynollah: con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) + _, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted): + if kind == 1: + # print(iij, "main", args_contours_box_main[tidx], "becomes", idx) + order_by_con_main[args_contours_box_main[tidx]] = idx + else: + # print(iij, "head", args_contours_box_head[tidx], "becomes", idx) + order_by_con_head[args_contours_box_head[tidx]] = idx + idx += 1 - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) - return order_text_new, id_of_texts_tot + # xml writer will create region ids in order of + # - contours_only_text_parent (main text), followed by + # - contours_only_text_parent (headings), + # and then create regionrefs into these ordered by order_text_new + order_text_new = np.argsort(np.concatenate((order_by_con_main, + order_by_con_head))) + return order_text_new try: results = match_boxes(False) except Exception as why: - self.logger.error(why) + self.logger.exception(why) results = match_boxes(True) self.logger.debug("exit do_order_of_regions") @@ -2665,45 +2655,35 @@ class Eynollah: return layout_org, contours_new - def delete_separator_around(self, spliter_y,peaks_neg,image_by_region, pixel_line, pixel_table): + def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table): # format of subboxes: box=[x1, x2 , y1, y2] pix_del = 100 - if len(image_by_region.shape)==3: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_line] = 0 - - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_table] = 0 - else: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_line] = 0 - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_table] = 0 + for i in range(len(splitter_y)-1): + for j in range(1,len(peaks_neg[i])-1): + where = np.index_exp[splitter_y[i]: + splitter_y[i+1], + peaks_neg[i][j] - pix_del: + peaks_neg[i][j] + pix_del, + :] + if image_by_region.ndim < 3: + where = where[:2] + else: + print("image_by_region ndim is 3!") # rs + image_by_region[where][image_by_region[where] == label_seps] = 0 + image_by_region[where][image_by_region[where] == label_table] = 0 return image_by_region def add_tables_heuristic_to_layout( self, image_regions_eraly_p, boxes, - slope_mean_hor, spliter_y, peaks_neg_tot, image_revised, - num_col_classifier, min_area, pixel_line): + slope_mean_hor, splitter_y, peaks_neg_tot, image_revised, + num_col_classifier, min_area, label_seps): - pixel_table =10 - image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table) + label_table =10 + image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table) try: - image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0 - image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0 + image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0 + image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0 except: pass boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2714,7 +2694,7 @@ class Eynollah: _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - if indiv==pixel_table: + if indiv==label_table: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.001) else: @@ -2730,11 +2710,11 @@ class Eynollah: box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1 = (image_box == pixel_table) * 1 + image_box_tabels_1 = (image_box == label_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 - image_box_tabels_and_m_text = ( (image_box == pixel_table) | + image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( (image_box == label_table) | (image_box == 1) ).astype(np.uint8) * 1 image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) @@ -2796,7 +2776,7 @@ class Eynollah: y_up_tabs=[] for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2807,14 +2787,14 @@ class Eynollah: image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table return image_revised_last def get_tables_from_model(self, img, num_col_classifier): @@ -2952,8 +2932,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -2976,10 +2956,10 @@ class Eynollah: max(self.num_col_lower or num_col_classifier, num_col_classifier)) except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light) def run_graphics_and_columns_without_layout(self, textline_mask_tot_ea, img_bin_light): @@ -3029,8 +3009,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -3044,9 +3024,9 @@ class Eynollah: if not num_column_is_classified: num_col_classifier = num_col + 1 except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction) def run_enhancement(self, light_version): @@ -3101,13 +3081,13 @@ class Eynollah: return slope_deskew def run_marginals( - self, textline_mask_tot_ea, mask_images, mask_lines, + self, textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction): textline_mask_tot = textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p_1[mask_seps[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) if num_col_classifier in (1, 2): @@ -3131,12 +3111,10 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = rotation_not_90_func( - image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -3146,17 +3124,17 @@ class Eynollah: if self.tables: regions_without_separators[table_prediction ==1 ] = 1 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None - pixel_lines = 3 + label_seps = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3171,7 +3149,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3183,17 +3161,17 @@ class Eynollah: else: text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[(table_prediction == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) #print(time.time()-t_0_box,'time box in 3.2') img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3202,15 +3180,15 @@ class Eynollah: if self.light_version: pass else: - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3245,22 +3223,22 @@ class Eynollah: else: polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) #print(time.time()-t_0_box,'time box in 5') self.logger.debug('exit run_boxes_no_full_layout') - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables) @@ -3276,24 +3254,13 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None # regions_without_separators = ( text_regions_p[:,:]==1 | text_regions_p[:,:]==2 )*1 @@ -3303,24 +3270,13 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None @@ -3329,14 +3285,14 @@ class Eynollah: regions_without_separators = (text_regions_p[:,:] == 1)*1 regions_without_separators[table_prediction == 1] = 1 - pixel_lines=3 + label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3351,30 +3307,30 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3399,20 +3355,20 @@ class Eynollah: text_regions_p[img_revised_tab == 10] = 10 #img_revised_tab[img_revised_tab2 == 10] = 10 - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) # set first model with second model text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 @@ -3465,16 +3421,13 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout( - image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + regions_fully_n = rotate_image(regions_fully, slope_deskew) if not self.tables: - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None if not self.tables: @@ -3484,7 +3437,7 @@ class Eynollah: self.logger.debug('exit run_boxes_full_layout') #print("full inside 3", time.time()- t_full0) - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables) @@ -3632,7 +3585,7 @@ class Eynollah: co_text_all = contours_only_text_parent if not len(co_text_all): - return [], [] + return [] labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] @@ -3715,11 +3668,9 @@ class Eynollah: else: org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return org_contours_indexes, region_ids + return org_contours_indexes else: - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return ordered, region_ids + return ordered def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): width = np.shape(textline_image)[1] @@ -4213,7 +4164,7 @@ class Eynollah: image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, [], [], [], [], [], [], [], [], [], cont_page, [], []) if self.plotter: @@ -4254,7 +4205,6 @@ class Eynollah: order_text_new = [0] slopes =[0] - id_of_texts_tot =['region_0001'] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4266,7 +4216,7 @@ class Eynollah: ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( - cont_page, page_coord, order_text_new, id_of_texts_tot, + cont_page, page_coord, order_text_new, all_found_textline_polygons, page_coord, [], [], [], [], [], [], [], slopes, [], [], @@ -4301,7 +4251,7 @@ class Eynollah: slope_deskew = self.run_deskew(textline_mask_tot_ea) #print("text region early -2,5 in %.1fs", time.time() - t0) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, @@ -4318,7 +4268,7 @@ class Eynollah: confidence_matrix = np.zeros((text_regions_p_1.shape[:2])) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) @@ -4332,7 +4282,7 @@ class Eynollah: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], cont_page, [], []) return pcgts @@ -4356,12 +4306,12 @@ class Eynollah: image_page = resize_image(image_page,img_h_new, img_w_new ) textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) mask_images = resize_image(mask_images,img_h_new, img_w_new ) - mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) + mask_seps = resize_image(mask_seps, img_h_new, img_w_new) text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) textline_mask_tot, text_regions_p = \ - self.run_marginals(textline_mask_tot_ea, mask_images, mask_lines, + self.run_marginals(textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) @@ -4398,14 +4348,14 @@ class Eynollah: ## birdan sora chock chakir t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, @@ -4419,7 +4369,7 @@ class Eynollah: text_only = (img_revised_tab[:, :] == 1) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = (text_regions_p_1_n[:, :] == 1) * 1 + text_only_d = ((text_regions_p_d[:, :] == 1)) * 1 #print("text region early 2 in %.1fs", time.time() - t0) ###min_con_area = 0.000005 @@ -4493,42 +4443,42 @@ class Eynollah: dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) corresp = np.zeros(dists.shape, dtype=bool) # keep searching next-closest until at least one correspondence on each side - while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)): idx = np.nanargmin(dists) i, j = np.unravel_index(idx, dists.shape) dists[i, j] = np.nan corresp[i, j] = True - #print("original/deskewed adjacency", corresp.nonzero()) + # print("original/deskewed adjacency", corresp.nonzero()) contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.subplot(1, 4, 1, title="direct corresp contours") # plt.imshow(img1) # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # join deskewed regions mapping to single original ones for i in range(len(contours_only_text_parent)): if np.count_nonzero(corresp[i]) > 1: indices = np.flatnonzero(corresp[i]) - #print("joining", indices) + # print("joining", indices) polygons_d = [contour2polygon(contour) for contour in contours_only_text_parent_d[indices]] contour_d = polygon2contour(join_polygons(polygons_d)) contours_only_text_parent_d_ordered[i] = contour_d # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) - # plt.subplot(2, 2, 3, title="joined contours") + # plt.subplot(1, 4, 2, title="joined contours") # plt.imshow(img2) # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # split deskewed regions mapping to multiple original ones def deskew(polygon): polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) - polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + #polygon = shapely.affinity.translate(polygon, *offset.squeeze()) return polygon for j in range(len(contours_only_text_parent_d)): if np.count_nonzero(corresp[:, j]) > 1: indices = np.flatnonzero(corresp[:, j]) - #print("splitting along", indices) + # print("splitting along", indices) polygons = [deskew(contour2polygon(contour)) for contour in contours_only_text_parent[indices]] polygon_d = contour2polygon(contours_only_text_parent_d[j]) @@ -4541,21 +4491,45 @@ class Eynollah: if polygon_d] contours_only_text_parent_d_ordered[indices] = contours_d # cv2.fillPoly(img3, pts=contours_d, color=j + 1) - # plt.subplot(2, 2, 4, title="split contours") + # plt.subplot(1, 4, 3, title="split contours") # plt.imshow(img3) # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 2, title="result contours") + # plt.subplot(1, 4, 4, title="result contours") # plt.imshow(img4) # plt.show() + # from matplotlib import patches as ptchs + # plt.subplot(1, 2, 1, title="undeskewed") + # plt.imshow(text_only) + # centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # ctr = centers[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='blue', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='blue') + # plt.subplot(1, 2, 2, title="deskewed") + # plt.imshow(text_only_d) + # centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d_ordered)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # cnt = polygon2contour(deskew(contour2polygon(cnt))) + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # for i in range(len(contours_only_text_parent_d_ordered)): + # cnt = contours_only_text_parent_d_ordered[i] + # ctr = centers_d[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='red')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='red', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='red') + # plt.show() if not len(contours_only_text_parent): # stop early empty_marginals = [[]] * len(polygons_of_marginals) if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - [], [], page_coord, [], [], [], [], [], [], + [], [], page_coord, [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, @@ -4564,7 +4538,7 @@ class Eynollah: cont_page, polygons_seplines) else: pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, @@ -4695,18 +4669,18 @@ class Eynollah: label_seps = 6 if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -4717,13 +4691,13 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + boxes, _ = return_boxes_of_images_by_order_of_reading_new( + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: @@ -4744,14 +4718,14 @@ class Eynollah: self.logger.info("Headers ignored in reading order") if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + order_text_new = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") @@ -4848,7 +4822,7 @@ class Eynollah: if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, @@ -4861,7 +4835,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 848bbac..6289517 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -70,7 +70,7 @@ class SbbBinarizeProcessor(Processor): if oplevel == 'page': self.logger.info("Binarizing on 'page' level in page '%s'", page_id) - page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + page_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(page_image), use_patches=True)) # update PAGE (reference the image file): page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') page.add_AlternativeImage(page_image_ref) @@ -83,7 +83,7 @@ class SbbBinarizeProcessor(Processor): for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True)) + region_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(region_image), use_patches=True)) # update PAGE (reference the image file): region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') region.add_AlternativeImage(region_image_ref) @@ -95,7 +95,7 @@ class SbbBinarizeProcessor(Processor): self.logger.warning("Page '%s' contains no text lines", page_id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True)) + line_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(line_image), use_patches=True)) # update PAGE (reference the image file): line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') line.add_AlternativeImage(region_image_ref) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 3716987..b81f45e 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -25,7 +25,7 @@ class SbbBinarizer: def __init__(self, model_dir, logger=None): self.model_dir = model_dir - self.log = logger if logger else logging.getLogger('SbbBinarizer') + self.logger = logger if logger else logging.getLogger('SbbBinarizer') self.start_new_session() @@ -315,64 +315,46 @@ class SbbBinarizer: prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): - # print(dir_in,'dir_in') - if not dir_in: - if (image is not None and image_path is not None) or \ - (image is None and image_path is None): - raise ValueError("Must pass either a opencv2 image or an image_path") - if image_path is not None: - image = cv2.imread(image_path) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - - res = self.predict(model, image, use_patches) - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin - - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - if output: - cv2.imwrite(output, img_last) - return img_last + def run(self, image_path=None, output=None, dir_in=None, use_patches=False, overwrite=False): + if dir_in: + ls_imgs = [(os.path.join(dir_in, image_filename), + os.path.join(output, os.path.splitext(image_filename)[0] + '.png')) + for image_filename in filter(is_image_filename, + os.listdir(dir_in))] else: - ls_imgs = list(filter(is_image_filename, os.listdir(dir_in))) - for image_name in ls_imgs: - image_stem = image_name.split('.')[0] - print(image_name,'image_name') - image = cv2.imread(os.path.join(dir_in,image_name) ) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) + ls_imgs = [(image_path, output)] - res = self.predict(model, image, use_patches) + for input_path, output_path in ls_imgs: + print(input_path, 'image_name') + if os.path.exists(output_path): + if overwrite: + self.logger.warning("will overwrite existing output file '%s'", output_path) + else: + self.logger.warning("will skip input for existing output file '%s'", output_path) + image = cv2.imread(input_path) + result = self.run_single(image, use_patches) + cv2.imwrite(output_path, result) - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res + def run_single(self, image: np.ndarray, use_patches=False): + img_last = 0 + for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): + self.logger.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin + res = self.predict(model, image, use_patches) - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - - cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last) + img_fin = np.zeros((res.shape[0], res.shape[1], 3)) + res[:, :][res[:, :] == 0] = 2 + res = res - 1 + res = res * 255 + img_fin[:, :, 0] = res + img_fin[:, :, 1] = res + img_fin[:, :, 2] = res + + img_fin = img_fin.astype(np.uint8) + img_fin = (res[:, :] == 0) * 255 + img_last = img_last + img_fin + + kernel = np.ones((5, 5), np.uint8) + img_last[:, :][img_last[:, :] > 0] = 255 + img_last = (img_last[:, :] == 0) * 255 + return img_last diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 5ccb2af..307d8f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -5,6 +5,7 @@ import math try: import matplotlib.pyplot as plt + import matplotlib.patches as patches except ImportError: plt = None import numpy as np @@ -20,6 +21,7 @@ from .contour import (contours_in_same_horizon, return_contours_of_image, return_parent_contours) + def pairwise(iterable): # pairwise('ABCDEFG') → AB BC CD DE EF FG @@ -30,286 +32,132 @@ def pairwise(iterable): yield a, b a = b -def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): +def return_multicol_separators_x_start_end( + regions_without_separators, peak_points, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some): + """ + Analyse which separators overlap multiple column candidates, + and how they overlap each other. - x_start=[] - x_end=[] - kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0 - len_sep=[] - y_sep=[] - y_diff=[] - new_main_sep_y=[] + Ignore separators not spanning multiple columns. - indexer=0 + For the separators to be returned, try to remove or unify them when there + is no region between them (vertically) and their neighbours. + + Arguments: + * the text mask (with all separators suppressed) + * the x column coordinates + * the y start coordinate to consider in total + * the y end coordinate to consider in total + * the x start coordinate of the horizontal separators + * the x end coordinate of the horizontal separators + * the y start coordinate of the horizontal separators + * the y center coordinate of the horizontal separators + * the y end coordinate of the horizontal separators + + Returns: + a tuple of: + * the x start column index of the resulting multi-span separators + * the x end column index of the resulting multi-span separators + * the y start coordinate of the resulting multi-span separators + * the y center coordinate of the resulting multi-span separators + * the y end coordinate of the resulting multi-span separators + """ + + x_start = [0] + x_end = [len(peak_points) - 1] + y_min = [top] + y_mid = [top] + y_max = [top + 2] + indexer = 1 for i in range(len(x_min_hor_some)): - starting=x_min_hor_some[i]-peak_points - starting=starting[starting>=0] - min_start=np.argmin(starting) - ending=peak_points-x_max_hor_some[i] - len_ending_neg=len(ending[ending<=0]) - - ending=ending[ending>0] - max_end=np.argmin(ending)+len_ending_neg + #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) + starting = x_min_hor_some[i] - peak_points + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = x_max_hor_some[i] - peak_points + max_end = np.flatnonzero(ending <= 0)[0] # first right-of + #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: - if (max_end-min_start)==(len(peak_points)-1): - new_main_sep_y.append(indexer) - + # column range of separator spans more than one column candidate #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_sep.append(cy_hor_some[i]) - y_diff.append(cy_hor_diff[i]) + y_min.append(y_min_hor_some[i]) + y_mid.append(cy_hor_some[i]) + y_max.append(y_max_hor_some[i]) x_end.append(max_end) - - x_start.append( min_start) - - len_sep.append(max_end-min_start) - if max_end==min_start+1: - kind.append(0) - else: - kind.append(1) - + x_start.append(min_start) indexer+=1 - - x_start_returned = np.array(x_start, dtype=int) - x_end_returned = np.array(x_end, dtype=int) - y_sep_returned = np.array(y_sep, dtype=int) - y_diff_returned = np.array(y_diff, dtype=int) - - all_args_uniq = contours_in_same_horizon(y_sep_returned) - args_to_be_unified=[] - y_unified=[] - y_diff_unified=[] - x_s_unified=[] - x_e_unified=[] - if len(all_args_uniq)>0: - #print('burda') - if type(all_args_uniq[0]) is list: - for dd in range(len(all_args_uniq)): - if len(all_args_uniq[dd])==2: - x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] - x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] - y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] - y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] - #print('burda2') - if (x_s_same_hor[0]==x_e_same_hor[1]-1 or - x_s_same_hor[1]==x_e_same_hor[0]-1 and - x_s_same_hor[0]!=x_s_same_hor[1] and - x_e_same_hor[0]!=x_e_same_hor[1]): - #print('burda3') - for arg_in in all_args_uniq[dd]: - #print(arg_in,'arg_in') - args_to_be_unified.append(arg_in) - y_selected=np.min(y_sep_same_hor) - y_diff_selected=np.max(y_diff_same_hor) - x_s_selected=np.min(x_s_same_hor) - x_e_selected=np.max(x_e_same_hor) - - x_s_unified.append(x_s_selected) - x_e_unified.append(x_e_selected) - y_unified.append(y_selected) - y_diff_unified.append(y_diff_selected) - #print(x_s_same_hor,'x_s_same_hor') - #print(x_e_same_hor[:]-1,'x_e_same_hor') - #print('#############################') - #print(x_s_unified,'y_selected') - #print(x_e_unified,'x_s_selected') - #print(y_unified,'x_e_same_hor') - - args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(args_lines_not_unified,'args_lines_not_unified') - - x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) - x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) - y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) - y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - - for dv in range(len(y_unified)): - y_sep_returned_not_unified.append(y_unified[dv]) - y_diff_returned_not_unified.append(y_diff_unified[dv]) - x_start_returned_not_unified.append(x_s_unified[dv]) - x_end_returned_not_unified.append(x_e_unified[dv]) - - #print(y_sep_returned,'y_sep_returned') - #print(x_start_returned,'x_start_returned') - #print(x_end_returned,'x_end_returned') - - x_start_returned = np.array(x_start_returned_not_unified, dtype=int) - x_end_returned = np.array(x_end_returned_not_unified, dtype=int) - y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) - y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) - - #print(y_sep_returned,'y_sep_returned2') - #print(x_start_returned,'x_start_returned2') - #print(x_end_returned,'x_end_returned2') - #print(new_main_sep_y,'new_main_sep_y') - #print(x_start,'x_start') #print(x_end,'x_end') - if len(new_main_sep_y)>0: - min_ys=np.min(y_sep) - max_ys=np.max(y_sep) - - y_mains=[] - y_mains.append(min_ys) - y_mains_sep_ohne_grenzen=[] - - for ii in range(len(new_main_sep_y)): - y_mains.append(y_sep[new_main_sep_y[ii]]) - y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]]) - - y_mains.append(max_ys) - - y_mains_sorted=np.sort(y_mains) - diff=np.diff(y_mains_sorted) - argm=np.argmax(diff) - - y_min_new=y_mains_sorted[argm] - y_max_new=y_mains_sorted[argm+1] - - #print(y_min_new,'y_min_new') - #print(y_max_new,'y_max_new') - #print(y_sep[new_main_sep_y[0]],y_sep,'yseps') - x_start=np.array(x_start) - x_end=np.array(x_end) - kind=np.array(kind) - y_sep=np.array(y_sep) - if (y_min_new in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>y_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sep<=y_max_new)] - #print('burda1') - x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)] - #print('burda2') - kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)] - elif (y_min_new not in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep<=y_max_new)] - x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + x_start = np.array(x_start, dtype=int) + x_end = np.array(x_end, dtype=int) + y_min = np.array(y_min, dtype=int) + y_mid = np.array(y_mid, dtype=int) + y_max = np.array(y_max, dtype=int) + #print(y_mid,'y_mid') #print(x_start,'x_start') #print(x_end,'x_end') - #print(len_sep) - deleted=[] - for i in range(len(x_start)-1): - nodes_i=set(range(x_start[i],x_end[i]+1)) - for j in range(i+1,len(x_start)): - if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.append(j) - #print(np.unique(deleted)) + # remove redundant separators (with nothing in between) + args_emptysep = set() + args_ysorted = np.argsort(y_mid) + for i in range(len(y_mid)): + # find nearest neighbours above with nothing in between + prev = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] >= y_mid) & + # complete subsumption: + # (x_start[i] >= x_start) & + # (x_end[i] <= x_end) + # partial overlap + (x_start[i] < x_end) & + (x_end[i] > x_start) + ) + prev[list(args_emptysep)] = False # but no pair we already saw + if not prev.any(): + continue + prev = np.flatnonzero(prev[args_ysorted]) + j = args_ysorted[prev[-1]] + if not np.any(regions_without_separators[y_max[j]: y_min[i], + peak_points[min(x_start[i], x_start[j])]: + peak_points[max(x_end[i], x_end[j])]]): + args_emptysep.add(i) + if x_start[j] > x_start[i]: + # print(j, "now starts at", x_start[i]) + x_start[j] = x_start[i] + if x_end[j] < x_end[i]: + x_end[j] = x_end[i] + # print(j, "now ends at", x_end[i]) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty prev sep") + continue + # find nearest neighbours below with nothing in between + nExt = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] <= y_mid) & + (x_start[i] >= x_start) & + (x_end[i] <= x_end)) + nExt[list(args_emptysep)] = False # but no pair we already saw + if not nExt.any(): + continue + nExt = np.flatnonzero(nExt[args_ysorted]) + j = args_ysorted[nExt[0]] + if not np.any(regions_without_separators[y_max[i]: y_min[j], + peak_points[x_start[i]]: + peak_points[x_end[i]]]): + args_emptysep.add(i) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") + args_to_be_kept = [arg for arg in args_ysorted + if arg not in args_emptysep] + x_start = x_start[args_to_be_kept] + x_end = x_end[args_to_be_kept] + y_min = y_min[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] + y_max = y_max[args_to_be_kept] - remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) - #print(remained_sep_indexes,'remained_sep_indexes') - mother=[]#if it has mother - child=[] - for index_i in remained_sep_indexes: - have_mother=0 - have_child=0 - nodes_ind=set(range(x_start[index_i],x_end[index_i]+1)) - for index_j in remained_sep_indexes: - nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1)) - if nodes_indnodes_ind_j: - have_child=1 - mother.append(have_mother) - child.append(have_child) - - #print(mother,'mother') - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens') - y_lines_without_mother=[] - x_start_without_mother=[] - x_end_without_mother=[] - - y_lines_with_child_without_mother=[] - x_start_with_child_without_mother=[] - x_end_with_child_without_mother=[] - - mother = np.array(mother) - child = np.array(child) - #print(mother,'mother') - #print(child,'child') - remained_sep_indexes = np.array(list(remained_sep_indexes)) - x_start = np.array(x_start) - x_end = np.array(x_end) - y_sep = np.array(y_sep) - - if len(remained_sep_indexes)>1: - #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') - #print(np.array(mother),'mother') - remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] - remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)] - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - - x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] - x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] - - reading_orther_type=0 - x_end_without_mother = x_end[remained_sep_indexes_without_mother] - x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] - - if len(remained_sep_indexes_without_mother)>=2: - for i in range(len(remained_sep_indexes_without_mother)-1): - nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]], - x_end[remained_sep_indexes_without_mother[i]] - # + 1 - )) - for j in range(i+1,len(remained_sep_indexes_without_mother)): - nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]], - x_end[remained_sep_indexes_without_mother[j]] - # + 1 - )) - set_diff = nodes_i - nodes_j - if set_diff != nodes_i: - reading_orther_type = 1 - else: - reading_orther_type = 0 - #print(reading_orther_type,'javab') - #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') - #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') - #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') - - len_sep_with_child = len(child[child==1]) - - #print(len_sep_with_child,'len_sep_with_child') - there_is_sep_with_child = 0 - if len_sep_with_child >= 1: - there_is_sep_with_child = 1 - #print(all_args_uniq,'all_args_uniq') - #print(args_to_be_unified,'args_to_be_unified') - - return (reading_orther_type, - x_start_returned, - x_end_returned, - y_sep_returned, - y_diff_returned, - y_lines_without_mother, - x_start_without_mother, - x_end_without_mother, - there_is_sep_with_child, - y_lines_with_child_without_mother, - x_start_with_child_without_mother, - x_end_with_child_without_mother, - new_main_sep_y) + return (x_start, + x_end, + y_min, + y_mid, + y_max) def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]: return (box[1], box[1] + box[3], @@ -393,19 +241,23 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_separators_0, sigma_) return np.std(z) -def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): +def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False): if not regions_without_separators.any(): return 0, [] - #plt.imshow(regions_without_separators) - #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) - ##plt.plot(regions_without_separators_0) - ##plt.show() - sigma_ = 35 # 70#35 - meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(regions_without_separators_0) + # plt.show() + sigma_ = 25 # 70#35 + meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero + last_nonzero = last_nonzero - 50 #- 100 + first_nonzero = first_nonzero + 50 #+ 200 + last_offmargin = len(regions_without_separators_0) - 170 #370 + first_offmargin = 170 #370 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -416,112 +268,141 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) - peaks_neg, _ = find_peaks(zneg, height=0) - #plt.plot(zneg) - #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') - #plt.show() peaks, _ = find_peaks(z, height=0) + peaks_neg, _ = find_peaks(zneg, height=0) + # _, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.set_title("z") + # ax1.plot(z) + # ax1.scatter(peaks, z[peaks]) + # ax1.axvline(0.06 * len(y), label="first") + # ax1.axvline(0.94 * len(y), label="last") + # ax1.text(0.06 * len(y), 0, "first", rotation=90) + # ax1.text(0.94 * len(y), 0, "last", rotation=90) + # ax1.axhline(10, label="minimum") + # ax1.text(0, 10, "minimum") + # ax2.set_title("zneg") + # ax2.plot(zneg) + # ax2.scatter(peaks_neg, zneg[peaks_neg]) + # ax2.axvline(first_nonzero, label="first nonzero") + # ax2.axvline(last_nonzero, label="last nonzero") + # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) + # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) + # ax2.axvline(first_offmargin, label="first offmargin") + # ax2.axvline(last_offmargin, label="last offmargin") + # ax2.text(first_offmargin, 0, "first offmargin", rotation=90) + # ax2.text(last_offmargin, 0, "last offmargin", rotation=90) + # plt.show() peaks_neg = peaks_neg - 10 - 10 - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & - (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) & - (peaks < 0.94 * regions_without_separators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < (regions_without_separators.shape[1] - 370))] + # print("raw peaks", peaks) + peaks = peaks[(peaks > 0.06 * len(y)) & + (peaks < 0.94 * len(y))] + # print("non-marginal peaks", peaks) interest_pos = z[peaks] + # print("interest_pos", interest_pos) interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] + # plt.plot(z) # plt.show() + #print("raw peaks_neg", peaks_neg) + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & + (peaks_neg < last_nonzero)] + #print("non-zero peaks_neg", peaks_neg) + peaks_neg = peaks_neg[(peaks_neg > first_offmargin) & + (peaks_neg < last_offmargin)] + #print("non-marginal peaks_neg", peaks_neg) interest_neg = z[peaks_neg] + #print("interest_neg", interest_neg) if not interest_neg.any(): return 0, [] min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) - if max_peaks_pos / min_peaks_pos >= 35: + #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax') + if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35: min_peaks_pos = np.mean(interest_pos) min_peaks_neg = 0 # np.min(interest_neg) - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') + # cutoff criterion: fixed fraction of lowest column height dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei - # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 + + # extra criterion: fixed multiple of lowest gap height + grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z, color='red', label='z') + # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.scatter(peaks_neg, z[peaks_neg], color='red') + # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') + # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") + # ax2.axhline(grenze, color='blue', label="grenze") + # ax2.text(0, grenze, "grenze") + # plt.show() interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] - # interest_neg_fin=interest_neg[(interest_neg= 3: - index_sort_interest_neg_fin= np.argsort(interest_neg_fin) - peaks_neg_sorted = np.array(peaks_neg)[index_sort_interest_neg_fin] - interest_neg_fin_sorted = np.array(interest_neg_fin)[index_sort_interest_neg_fin] + # found too few columns here: ignore 'grenze' and take the deepest N peaks + sort_by_height = np.argsort(interest_neg)[:num_col_classifier] + peaks_neg_fin = peaks_neg[sort_by_height] + interest_neg_fin = interest_neg[sort_by_height] + # print(peaks_neg_fin, "peaks_neg[sorted_by_height]") + sort_by_pos = np.argsort(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin[sort_by_pos] + interest_neg_fin = interest_neg_fin[sort_by_pos] - if len(index_sort_interest_neg_fin)>=num_col_classifier: - peaks_neg_fin = list( peaks_neg_sorted[:num_col_classifier] ) - interest_neg_fin = list( interest_neg_fin_sorted[:num_col_classifier] ) - else: - peaks_neg_fin = peaks_neg[:] - interest_neg_fin = interest_neg[:] - - num_col = (len(interest_neg_fin)) + 1 + num_col = len(interest_neg_fin) + 1 # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - if num_col == 3: - if ((peaks_neg_fin[0] > p_g_u and - peaks_neg_fin[1] > p_g_u) or - (peaks_neg_fin[0] < p_g_l and - peaks_neg_fin[1] < p_g_l) or - (peaks_neg_fin[0] + 200 < p_m and - peaks_neg_fin[1] < p_m) or - (peaks_neg_fin[0] - 200 > p_m and - peaks_neg_fin[1] > p_m)): - num_col = 1 - peaks_neg_fin = [] - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u or - peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] + # cancel if resulting split is highly unbalanced across available width + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_fin = [] ##print(len(peaks_neg_fin)) + # filter out peaks that are too close (<400px) to each other: + # among each group, pick the position with smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) - cut_off = 400 + cut_off = 300 #400 peaks_neg_true = [] forest = [] - # print(len(peaks_neg_fin),'len_') - for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: + else: # print(forest[np.argmin(z[forest]) ] ) if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) @@ -533,68 +414,61 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl peaks_neg_true.append(forest[np.argmin(z[forest])]) num_col = len(peaks_neg_true) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - + #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') - if num_col == 3: - if ((peaks_neg_true[0] > p_g_u and - peaks_neg_true[1] > p_g_u) or - (peaks_neg_true[0] < p_g_l and - peaks_neg_true[1] < p_g_l) or - (peaks_neg_true[0] < p_m and - peaks_neg_true[1] + 200 < p_m) or - (peaks_neg_true[0] - 200 > p_m and - peaks_neg_true[1] > p_m)): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and - peaks_neg_true[0] > p_g_l and - peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and - peaks_neg_true[1] > p_g_l and - peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] + # cancel if resulting split is highly unbalanced across available width + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_true = [] + elif (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[0]] + elif (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[1]] - if num_col == 2: - if (peaks_neg_true[0] > p_g_u or - peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] + # get rid of too narrow columns (not used) + # if np.count_nonzero(diff_peaks < 360): + # arg_help = np.arange(len(diff_peaks)) + # arg_help_ann = arg_help[diff_peaks < 360] + # peaks_neg_fin_new = [] + # for ii in range(len(peaks_neg_fin)): + # if ii in arg_help_ann: + # if interest_neg_fin[ii] < interest_neg_fin[ii + 1]: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - diff_peaks_abnormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_abnormal) > 0: - arg_help = np.arange(len(diff_peaks)) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 1) not in arg_help_ann: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin + # elif (ii - 1) not in arg_help_ann: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new = peaks_neg_fin # plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot([0,len(y)], [grenze,grenze]) # plt.show() ##print(len(peaks_neg_true)) + #print(peaks_neg_true, "peaks_neg_true") return len(peaks_neg_true), peaks_neg_true def find_num_col_only_image(regions_without_separators, multiplier=3.8): @@ -806,6 +680,12 @@ def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): peaks, _ = find_peaks(z, height=0) # print(peaks,'peaksnew') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True, suptitle='find_num_col_by_vertical_lines') + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks, z[peaks]) + # ax2.set_title('find_peaks(regions_without_separators.sum(axis=0), height=0)') + # plt.show() return peaks def return_regions_without_separators(regions_pre): @@ -1192,7 +1072,26 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_head, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): + """ + Order text region contours within a single column bbox in a top-down-left-right way. + + First, determine the vertical gaps. Then iterate over each vertical segment, + identifying the contours centered in that segment. Order them by their + horizontal center, and add them to the overall order. + + Arguments: + * textline_mask: the mask of the textline segmentation, cropped for that box + * contours_main: the paragraph text region contours expected to be here + * contours_head: the heading text region contours expected to be here + * y_ref: the vertical offset of that box within the page + * x_ref: the horizontal offset of that box within the page + + Returns: a tuple of + * the array of contour indexes overall within this box (i.e. into main+head) + * the array of types (1 for paragraph, 2 for heading) + * the array of contour indexes for the respective type (i.e. into contours_main or contours_head) + """ ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1203,6 +1102,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 + ##plt.plot(z) + ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1211,15 +1112,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - ##plt.plot(z) - ##plt.show() - cx_main, cy_main = find_center_of_contours(contours_main) - cx_head, cy_head = find_center_of_contours(contours_head) - - peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + peaks_neg_new = np.array([0] + + # peaks can be beyond box due to padding and smoothing + [peak for peak in peaks_neg + if 0 < peak and peak < textline_mask.shape[0]] + + [textline_mask.shape[0]]) # offset from bbox of mask peaks_neg_new += y_ref + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) @@ -1244,6 +1146,23 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T + # if indexes_in.size: + # img = textline_mask.copy() + # plt.imshow(img) + # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) + # xrange = np.arange(0, img.shape[1], 50) + # yrange = np.arange(0, img.shape[0], 50) + # plt.gca().set_xticks(xrange, xrange + x_ref) + # plt.gca().set_yticks(yrange, yrange + y_ref) + # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): + # cnt = (contours_main if type_ == 1 else contours_head)[idx] + # col = 'red' if type_ == 1 else 'blue' + # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.text(cx - x_ref, cy - y_ref, str(idx), c=col) + # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) + # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) + # plt.show() + sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) @@ -1251,8 +1170,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) - # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) + assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head) + assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head))) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) @@ -1286,47 +1205,42 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( float(num_col_classifier)) if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10: args_hor=np.arange(len(slope_lines_hor)) - all_args_uniq=contours_in_same_horizon(cy_main_hor) - #print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq)>0: - if type(all_args_uniq[0]) is list: - special_separators=[] - contours_new=[] - for dd in range(len(all_args_uniq)): - merged_all=None - some_args=args_hor[all_args_uniq[dd]] - some_cy=cy_main_hor[all_args_uniq[dd]] - some_x_min=x_min_main_hor[all_args_uniq[dd]] - some_x_max=x_max_main_hor[all_args_uniq[dd]] + sep_pairs=contours_in_same_horizon(cy_main_hor) + if len(sep_pairs): + special_separators=[] + contours_new=[] + for pair in sep_pairs: + merged_all=None + some_args=args_hor[pair] + some_cy=cy_main_hor[pair] + some_x_min=x_min_main_hor[pair] + some_x_max=x_max_main_hor[pair] - #img_in=np.zeros(separators_closeup_n[:,:,2].shape) - #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + #img_in=np.zeros(separators_closeup_n[:,:,2].shape) + #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') + diff_x_some=some_x_max-some_x_min + for jv in range(len(some_args)): + img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) + if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): + img_p_in[int(np.mean(some_cy))-5: + int(np.mean(some_cy))+5, + int(np.min(some_x_min)): + int(np.max(some_x_max)) ]=1 + sum_dis=dist_x_hor[some_args].sum() + diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args], - # dist_x_hor[some_args].sum(), - # np.min(x_min_main_hor[some_args]), - # np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ), - # np.std( dist_x_hor[some_args] ), - # np.var( dist_x_hor[some_args] ),'jalibdiha') - special_separators.append(np.mean(cy_main_hor[some_args])) - else: - img_p_in=img_in_hor - special_separators=[] + if (diff_max_min_uniques > sum_dis and + sum_dis / float(diff_max_min_uniques) > 0.85 and + diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and + np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + # print(dist_x_hor[some_args], + # dist_x_hor[some_args].sum(), + # np.min(x_min_main_hor[some_args]), + # np.max(x_max_main_hor[some_args]),'jalibdi') + # print(np.mean( dist_x_hor[some_args] ), + # np.std( dist_x_hor[some_args] ), + # np.var( dist_x_hor[some_args] ),'jalibdiha') + special_separators.append(np.mean(cy_main_hor[some_args])) else: img_p_in=img_in_hor special_separators=[] @@ -1353,189 +1267,176 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): - t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:]==label_lines))*1 - separators_closeup[0:110,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:]=0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + separators_closeup = 1 * (region_pre_p == label_seps) + separators_closeup[0:110] = 0 + separators_closeup[-150:] = 0 kernel = np.ones((5,5),np.uint8) - separators_closeup=separators_closeup.astype(np.uint8) - separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) - separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) + separators_closeup = separators_closeup.astype(np.uint8) + separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) - separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) - separators_closeup_n=np.copy(separators_closeup) - separators_closeup_n=separators_closeup_n.astype(np.uint8) + separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned - separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] - separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 + separators_closeup_n_binary = separators_closeup_n.copy() - _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) - contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ - find_features_of_lines(contours_line_e) - dist_ye = y_max_main - y_min_main - args_e=np.arange(len(contours_line_e)) - args_hor_e=args_e[(dist_ye<=50) & - (dist_xe>=3*dist_ye)] - cnts_hor_e=[] - for ce in args_hor_e: - cnts_hor_e.append(contours_line_e[ce]) + # find horizontal lines by contour properties + contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_hor_e = [] + for cnt in contours_sep_e: + max_xe = cnt[:, 0, 0].max() + min_xe = cnt[:, 0, 0].min() + max_ye = cnt[:, 0, 1].max() + min_ye = cnt[:, 0, 1].min() + med_ye = int(np.median(cnt[:, 0, 1])) + dist_xe = max_xe - min_xe + dist_ye = max_ye - min_ye + if dist_ye <= 50 and dist_xe >= 3 * dist_ye: + cnts_hor_e.append(cnt) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - gray = cv2.bitwise_not(separators_closeup_n_binary) - gray=gray.astype(np.uint8) + # delete horizontal contours (leaving only the edges) + separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) + edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, + cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(edges) + vertical = np.copy(edges) - bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ - cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(bw) - vertical = np.copy(bw) - - cols = horizontal.shape[1] - horizontal_size = cols // 30 - # Create structure element for extracting horizontal lines through morphology operations + horizontal_size = horizontal.shape[1] // 30 + # find horizontal lines by morphology horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - # Apply morphology operations - horizontal = cv2.erode(horizontal, horizontalStructure) - horizontal = cv2.dilate(horizontal, horizontalStructure) - - kernel = np.ones((5,5),np.uint8) - horizontal = cv2.dilate(horizontal,kernel,iterations = 2) - horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) + # re-insert deleted horizontal contours horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) - rows = vertical.shape[0] - verticalsize = rows // 30 - # Create structure element for extracting vertical lines through morphology operations - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - # Apply morphology operations - vertical = cv2.erode(vertical, verticalStructure) - vertical = cv2.dilate(vertical, verticalStructure) - vertical = cv2.dilate(vertical,kernel,iterations = 1) + vertical_size = vertical.shape[0] // 30 + # find vertical lines by morphology + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) + vertical = cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) + vertical = cv2.dilate(vertical, kernel, iterations=1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - separators_closeup_new[:,:][vertical[:,:]!=0]=1 - separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_line_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_vers) + contours_sep_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_vers) - args=np.arange(len(slope_lines)) - args_ver=args[slope_lines==1] - dist_x_ver=dist_x[slope_lines==1] - y_min_main_ver=y_min_main[slope_lines==1] - y_max_main_ver=y_max_main[slope_lines==1] - x_min_main_ver=x_min_main[slope_lines==1] - x_max_main_ver=x_max_main[slope_lines==1] - cx_main_ver=cx_main[slope_lines==1] - dist_y_ver=y_max_main_ver-y_min_main_ver + args=np.arange(len(slope_seps)) + args_ver=args[slope_seps==1] + dist_x_ver=dist_x[slope_seps==1] + y_min_seps_ver=y_min_seps[slope_seps==1] + y_max_seps_ver=y_max_seps[slope_seps==1] + x_min_seps_ver=x_min_seps[slope_seps==1] + x_max_seps_ver=x_max_seps[slope_seps==1] + cx_seps_ver=cx_seps[slope_seps==1] + dist_y_ver=y_max_seps_ver-y_min_seps_ver len_y=separators_closeup.shape[0]/3.0 _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_hors) + contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_hors) - slope_lines_org_hor=slope_lines_org[slope_lines==0] - args=np.arange(len(slope_lines)) + slope_seps_org_hor=slope_seps_org[slope_seps==0] + args=np.arange(len(slope_seps)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_main-y_min_main) + dist_y=np.abs(y_max_seps-y_min_seps) - args_hor=args[slope_lines==0] - dist_x_hor=dist_x[slope_lines==0] - y_min_main_hor=y_min_main[slope_lines==0] - y_max_main_hor=y_max_main[slope_lines==0] - x_min_main_hor=x_min_main[slope_lines==0] - x_max_main_hor=x_max_main[slope_lines==0] - dist_y_hor=dist_y[slope_lines==0] - cy_main_hor=cy_main[slope_lines==0] + args_hor=args[slope_seps==0] + dist_x_hor=dist_x[slope_seps==0] + y_min_seps_hor=y_min_seps[slope_seps==0] + y_max_seps_hor=y_max_seps[slope_seps==0] + x_min_seps_hor=x_min_seps[slope_seps==0] + x_max_seps_hor=x_max_seps[slope_seps==0] + dist_y_hor=dist_y[slope_seps==0] + cy_seps_hor=cy_seps[slope_seps==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] - x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] - cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] - y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] - y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] + x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] + x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] + cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] + y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] + y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] + slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) - matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor - matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver - matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver - matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 - matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 - matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor - matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver - matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor - matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor - matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver - matrix_of_lines_ch[len(cy_main_hor):,9]=1 + matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int) + matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor + matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver + matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 + matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 + matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor + matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver + matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor + matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor + matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver + matrix_of_seps_ch[len(cy_seps_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ + _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) - matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) - args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) + args_head = np.arange(len(cy_head)) + matrix_l_n[:, 0] = args_head + matrix_l_n[:, 2] = x_min_head + matrix_l_n[:, 3] = x_max_head + matrix_l_n[:, 4] = dist_x_head + matrix_l_n[:, 5] = cy_head + matrix_l_n[:, 6] = y_min_head + matrix_l_n[:, 7] = y_max_head + matrix_l_n[:, 8] = y_max_head - y_min_head + matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) + matrix_of_seps_ch = np.append( + matrix_of_seps_ch, matrix_l_n, axis=0) - matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 - matrix_of_lines_ch=np.copy(matrix_l_n) + # ensure no seps are out of bounds + matrix_of_seps_ch[:, 1] = np.maximum(np.minimum(matrix_of_seps_ch[:, 1], region_pre_p.shape[1]), 0) + matrix_of_seps_ch[:, 2] = np.maximum(matrix_of_seps_ch[:, 2], 0) + matrix_of_seps_ch[:, 3] = np.minimum(matrix_of_seps_ch[:, 3], region_pre_p.shape[1]) + matrix_of_seps_ch[:, 5] = np.maximum(np.minimum(matrix_of_seps_ch[:, 5], region_pre_p.shape[0]), 0) + matrix_of_seps_ch[:, 6] = np.maximum(matrix_of_seps_ch[:, 6], 0) + matrix_of_seps_ch[:, 7] = np.minimum(matrix_of_seps_ch[:, 7], region_pre_p.shape[0]) + + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & + (x_max_seps_hor>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, special_separators) - cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & - (x_max_main_hor>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) if contours_h is not None: - try: - cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & - (x_max_main_head>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) - except: - pass - args_cy_splitter=np.argsort(cy_main_splitters) - cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] + y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) + cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head) - splitter_y_new=[] - splitter_y_new.append(0) - for i in range(len(cy_main_splitters_sort)): - splitter_y_new.append( cy_main_splitters_sort[i] ) - splitter_y_new.append(region_pre_p.shape[0]) - splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 - - args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] + cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) + splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] + big_part = 22 * region_pre_p.shape[0] // 100 # percent height regions_without_separators=return_regions_without_separators(region_pre_p) - length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - for itiles in args_big_parts: - regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - int(splitter_y_new[itiles+1]),:] + num_big_parts = 0 + for top, bot in pairwise(splitter_y_new): + if bot - top < big_part: + continue + num_big_parts += 1 try: - num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, + num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) + # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1543,20 +1444,45 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_col_fin=num_col peaks_neg_fin_fin=peaks_neg_fin - if len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)=500] peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n + return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, - matrix_of_lines_ch, + splitter_y_new, + regions_without_separators, + regions_with_separators, + matrix_of_seps_ch, num_col_classifier, erosion_hurts, tables, right2left_readingorder, logger=None): + """ + Iterate through the vertical parts of a page, each with its own set of columns, + and from the matrix of horizontal separators for that part, find an ordered + list of bounding boxes through all columns and regions. + + Arguments: + * splitter_y_new: the y coordinates separating the parts + * regions_without_separators: (text) region mask with separators suppressed; + (needed to find per-part columns and to combine separators if possible) + * regions_with_separators: (full) region map with separators suppressed; + (needed to elongate separators if possible) + * matrix_of_seps: type and coordinates of horizontal and vertical separators, + as well as headings + * num_col_classifier: predicted number of columns for the entire page + * erosion_hurts: bool + * tables: bool + * right2left_readingorder: whether to invert the default left-to-right order + + Returns: a tuple of + * the ordered list of bounding boxes + * a list of arrays: the x coordinates delimiting the columns for every page part + (according to splitter) + """ if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) @@ -1564,555 +1490,340 @@ def return_boxes_of_images_by_order_of_reading_new( logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators) #, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() + # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): + # minx, maxx, miny, maxy = box or (0, None, 0, None) + # img = regions_without_separators[miny:maxy, minx:maxx] + # plt.imshow(img) + # step = max(img.shape) // 10 + # xrange = np.arange(0, img.shape[1], step) + # yrange = np.arange(0, img.shape[0], step) + # ax = plt.gca() + # ax.set_xticks(xrange) + # ax.set_yticks(yrange) + # ax.set_xticklabels(xrange + minx) + # ax.set_yticklabels(yrange + miny) + # def format_coord(x, y): + # return 'x={:g}, y={:g}'.format(x + minx, y + miny) + # ax.format_coord = format_coord + # if title: + # plt.title(title) + # if rectangles: + # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): + # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # if rectangles_showidx: + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i), c='r') + # plt.show() + # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") + boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - for i in range(len(splitter_y_new)-1): - #print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & - (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] + height_tot, width_tot = regions_without_separators.shape + big_part = 22 * height_tot // 100 # percent height + _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) + for top, bot in pairwise(splitter_y_new): + # print("%d:%d" % (top, bot), 'i') + # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) + matrix_new = matrix_of_seps_ch[(matrix_of_seps_ch[:,6] >= top) & + (matrix_of_seps_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= - # 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): - if True: - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], - num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) - except: - peaks_neg_fin=[] - num_col = 0 - try: - if (len(peaks_neg_fin)+1)= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7., + unbalanced=True) + except: + peaks_neg_fin=[] + num_col = 0 + try: + if ((len(peaks_neg_fin) + 1 < num_col_classifier or + num_col_classifier == 6) and + # we do not expect to get all columns in small parts (headings etc.): + bot - top >= big_part): + # found too few columns here + #print('burda') + peaks_neg_fin_org = np.copy(peaks_neg_fin) + #print("peaks_neg_fin_org", peaks_neg_fin_org) + if len(peaks_neg_fin)==0: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + num_col_classifier, tables, multiplier=3., unbalanced=True) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] + + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) + except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 + else: + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - for p_n in peaks_neg_fin: - peaks_neg_fin_early.append(p_n) - peaks_neg_fin_early.append(regions_without_separators.shape[1]-1) - #print(peaks_neg_fin_early,'burda2') - peaks_neg_fin_rev=[] - for i_n in range(len(peaks_neg_fin_early)-1): - #print(i_n,'i_n') - #plt.plot(regions_without_separators[splitter_y_new[i]: - # splitter_y_new[i+1], - # peaks_neg_fin_early[i_n]: - # peaks_neg_fin_early[i_n+1]].sum(axis=0) ) - #plt.show() - try: - num_col, peaks_neg_fin1 = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], - peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], - num_col_classifier,tables, multiplier=7.) - except: - peaks_neg_fin1=[] - try: - num_col, peaks_neg_fin2 = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], - peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]], - num_col_classifier,tables, multiplier=5.) - except: - peaks_neg_fin2=[] + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - if len(peaks_neg_fin1)>=len(peaks_neg_fin2): - peaks_neg_fin=list(np.copy(peaks_neg_fin1)) - else: - peaks_neg_fin=list(np.copy(peaks_neg_fin2)) - peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - - if i_n!=(len(peaks_neg_fin_early)-2): - peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) - #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin - - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) - else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) - - #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] - arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some - x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - - peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) - peaks_neg_tot_tables.append(peaks_neg_tot) - - reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ - y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ - y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) - - all_columns = set(range(len(peaks_neg_tot) - 1)) - if ((reading_order_type==1) or - (reading_order_type==0 and - (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): - try: - y_grenze = splitter_y_new[i] + 300 - #check if there is a big separator in this y_mains_sep_ohne_grenzen - - args_early_ys=np.arange(len(y_type_2)) - #print(args_early_ys,'args_early_ys') - #print(splitter_y_new[i], splitter_y_new[i+1]) - - x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - if len(y_type_2_up) > 0: - y_main_separator_up = y_type_2_up [(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - args_main_to_deleted = args_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_diff_main_separator_up) > 0: - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], y_diff_main_separator_up.max()]) - splitter_y_new[i] = y_diff_main_separator_up.max() - - #print(splitter_y_new[i],'splitter_y_new[i]') - y_type_2 = y_type_2[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_diff_type_2 = y_diff_type_2[args_to_be_kept] - - #print('galdiha') - y_grenze = splitter_y_new[i] + 200 - args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & - (y_type_2 <= y_grenze)] - #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) - - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - elif len(y_diff_main_separator_up)==0: - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in2') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - #print(args_early_ys,'args_early_ys') - #print(args_up,'args_up') - args_to_be_kept2=np.array(list( set(args_early_ys) - set(args_up) )) - - #print(args_to_be_kept2,'args_to_be_kept2') - #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2)) - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - - #int(splitter_y_new[i]) - y_lines_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: - if reading_order_type==1: - y_lines_by_order.append(splitter_y_new[i]) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_type_2)) - #ind_args=np.array(ind_args) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) - else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - columns_covered_by_with_child_no_mothers = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list( - all_columns - columns_covered_by_with_child_no_mothers) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) - ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) - for i_s_nc in columns_not_covered_child_no_mother: - if i_s_nc in x_start_with_child_without_mother: - x_end_biggest_column = \ - x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] - args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & - (x_ending==x_end_biggest_column)] - y_column_nc = y_type_2[args_all_biggest_lines] - x_start_column_nc = x_starting[args_all_biggest_lines] - x_end_column_nc = x_ending[args_all_biggest_lines] - y_column_nc = np.sort(y_column_nc) - for i_c in range(len(y_column_nc)): - if i_c==(len(y_column_nc)-1): - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - else: - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - - x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc - if len(x_diff_all_between_nm_wc)>0: - biggest=np.argmax(x_diff_all_between_nm_wc) - - columns_covered_by_mothers = set() - for dj in range(len(x_starting_all_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - - should_longest_line_be_extended=0 - if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != child_columns): - should_longest_line_be_extended=1 - index_lines_so_close_to_top_separator = \ - np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & - (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] - if len(index_lines_so_close_to_top_separator) > 0: - indexes_remained_after_deleting_closed_lines= \ - np.array(list(set(list(range(len(y_all_between_nm_wc)))) - - set(list(index_lines_so_close_to_top_separator)))) - if len(indexes_remained_after_deleting_closed_lines) > 0: - y_all_between_nm_wc = \ - y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) - - if len(x_diff_all_between_nm_wc) > 0: - try: - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - except: - logger.exception("cannot append") - - y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) - #print(len(y_type_2)) - y_column=y_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) - else: - #print(column,'column') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) - - for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) - #print('burda2') - #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_itself, - y_down]) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], splitter_y_new[i+1]]) - else: - y_lines_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_lines_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_lines_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) - - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev else: - columns_not_covered = list(all_columns) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) - ind_args = np.arange(len(y_type_2)) - - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - ind_args_in_col=np.array(ind_args_in_col) - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + # elongate horizontal separators+headings as much as possible without overlap + args_nonver = matrix_new[:, 9] != 1 + for i in np.flatnonzero(args_nonver): + xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] + cut = regions_with_separators[ymin: ymax] + # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) + starting = xmin - peaks_neg_tot + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = xmax - peaks_neg_tot + max_end = np.flatnonzero(ending <= 0)[0] # first right-of + # skip elongation unless this is already a multi-column separator/heading: + if not max_end - min_start > 1: + continue + # is there anything left of min_start? + for j in range(min_start): + # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) + if not np.any(cut[:, peaks_neg_tot[j]: xmin]): + # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) + matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column + break + # is there anything right of max_end? + for j in range(len(peaks_neg_tot) - 1, max_end, -1): + # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j)) + if not np.any(cut[:, xmax: peaks_neg_tot[j]]): + # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j]) + matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column + break - for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) + args_hor = matrix_new[:, 9] == 0 + x_min_hor_some = matrix_new[:, 2][args_hor] + x_max_hor_some = matrix_new[:, 3][args_hor] + y_min_hor_some = matrix_new[:, 6][args_hor] + y_max_hor_some = matrix_new[:, 7][args_hor] + cy_hor_some = matrix_new[:, 5][args_hor] - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) + args_head = matrix_new[:, 9] == 2 + x_min_hor_head = matrix_new[:, 2][args_head] + x_max_hor_head = matrix_new[:, 3][args_head] + y_min_hor_head = matrix_new[:, 6][args_head] + y_max_hor_head = matrix_new[:, 7][args_head] + cy_hor_head = matrix_new[:, 5][args_head] - for column in range(x_start_itself, x_end_itself+1): - #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) - #print('burda2') - #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') + # split headings at toplines (y_min_head) and baselines (y_max_head) + # instead of merely adding their center (cy_head) as horizontal separator + # (x +/- 30px to avoid crossing col peaks by accident) + x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) + x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) + y_min_hor_some = np.append(y_min_hor_some, # toplines + np.concatenate((y_min_hor_head - 2, + y_max_hor_head - 0))) + y_max_hor_some = np.append(y_max_hor_some, # baselines + np.concatenate((y_min_hor_head + 0, + y_max_hor_head + 2))) + cy_hor_some = np.append(cy_hor_some, # centerlines + np.concatenate((y_min_hor_head - 1, + y_max_hor_head + 1))) + + # analyse connected components of regions to gain additional separators + # and prepare a map for cross-column boxes + ccounts = np.bincount(ccomps[top: bot].flatten()) + col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(), + minlength=ccounts.size) + for left, right in pairwise(peaks_neg_tot)]) + labelcolmap = dict() + for label, label_count in enumerate(ccounts): + if not label: + continue + label_left, label_top, label_width, label_height, label_area = cstats[label] + # if label_count < 0.9 * label_area: + # # mostly not in this part of the page + # continue + if label_count < 0.01 * (top - bot) * width_tot: + continue + #assert np.sum(col_ccounts[:, label]) == label_count + label_right = label_left + label_width + label_bot = label_top + label_height + label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1 + label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0] + # store as dict for multi-column boxes: + for start in range(label_start, label_end): + labelcolmap.setdefault(start, list()).append( + (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label]))) + # make additional separators: + if label_end - label_start < 2: + continue + if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2: + continue + x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2) + x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2) + y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot]) + y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2]) + cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1]) + + if right2left_readingorder: + x_max_hor_some = width_tot - x_min_hor_some + x_min_hor_some = width_tot - x_max_hor_some + + x_starting, x_ending, y_min, y_mid, y_max = return_multicol_separators_x_start_end( + regions_without_separators, peaks_neg_tot, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some) + # dbg_plt([0, None, top, bot], "non-empty multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_min - top, y_max - top)), True) + + # core algorithm: + # 1. iterate through multi-column separators, pre-ordered by their y coord + # 2. for each separator, iterate from its starting to its ending column + # 3. in each starting column, determine the next downwards separator, + # 4. if there is none, then fill up the column to the bottom; + # otherwise, fill up to that next separator + # 5. moreover, determine the next rightward column that would not cut through + # any regions, advancing to that column, and storing a new in-order bbox + # for that down/right span + # 6. if there was a next separator, and it ends no further than the current one, + # then recurse on that separator from step 1, then continue (with the next + # column for the current separator) at step 2, or (with the next separator + # in order) at step 1 + args = list(range(len(y_mid))) + while len(args): + cur = args[0] + args = args[1:] + # print("iter", cur, y_mid[cur], "%d:%d" % (x_starting[cur], x_ending[cur])) + def get_span(start, y_top, y_bot): + # for last, l_top, l_bot, l_count in labelcolmap.get(start, []): + # if y_top < l_bot and y_bot > l_top and last > start + 1: + # width = (peaks_neg_tot[last] - peaks_neg_tot[start]) + # print("span", start, last, l_top, l_bot, l_count, + # "box area", (y_bot - y_top) * width, + # "label area", (min(y_bot, l_bot) - max(y_top, l_top)) * width, + # "box height", (y_bot - y_top), + # "label height", sum(regions_without_separators[ + # y_top: y_bot, peaks_neg_tot[start + 1]])) + return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) + # yield the right-most column that does not cut through + # any regions in this horizontal span + if y_top < l_bot and y_bot > l_top + # Ignore if it ends here, anyway + and last > start + 1 + # Ensure this is not just a tiny region near larger regions + and l_count > 0.1 * max(l_count2 for _, l_top2, l_bot2, l_count2 in labelcolmap[start] + if y_top < l_bot2 and y_bot > l_top2) + # or just a small cut of the respective region + # (i.e. box should cover at least 10% of the label). + and ((min(y_bot, l_bot) - max(y_top, l_top)) * + (peaks_neg_tot[last] - peaks_neg_tot[start])) > 0.1 * l_count + # But do allow cutting tiny passages with less 10% of height + # (i.e. label is already almost separated by columns) + and sum(regions_without_separators[ + y_top: y_bot, peaks_neg_tot[start + 1]]) > 0.1 * (y_bot - y_top)), + # Otherwise advance only 1 column. + default=start + 1) + def add_sep(cur): + column = x_starting[cur] + while column < x_ending[cur]: + nxt = np.flatnonzero((y_mid[cur] < y_mid) & + (column >= x_starting) & + (column < x_ending)) + if len(nxt): + nxt = nxt[0] + # print("column", column) + last = get_span(column, y_max[cur], y_min[nxt]) + last = min(last, x_ending[nxt], x_ending[cur]) + # print("nxt", nxt, y_mid[nxt], "%d:%d" % (column, last)) boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_itself, - y_down]) - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + peaks_neg_tot[last], + y_mid[cur], + y_mid[nxt]]) + # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) + column = last + if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + # child – recur + # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) + args.remove(nxt) + add_sep(nxt) + else: + # print("column", column) + last = get_span(column, y_max[cur], bot) + # print("bot", bot, "%d:%d" % (column, last)) + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[last], + y_mid[cur], + bot]) + # dbg_plt(boxes[-1], "non-recursive column %d box [%d]" % (column, len(boxes))) + column = last + add_sep(cur) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = regions_without_separators.shape[1] - boxes[i][1] - x_end_new = regions_without_separators.shape[1] - boxes[i][0] + x_start_new = width_tot - boxes[i][1] + x_end_new = width_tot - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new peaks_neg_tot_tables = peaks_neg_tot_tables_new + # show final xy-cut + # dbg_plt(None, "final XY-Cut", boxes, True) + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f304db2..052688c 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -14,21 +14,16 @@ from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new def contours_in_same_horizon(cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(np.array(all_args, dtype=object)) + """ + Takes an array of y coords, identifies all pairs among them + which are close to each other, and returns all such pairs + by index into the array. + """ + sort = np.argsort(cy_main_hor) + same = np.diff(cy_main_hor[sort] <= 20) + # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) + same = np.flatnonzero(same) + return np.stack((sort[:-1][same], sort[1:][same])).T def find_contours_mean_y_diff(contours_main): M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 9c3456a..f8aff62 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -89,7 +89,7 @@ class EynollahXmlWriter: def build_pagexml_no_full_layout( self, found_polygons_text_region, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, @@ -102,7 +102,7 @@ class EynollahXmlWriter: **kwargs): return self.build_pagexml_full_layout( found_polygons_text_region, [], - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, [], all_box_coord, [], found_polygons_text_region_img, found_polygons_tables, [], @@ -116,7 +116,7 @@ class EynollahXmlWriter: def build_pagexml_full_layout( self, found_polygons_text_region, found_polygons_text_region_h, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals,