From a2f43b8d69949d5e467fb6305a7621553d363f9f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 21 Apr 2026 01:06:41 +0200 Subject: [PATCH] simplify, add confidence for headings as well --- src/eynollah/eynollah.py | 33 ++++++++++++++------------------- src/eynollah/utils/__init__.py | 17 ++++++++++------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index df98c19..e87038c 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1712,19 +1712,20 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 3') t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + boxes, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, text_regions_p == label_seps_fl, matrix_of_seps_ch, - num_col_classifier, erosion_hurts, self.tables, self.right2left) + num_col_classifier, erosion_hurts, self.tables, self.right2left, + logger=self.logger) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) #print(time.time()-t_0_box,'time box in 3.1') - else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, text_regions_p_d == label_seps_fl, matrix_of_seps_ch_d, - num_col_classifier, erosion_hurts, self.tables, self.right2left) + num_col_classifier, erosion_hurts, self.tables, self.right2left, + logger=self.logger) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -2843,20 +2844,14 @@ class Eynollah: if not self.reading_order_machine_based: label_seps = 6 - if not self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) - else: - _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) - elif self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, label_seps) - else: - _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_d, num_col_classifier, self.tables, label_seps) + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps, + contours_h=None if self.headers_off else contours_only_text_parent_h) + else: + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps, + contours_h=None if self.headers_off else contours_only_text_parent_h_d_ordered) if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 76097ce..93d82e3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -899,16 +899,19 @@ def split_textregion_main_vs_head( h_o = regions_model_1.shape[0] w_o = regions_model_1.shape[1] zoom = 3 - regions_model_1 = cv2.resize(regions_model_1, (regions_model_1.shape[1] // zoom, - regions_model_1.shape[0] // zoom), + regions_model_1 = cv2.resize(regions_model_1, + (regions_model_1.shape[1] // zoom, + regions_model_1.shape[0] // zoom), interpolation=cv2.INTER_NEAREST) - regions_model_full = cv2.resize(regions_model_full, (regions_model_full.shape[1] // zoom, - regions_model_full.shape[0] // zoom), + regions_model_full = cv2.resize(regions_model_full, + (regions_model_full.shape[1] // zoom, + regions_model_full.shape[0] // zoom), interpolation=cv2.INTER_NEAREST) - contours_only_text_parent_z = [(cnt / zoom).astype(int) for cnt in contours_only_text_parent] + contours_only_text_parent_z = [contour // zoom + for contour in contours_only_text_parent] ### - cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = \ + _, _, x_min_main, x_max_main, y_min_main, y_max_main, _ = \ find_new_features_of_contours(contours_only_text_parent_z) length_con=x_max_main-x_min_main @@ -947,7 +950,7 @@ def split_textregion_main_vs_head( regions_model_1[(regions_model_1 == label_text) & (parent > 0)] = label_head_final contours_only_text_parent_head.append(contours_only_text_parent[ii]) - conf_contours_head.append(None) # why not conf_contours[ii], too? + conf_contours_head.append(conf_contours[ii]) if len(contours_only_text_parent_d_ordered): contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii])