From adcea47bc05ccbdfa76c6059d5f66e4610e5ae41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:23:59 +0100 Subject: [PATCH] return_boxes_of_images_by_order_of_reading_new: always erode when passing the text region mask, apply erosion iff `not erosion_hurts` rather than only if there are more than 2 columns (consistent with `find_num_col`'s expectations and making it as easy to find the column gaps on 1 and 2-column pages as on multi-column pages) --- src/eynollah/eynollah.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 46a1704..47198cb 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2356,7 +2356,6 @@ class Eynollah: img_only_regions_with_sep = (prediction_regions_org_y == 1).astype(np.uint8) try: img_only_regions = cv2.erode(img_only_regions_with_sep[:,:], KERNEL, iterations=20) - _, _ = find_num_col(img_only_regions, num_col_classifier, self.tables, multiplier=6.0) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1))) prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -3138,7 +3137,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) - if num_col_classifier >= 3: + if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) @@ -3289,21 +3288,16 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = 
find_number_of_columns_in_document( - text_regions_p_d, num_col_classifier, self.tables, label_seps) - - if num_col_classifier>=3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: + if not erosion_hurts: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:,:], KERNEL, iterations=6) - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + else: + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) + if not erosion_hurts: regions_without_separators_d = regions_without_separators_d.astype(np.uint8) regions_without_separators_d = cv2.erode(regions_without_separators_d[:,:], KERNEL, iterations=6) - else: - pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( @@ -4149,6 +4143,7 @@ class Eynollah: self.run_enhancement(self.light_version) self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, " + f"scale {self.scale_x:.1f}x{self.scale_y:.1f}, " f"{self.dpi} DPI, {num_col_classifier} columns") if is_image_enhanced: self.logger.info("Enhancement applied") @@ -4682,7 +4677,7 @@ class Eynollah: _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( text_regions_p_d, num_col_classifier, self.tables, label_seps) - if num_col_classifier >= 3: + if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6)