simplify, add confidence for headings as well

2026-08-03 09:22:32 +02:00 · 2026-04-21 01:06:41 +02:00 · 2026-04-21 01:06:41 +02:00 · a2f43b8d69
commit a2f43b8d69
parent 264b00f8ab
2 changed files with 24 additions and 26 deletions
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -1712,19 +1712,20 @@ class Eynollah:
        #print(time.time()-t_0_box,'time box in 3')
        t1 = time.time()
        if np.abs(slope_deskew) < SLOPE_THRESHOLD:
-            boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(
+            boxes, _ = return_boxes_of_images_by_order_of_reading_new(
                splitter_y_new, regions_without_separators,
                text_regions_p == label_seps_fl, matrix_of_seps_ch,
-                num_col_classifier, erosion_hurts, self.tables, self.right2left)
+                num_col_classifier, erosion_hurts, self.tables, self.right2left,
                logger=self.logger)
            boxes_d = None
            self.logger.debug("len(boxes): %s", len(boxes))
            #print(time.time()-t_0_box,'time box in 3.1')
        else:
-            boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(
+            boxes_d, _ = return_boxes_of_images_by_order_of_reading_new(
                splitter_y_new_d, regions_without_separators_d,
                text_regions_p_d == label_seps_fl, matrix_of_seps_ch_d,
-                num_col_classifier, erosion_hurts, self.tables, self.right2left)
+                num_col_classifier, erosion_hurts, self.tables, self.right2left,
                logger=self.logger)
            boxes = None
            self.logger.debug("len(boxes): %s", len(boxes_d))
@@ -2843,20 +2844,14 @@ class Eynollah:
            if not self.reading_order_machine_based:
                label_seps = 6
                if not self.headers_off:
                if np.abs(slope_deskew) < SLOPE_THRESHOLD:
-                        num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
+                    _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
-                            text_regions_p, num_col_classifier, self.tables,  label_seps, contours_only_text_parent_h)
+                        text_regions_p, num_col_classifier, self.tables, label_seps,
                        contours_h=None if self.headers_off else contours_only_text_parent_h)
                else:
                    _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
-                            text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered)
+                        text_regions_p_d, num_col_classifier, self.tables, label_seps,
-                elif self.headers_off:
+                        contours_h=None if self.headers_off else contours_only_text_parent_h_d_ordered)
                    if np.abs(slope_deskew) < SLOPE_THRESHOLD:
                        num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
                            text_regions_p, num_col_classifier, self.tables,  label_seps)
                    else:
                        _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
                            text_regions_p_d, num_col_classifier, self.tables, label_seps)
                if not erosion_hurts:
                    if np.abs(slope_deskew) < SLOPE_THRESHOLD:
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -899,16 +899,19 @@ def split_textregion_main_vs_head(
    h_o = regions_model_1.shape[0]
    w_o = regions_model_1.shape[1]
    zoom = 3
-    regions_model_1 = cv2.resize(regions_model_1, (regions_model_1.shape[1] // zoom,
+    regions_model_1 = cv2.resize(regions_model_1,
                                 (regions_model_1.shape[1] // zoom,
                                  regions_model_1.shape[0] // zoom),
                                 interpolation=cv2.INTER_NEAREST)
-    regions_model_full = cv2.resize(regions_model_full, (regions_model_full.shape[1] // zoom,
+    regions_model_full = cv2.resize(regions_model_full,
                                    (regions_model_full.shape[1] // zoom,
                                     regions_model_full.shape[0] // zoom),
                                    interpolation=cv2.INTER_NEAREST)
-    contours_only_text_parent_z = [(cnt / zoom).astype(int) for cnt in contours_only_text_parent]
+    contours_only_text_parent_z = [contour // zoom
                                   for contour in contours_only_text_parent]
    ###
-    cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = \
+    _, _, x_min_main, x_max_main, y_min_main, y_max_main, _ = \
        find_new_features_of_contours(contours_only_text_parent_z)
    length_con=x_max_main-x_min_main
@@ -947,7 +950,7 @@ def split_textregion_main_vs_head(
            regions_model_1[(regions_model_1 == label_text) & (parent > 0)] = label_head_final
            contours_only_text_parent_head.append(contours_only_text_parent[ii])
-            conf_contours_head.append(None) # why not conf_contours[ii], too?
+            conf_contours_head.append(conf_contours[ii])
            if len(contours_only_text_parent_d_ordered):
                contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii])
            all_box_coord_head.append(all_box_coord[ii])