From 4475183f08d2c25eb90deb04bda552930abd4ba0 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 14 Nov 2025 03:39:36 +0100
Subject: [PATCH] improve rules governing column split

- reduce `sigma` for smoothing of input to `find_peaks`
  (so we get deeper gaps between columns)
- allow column boundaries closer to the margins
  (50 instead of 100 or 200 px, 170 instead of 370 px)
- allow column boundaries closer to each other
  (300 instead of 400 px)
- add a secondary `grenze` criterion for depth of gap
  (`multiplier * (5 + lowest minimum)`, used if that is smaller
   than the old criterion relative to lowest maximum)
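- for calls to `find_num_col` within parts of a page,
  do allow unbalanced column boundaries
  (new keyword argument `unbalanced`, default `False`, which
   keeps the old behaviour; with `unbalanced=True`, the subsequent
   reduction from 3 to 2 columns is skipped as well, since those
   checks are now chained via `elif`)
- in `order_of_regions`, replace the commented-out sanity checks
  with active assertions on the resulting indices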
---
 src/eynollah/utils/__init__.py | 113 +++++++++++++++++++--------------
 1 file changed, 66 insertions(+), 47 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index e00004f..570eefe 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -241,7 +241,7 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
     z = gaussian_filter1d(regions_without_separators_0, sigma_)
     return np.std(z)
 
-def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8):
+def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False):
     if not regions_without_separators.any():
         return 0, []
     regions_without_separators_0 = regions_without_separators.sum(axis=0)
@@ -249,13 +249,15 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     # ax1.imshow(regions_without_separators, aspect="auto")
     # ax2.plot(regions_without_separators_0)
     # plt.show()
-    sigma_ = 35  # 70#35
+    sigma_ = 25  # 70#35
     meda_n_updown = regions_without_separators_0[::-1]
     first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0)
     last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0)
     last_nonzero = len(regions_without_separators_0) - last_nonzero
-    last_nonzero = last_nonzero - 100
-    first_nonzero = first_nonzero + 200
+    last_nonzero = last_nonzero - 50 #- 100
+    first_nonzero = first_nonzero + 50 #+ 200
+    last_offmargin = len(regions_without_separators_0) - 170 #370
+    first_offmargin = 170 #370
     y = regions_without_separators_0  # [first_nonzero:last_nonzero]
     y_help = np.zeros(len(y) + 20)
     y_help[10 : len(y) + 10] = y
@@ -285,26 +287,34 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     # ax2.axvline(last_nonzero, label="last nonzero")
     # ax2.text(first_nonzero, 0, "first nonzero", rotation=90)
     # ax2.text(last_nonzero, 0, "last nonzero", rotation=90)
-    # ax2.axvline(370, label="first")
-    # ax2.axvline(len(y) - 370, label="last")
-    # ax2.text(370, 0, "first", rotation=90)
-    # ax2.text(len(y) - 370, 0, "last", rotation=90)
+    # ax2.axvline(first_offmargin, label="first offmargin")
+    # ax2.axvline(last_offmargin, label="last offmargin")
+    # ax2.text(first_offmargin, 0, "first offmargin", rotation=90)
+    # ax2.text(last_offmargin, 0, "last offmargin", rotation=90)
     # plt.show()
     peaks_neg = peaks_neg - 10 - 10
 
+    # print("raw peaks", peaks)
     peaks = peaks[(peaks > 0.06 * len(y)) &
                   (peaks < 0.94 * len(y))]
+    # print("non-marginal peaks", peaks)
     interest_pos = z[peaks]
+    # print("interest_pos", interest_pos)
     interest_pos = interest_pos[interest_pos > 10]
     if not interest_pos.any():
         return 0, []
+
     # plt.plot(z)
     # plt.show()
+    #print("raw peaks_neg", peaks_neg)
     peaks_neg = peaks_neg[(peaks_neg > first_nonzero) &
                           (peaks_neg < last_nonzero)]
-    peaks_neg = peaks_neg[(peaks_neg > 370) &
-                          (peaks_neg < len(y) - 370)]
+    #print("non-zero peaks_neg", peaks_neg)
+    peaks_neg = peaks_neg[(peaks_neg > first_offmargin) &
+                          (peaks_neg < last_offmargin)]
+    #print("non-marginal peaks_neg", peaks_neg)
     interest_neg = z[peaks_neg]
+    #print("interest_neg", interest_neg)
     if not interest_neg.any():
         return 0, []
 
@@ -317,10 +327,14 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
 
     min_peaks_neg = 0  # np.min(interest_neg)
 
+    # cutoff criterion: fixed fraction of lowest column height
     dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier
     grenze = min_peaks_pos - dis_talaei
     #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0
 
+    # extra criterion: fixed multiple of lowest gap height
+    grenze = min(grenze, multiplier * (5 + np.min(interest_neg)))
+
     # print(interest_neg,'interest_neg')
     # print(grenze,'grenze')
     # print(min_peaks_pos,'min_peaks_pos')
@@ -356,18 +370,20 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     # print(peaks_neg_fin,'peaks_neg_fin')
     # print(num_col,'diz')
     # cancel if resulting split is highly unbalanced across available width
-    if ((num_col == 3 and
-        ((peaks_neg_fin[0] > 0.75 * len(y) and
-          peaks_neg_fin[1] > 0.75 * len(y)) or
-         (peaks_neg_fin[0] < 0.25 * len(y) and
-          peaks_neg_fin[1] < 0.25 * len(y)) or
-         (peaks_neg_fin[0] < 0.5 * len(y) - 200 and
-          peaks_neg_fin[1] < 0.5 * len(y)) or
-         (peaks_neg_fin[0] > 0.5 * len(y) + 200 and
-          peaks_neg_fin[1] > 0.5 * len(y)))) or
-        (num_col == 2 and
-         (peaks_neg_fin[0] > 0.75 * len(y) or
-          peaks_neg_fin[0] < 0.25 * len(y)))):
+    if unbalanced:
+        pass
+    elif ((num_col == 3 and
+           ((peaks_neg_fin[0] > 0.75 * len(y) and
+             peaks_neg_fin[1] > 0.75 * len(y)) or
+            (peaks_neg_fin[0] < 0.25 * len(y) and
+             peaks_neg_fin[1] < 0.25 * len(y)) or
+            (peaks_neg_fin[0] < 0.5 * len(y) - 200 and
+             peaks_neg_fin[1] < 0.5 * len(y)) or
+            (peaks_neg_fin[0] > 0.5 * len(y) + 200 and
+             peaks_neg_fin[1] > 0.5 * len(y)))) or
+          (num_col == 2 and
+           (peaks_neg_fin[0] > 0.75 * len(y) or
+            peaks_neg_fin[0] < 0.25 * len(y)))):
         num_col = 1
         peaks_neg_fin = []
 
@@ -376,7 +392,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     # filter out peaks that are too close (<400px) to each other:
     # among each group, pick the position with smallest amount of text
     diff_peaks = np.abs(np.diff(peaks_neg_fin))
-    cut_off = 400
+    cut_off = 300 #400
     peaks_neg_true = []
     forest = []
     # print(len(peaks_neg_fin),'len_')
@@ -401,30 +417,32 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     #print(peaks_neg_true, "peaks_neg_true")
     ##print(num_col,'early')
     # cancel if resulting split is highly unbalanced across available width
-    if ((num_col == 3 and
-        ((peaks_neg_true[0] > 0.75 * len(y) and
-          peaks_neg_true[1] > 0.75 * len(y)) or
-         (peaks_neg_true[0] < 0.25 * len(y) and
-          peaks_neg_true[1] < 0.25 * len(y)) or
-         (peaks_neg_true[0] < 0.5 * len(y) - 200 and
-          peaks_neg_true[1] < 0.5 * len(y)) or
-         (peaks_neg_true[0] > 0.5 * len(y) + 200 and
-          peaks_neg_true[1] > 0.5 * len(y)))) or
-        (num_col == 2 and
-         (peaks_neg_true[0] > 0.75 * len(y) or
-          peaks_neg_true[0] < 0.25 * len(y)))):
+    if unbalanced:
+        pass
+    elif ((num_col == 3 and
+           ((peaks_neg_true[0] > 0.75 * len(y) and
+             peaks_neg_true[1] > 0.75 * len(y)) or
+            (peaks_neg_true[0] < 0.25 * len(y) and
+             peaks_neg_true[1] < 0.25 * len(y)) or
+            (peaks_neg_true[0] < 0.5 * len(y) - 200 and
+             peaks_neg_true[1] < 0.5 * len(y)) or
+            (peaks_neg_true[0] > 0.5 * len(y) + 200 and
+             peaks_neg_true[1] > 0.5 * len(y)))) or
+          (num_col == 2 and
+           (peaks_neg_true[0] > 0.75 * len(y) or
+            peaks_neg_true[0] < 0.25 * len(y)))):
         num_col = 1
         peaks_neg_true = []
-    if (num_col == 3 and
-        (peaks_neg_true[0] < 0.75 * len(y) and
-         peaks_neg_true[0] > 0.25 * len(y) and
-         peaks_neg_true[1] > 0.80 * len(y))):
+    elif (num_col == 3 and
+          (peaks_neg_true[0] < 0.75 * len(y) and
+           peaks_neg_true[0] > 0.25 * len(y) and
+           peaks_neg_true[1] > 0.80 * len(y))):
         num_col = 2
         peaks_neg_true = [peaks_neg_true[0]]
-    if (num_col == 3 and
-        (peaks_neg_true[1] < 0.75 * len(y) and
-         peaks_neg_true[1] > 0.25 * len(y) and
-         peaks_neg_true[0] < 0.20 * len(y))):
+    elif (num_col == 3 and
+          (peaks_neg_true[1] < 0.75 * len(y) and
+           peaks_neg_true[1] > 0.25 * len(y) and
+           peaks_neg_true[0] < 0.20 * len(y))):
         num_col = 2
         peaks_neg_true = [peaks_neg_true[1]]
 
@@ -1151,8 +1169,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
 
     ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:]
 
-    # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head)
-    # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main)
+    assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head)
+    assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head)))
 
     return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type)
 
@@ -1518,7 +1536,8 @@ def return_boxes_of_images_by_order_of_reading_new(
                 regions_without_separators[top:bot],
                 # we do not expect to get all columns in small parts (headings etc.):
                 num_col_classifier if bot - top >= big_part else 1,
-                tables, multiplier=6. if erosion_hurts else 7.)
+                tables, multiplier=6. if erosion_hurts else 7.,
+                unbalanced=True)
         except:
             peaks_neg_fin=[]
             num_col = 0
@@ -1534,7 +1553,7 @@ def return_boxes_of_images_by_order_of_reading_new(
                 if len(peaks_neg_fin)==0:
                     num_col, peaks_neg_fin = find_num_col(
                         regions_without_separators[top:bot],
-                        num_col_classifier, tables, multiplier=3.)
+                        num_col_classifier, tables, multiplier=3., unbalanced=True)
                 #print(peaks_neg_fin,'peaks_neg_fin')
                 peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]