From 4dd40c542b3384322febf821c0c761bc9cb4dc46 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 28 Nov 2025 18:07:15 +0100
Subject: [PATCH] find_num_col: add optional criterion - sum of vertical
 separators

when searching for gaps between text regions, consider the vertical
separator mask (if given): add the vertical sum of vertical separators
to the peak scores (making column detection more robust if the page is
still slightly skewed or partially obscured by multi-column regions,
provided that foreground separators are present)
---
 src/eynollah/utils/__init__.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 2ebf48a..0f2dac3 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -241,10 +241,13 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
     z = gaussian_filter1d(regions_without_separators_0, sigma_)
     return np.std(z)
 
-def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False):
+def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False, vertical_separators=None):
     if not regions_without_separators.any():
         return 0, []
+    if vertical_separators is None:
+        vertical_separators = np.zeros_like(regions_without_separators)
     regions_without_separators_0 = regions_without_separators.sum(axis=0)
+    vertical_separators_0 = vertical_separators.sum(axis=0)
     # fig, (ax1, ax2) = plt.subplots(2, sharex=True)
     # ax1.imshow(regions_without_separators, aspect="auto")
     # ax2.plot(regions_without_separators_0)
@@ -258,13 +261,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     first_nonzero = first_nonzero + 50 #+ 200
     last_offmargin = len(regions_without_separators_0) - 170 #370
     first_offmargin = 170 #370
+    x = vertical_separators_0
     y = regions_without_separators_0  # [first_nonzero:last_nonzero]
-    y_help = np.zeros(len(y) + 20)
-    y_help[10 : len(y) + 10] = y
-    x = np.arange(len(y))
-    zneg_rev = -y_help + np.max(y_help)
-    zneg = np.zeros(len(zneg_rev) + 20)
-    zneg[10 : len(zneg_rev) + 10] = zneg_rev
+    y_help = np.pad(y, (10, 10), constant_values=(0, 0))
+    zneg_rev = y.max() - y_help
+    zneg = np.pad(zneg_rev, (10, 10), constant_values=(0, 0))
+    x = gaussian_filter1d(x, sigma_)
     z = gaussian_filter1d(y, sigma_)
     zneg = gaussian_filter1d(zneg, sigma_)
 
@@ -333,6 +335,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0
 
     # extra criterion: fixed multiple of lowest gap height
+    # print("grenze", grenze, multiplier * (5 + np.min(interest_neg)))
     grenze = min(grenze, multiplier * (5 + np.min(interest_neg)))
 
     # print(interest_neg,'interest_neg')
@@ -341,16 +344,22 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     # print(dis_talaei,'dis_talaei')
     # print(peaks_neg,'peaks_neg')
     # fig, (ax1, ax2) = plt.subplots(2, sharex=True)
-    # ax1.imshow(regions_without_separators, aspect="auto")
+    # ax1.imshow(regions_without_separators + 5 * vertical_separators, aspect="auto")
     # ax2.plot(z, color='red', label='z')
     # ax2.plot(zneg[20:], color='blue', label='zneg')
+    # ax2.plot(x, color='green', label='vsep')
     # ax2.scatter(peaks_neg, z[peaks_neg], color='red')
     # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue')
-    # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos")
-    # ax2.axhline(grenze, color='blue', label="grenze")
+    # ax2.axhline(min_peaks_pos, color='red')
+    # ax2.axhline(grenze, color='blue')
+    # ax2.annotate("min_peaks_pos", xy=(0, min_peaks_pos), color='red')
+    # ax2.annotate("grenze", xy=(0, grenze), color='blue')
     # ax2.text(0, grenze, "grenze")
+    # ax2.legend()
     # plt.show()
 
+    # print("vsep", x[peaks_neg])
+    interest_neg = interest_neg - x[peaks_neg]
     interest_neg_fin = interest_neg[(interest_neg < grenze)]
     peaks_neg_fin = peaks_neg[(interest_neg < grenze)]