find_num_col: simplify, add better plotting (but commented out)

This commit is contained in:
Robert Sachunsky 2025-10-20 13:20:12 +02:00
parent 184927fb54
commit 48761c3e12

View file

@ -396,16 +396,18 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8):
if not regions_without_separators.any(): if not regions_without_separators.any():
return 0, [] return 0, []
#plt.imshow(regions_without_separators)
#plt.show()
regions_without_separators_0 = regions_without_separators.sum(axis=0) regions_without_separators_0 = regions_without_separators.sum(axis=0)
##plt.plot(regions_without_separators_0) # fig, (ax1, ax2) = plt.subplots(2, sharex=True)
##plt.show() # ax1.imshow(regions_without_separators, aspect="auto")
# ax2.plot(regions_without_separators_0)
# plt.show()
sigma_ = 35 # 70#35 sigma_ = 35 # 70#35
meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] meda_n_updown = regions_without_separators_0[::-1]
first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0)
last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0)
last_nonzero = len(regions_without_separators_0) - last_nonzero last_nonzero = len(regions_without_separators_0) - last_nonzero
last_nonzero = last_nonzero - 100
first_nonzero = first_nonzero + 200
y = regions_without_separators_0 # [first_nonzero:last_nonzero] y = regions_without_separators_0 # [first_nonzero:last_nonzero]
y_help = np.zeros(len(y) + 20) y_help = np.zeros(len(y) + 20)
y_help[10 : len(y) + 10] = y y_help[10 : len(y) + 10] = y
@ -416,28 +418,44 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
z = gaussian_filter1d(y, sigma_) z = gaussian_filter1d(y, sigma_)
zneg = gaussian_filter1d(zneg, sigma_) zneg = gaussian_filter1d(zneg, sigma_)
peaks_neg, _ = find_peaks(zneg, height=0)
#plt.plot(zneg)
#plt.plot(peaks_neg, zneg[peaks_neg], 'rx')
#plt.show()
peaks, _ = find_peaks(z, height=0) peaks, _ = find_peaks(z, height=0)
peaks_neg, _ = find_peaks(zneg, height=0)
# _, (ax1, ax2) = plt.subplots(2, sharex=True)
# ax1.set_title("z")
# ax1.plot(z)
# ax1.scatter(peaks, z[peaks])
# ax1.axvline(0.06 * len(y), label="first")
# ax1.axvline(0.94 * len(y), label="last")
# ax1.text(0.06 * len(y), 0, "first", rotation=90)
# ax1.text(0.94 * len(y), 0, "last", rotation=90)
# ax1.axhline(10, label="minimum")
# ax1.text(0, 10, "minimum")
# ax2.set_title("zneg")
# ax2.plot(zneg)
# ax2.scatter(peaks_neg, zneg[peaks_neg])
# ax2.axvline(first_nonzero, label="first nonzero")
# ax2.axvline(last_nonzero, label="last nonzero")
# ax2.text(first_nonzero, 0, "first nonzero", rotation=90)
# ax2.text(last_nonzero, 0, "last nonzero", rotation=90)
# ax2.axvline(370, label="first")
# ax2.axvline(len(y) - 370, label="last")
# ax2.text(370, 0, "first", rotation=90)
# ax2.text(len(y) - 370, 0, "last", rotation=90)
# plt.show()
peaks_neg = peaks_neg - 10 - 10 peaks_neg = peaks_neg - 10 - 10
last_nonzero = last_nonzero - 100 peaks = peaks[(peaks > 0.06 * len(y)) &
first_nonzero = first_nonzero + 200 (peaks < 0.94 * len(y))]
peaks_neg = peaks_neg[(peaks_neg > first_nonzero) &
(peaks_neg < last_nonzero)]
peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) &
(peaks < 0.94 * regions_without_separators.shape[1])]
peaks_neg = peaks_neg[(peaks_neg > 370) &
(peaks_neg < (regions_without_separators.shape[1] - 370))]
interest_pos = z[peaks] interest_pos = z[peaks]
interest_pos = interest_pos[interest_pos > 10] interest_pos = interest_pos[interest_pos > 10]
if not interest_pos.any(): if not interest_pos.any():
return 0, [] return 0, []
# plt.plot(z) # plt.plot(z)
# plt.show() # plt.show()
peaks_neg = peaks_neg[(peaks_neg > first_nonzero) &
(peaks_neg < last_nonzero)]
peaks_neg = peaks_neg[(peaks_neg > 370) &
(peaks_neg < len(y) - 370)]
interest_neg = z[peaks_neg] interest_neg = z[peaks_neg]
if not interest_neg.any(): if not interest_neg.any():
return 0, [] return 0, []
@ -445,21 +463,28 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
min_peaks_pos = np.min(interest_pos) min_peaks_pos = np.min(interest_pos)
max_peaks_pos = np.max(interest_pos) max_peaks_pos = np.max(interest_pos)
if max_peaks_pos / min_peaks_pos >= 35: #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax')
if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35:
min_peaks_pos = np.mean(interest_pos) min_peaks_pos = np.mean(interest_pos)
min_peaks_neg = 0 # np.min(interest_neg) min_peaks_neg = 0 # np.min(interest_neg)
# print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax')
dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier
grenze = min_peaks_pos - dis_talaei grenze = min_peaks_pos - dis_talaei
# np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0
# print(interest_neg,'interest_neg') # print(interest_neg,'interest_neg')
# print(grenze,'grenze') # print(grenze,'grenze')
# print(min_peaks_pos,'min_peaks_pos') # print(min_peaks_pos,'min_peaks_pos')
# print(dis_talaei,'dis_talaei') # print(dis_talaei,'dis_talaei')
# print(peaks_neg,'peaks_neg') # print(peaks_neg,'peaks_neg')
# fig, (ax1, ax2) = plt.subplots(2, sharex=True)
# ax1.imshow(regions_without_separators, aspect="auto")
# ax2.plot(z)
# ax2.scatter(peaks_neg, z[peaks_neg])
# ax2.axhline(grenze, label="grenze")
# ax2.text(0, grenze, "grenze")
# plt.show()
interest_neg_fin = interest_neg[(interest_neg < grenze)] interest_neg_fin = interest_neg[(interest_neg < grenze)]
peaks_neg_fin = peaks_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)]
@ -479,46 +504,38 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
# print(peaks_neg_fin,'peaks_neg_fin') # print(peaks_neg_fin,'peaks_neg_fin')
# print(num_col,'diz') # print(num_col,'diz')
p_l = 0 # cancel if resulting split is highly unbalanced across available width
p_u = len(y) - 1 if ((num_col == 3 and
p_m = int(len(y) / 2.0) ((peaks_neg_fin[0] > 0.75 * len(y) and
p_g_l = int(len(y) / 4.0) peaks_neg_fin[1] > 0.75 * len(y)) or
p_g_u = len(y) - int(len(y) / 4.0) (peaks_neg_fin[0] < 0.25 * len(y) and
peaks_neg_fin[1] < 0.25 * len(y)) or
if num_col == 3: (peaks_neg_fin[0] < 0.5 * len(y) - 200 and
if ((peaks_neg_fin[0] > p_g_u and peaks_neg_fin[1] < 0.5 * len(y)) or
peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] > 0.5 * len(y) + 200 and
(peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] > 0.5 * len(y)))) or
peaks_neg_fin[1] < p_g_l) or (num_col == 2 and
(peaks_neg_fin[0] + 200 < p_m and (peaks_neg_fin[0] > 0.75 * len(y) or
peaks_neg_fin[1] < p_m) or peaks_neg_fin[0] < 0.25 * len(y)))):
(peaks_neg_fin[0] - 200 > p_m and num_col = 1
peaks_neg_fin[1] > p_m)): peaks_neg_fin = []
num_col = 1
peaks_neg_fin = []
if num_col == 2:
if (peaks_neg_fin[0] > p_g_u or
peaks_neg_fin[0] < p_g_l):
num_col = 1
peaks_neg_fin = []
##print(len(peaks_neg_fin)) ##print(len(peaks_neg_fin))
# filter out peaks that are too close (<400px) to each other:
# among each group, pick the position with smallest amount of text
diff_peaks = np.abs(np.diff(peaks_neg_fin)) diff_peaks = np.abs(np.diff(peaks_neg_fin))
cut_off = 400 cut_off = 400
peaks_neg_true = [] peaks_neg_true = []
forest = [] forest = []
# print(len(peaks_neg_fin),'len_') # print(len(peaks_neg_fin),'len_')
for i in range(len(peaks_neg_fin)): for i in range(len(peaks_neg_fin)):
if i == 0: if i == 0:
forest.append(peaks_neg_fin[i]) forest.append(peaks_neg_fin[i])
if i < len(peaks_neg_fin) - 1: if i < len(peaks_neg_fin) - 1:
if diff_peaks[i] <= cut_off: if diff_peaks[i] <= cut_off:
forest.append(peaks_neg_fin[i + 1]) forest.append(peaks_neg_fin[i + 1])
if diff_peaks[i] > cut_off: else:
# print(forest[np.argmin(z[forest]) ] ) # print(forest[np.argmin(z[forest]) ] )
if not isNaN(forest[np.argmin(z[forest])]): if not isNaN(forest[np.argmin(z[forest])]):
peaks_neg_true.append(forest[np.argmin(z[forest])]) peaks_neg_true.append(forest[np.argmin(z[forest])])
@ -530,68 +547,59 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
peaks_neg_true.append(forest[np.argmin(z[forest])]) peaks_neg_true.append(forest[np.argmin(z[forest])])
num_col = len(peaks_neg_true) + 1 num_col = len(peaks_neg_true) + 1
p_l = 0 #print(peaks_neg_true, "peaks_neg_true")
p_u = len(y) - 1
p_m = int(len(y) / 2.0)
p_quarter = int(len(y) / 5.0)
p_g_l = int(len(y) / 4.0)
p_g_u = len(y) - int(len(y) / 4.0)
p_u_quarter = len(y) - p_quarter
##print(num_col,'early') ##print(num_col,'early')
if num_col == 3: # cancel if resulting split is highly unbalanced across available width
if ((peaks_neg_true[0] > p_g_u and if ((num_col == 3 and
peaks_neg_true[1] > p_g_u) or ((peaks_neg_true[0] > 0.75 * len(y) and
(peaks_neg_true[0] < p_g_l and peaks_neg_true[1] > 0.75 * len(y)) or
peaks_neg_true[1] < p_g_l) or (peaks_neg_true[0] < 0.25 * len(y) and
(peaks_neg_true[0] < p_m and peaks_neg_true[1] < 0.25 * len(y)) or
peaks_neg_true[1] + 200 < p_m) or (peaks_neg_true[0] < 0.5 * len(y) - 200 and
(peaks_neg_true[0] - 200 > p_m and peaks_neg_true[1] < 0.5 * len(y)) or
peaks_neg_true[1] > p_m)): (peaks_neg_true[0] > 0.5 * len(y) + 200 and
num_col = 1 peaks_neg_true[1] > 0.5 * len(y)))) or
peaks_neg_true = [] (num_col == 2 and
elif (peaks_neg_true[0] < p_g_u and (peaks_neg_true[0] > 0.75 * len(y) or
peaks_neg_true[0] > p_g_l and peaks_neg_true[0] < 0.25 * len(y)))):
peaks_neg_true[1] > p_u_quarter): num_col = 1
peaks_neg_true = [peaks_neg_true[0]] peaks_neg_true = []
elif (peaks_neg_true[1] < p_g_u and if (num_col == 3 and
peaks_neg_true[1] > p_g_l and (peaks_neg_true[0] < 0.75 * len(y) and
peaks_neg_true[0] < p_quarter): peaks_neg_true[0] > 0.25 * len(y) and
peaks_neg_true = [peaks_neg_true[1]] peaks_neg_true[1] > 0.80 * len(y))):
num_col = 2
peaks_neg_true = [peaks_neg_true[0]]
if (num_col == 3 and
(peaks_neg_true[1] < 0.75 * len(y) and
peaks_neg_true[1] > 0.25 * len(y) and
peaks_neg_true[0] < 0.20 * len(y))):
num_col = 2
peaks_neg_true = [peaks_neg_true[1]]
if num_col == 2: # get rid of too narrow columns (not used)
if (peaks_neg_true[0] > p_g_u or # if np.count_nonzero(diff_peaks < 360):
peaks_neg_true[0] < p_g_l): # arg_help = np.arange(len(diff_peaks))
num_col = 1 # arg_help_ann = arg_help[diff_peaks < 360]
peaks_neg_true = [] # peaks_neg_fin_new = []
# for ii in range(len(peaks_neg_fin)):
# if ii in arg_help_ann:
# if interest_neg_fin[ii] < interest_neg_fin[ii + 1]:
# peaks_neg_fin_new.append(peaks_neg_fin[ii])
# else:
# peaks_neg_fin_new.append(peaks_neg_fin[ii + 1])
diff_peaks_abnormal = diff_peaks[diff_peaks < 360] # elif (ii - 1) not in arg_help_ann:
# peaks_neg_fin_new.append(peaks_neg_fin[ii])
if len(diff_peaks_abnormal) > 0: # else:
arg_help = np.arange(len(diff_peaks)) # peaks_neg_fin_new = peaks_neg_fin
arg_help_ann = arg_help[diff_peaks < 360]
peaks_neg_fin_new = []
for ii in range(len(peaks_neg_fin)):
if ii in arg_help_ann:
arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]])
if arg_min == 0:
peaks_neg_fin_new.append(peaks_neg_fin[ii])
else:
peaks_neg_fin_new.append(peaks_neg_fin[ii + 1])
elif (ii - 1) not in arg_help_ann:
peaks_neg_fin_new.append(peaks_neg_fin[ii])
else:
peaks_neg_fin_new = peaks_neg_fin
# plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(gaussian_filter1d(y, sigma_))
# plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot(peaks_neg_true,z[peaks_neg_true],'*')
# plt.plot([0,len(y)], [grenze,grenze]) # plt.plot([0,len(y)], [grenze,grenze])
# plt.show() # plt.show()
##print(len(peaks_neg_true)) ##print(len(peaks_neg_true))
#print(peaks_neg_true, "peaks_neg_true")
return len(peaks_neg_true), peaks_neg_true return len(peaks_neg_true), peaks_neg_true
def find_num_col_only_image(regions_without_separators, multiplier=3.8): def find_num_col_only_image(regions_without_separators, multiplier=3.8):