From 086c1880ac600e8d4b043fc8206298e9e964081d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 15 Oct 2025 12:24:21 +0200 Subject: [PATCH 01/32] binarization: add option `--overwrite`, skip existing outputs (also, simplify `run` and separate `run_single`) --- src/eynollah/cli.py | 16 ++++-- src/eynollah/sbb_binarize.py | 96 +++++++++++++++--------------------- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c9bad52..e4a24e4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level): type=click.Path(file_okay=True, dir_okay=True), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), help="Override log level globally to this", ) -def binarization(patches, model_dir, input_image, dir_in, output, log_level): +def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level): assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." binarizer = SbbBinarizer(model_dir) if log_level: - binarizer.log.setLevel(getLevelName(log_level)) - binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) + binarizer.logger.setLevel(getLevelName(log_level)) + binarizer.run(overwrite=overwrite, + use_patches=patches, + image_path=input_image, + output=output, + dir_in=dir_in) @main.command() diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 3716987..0eab2ae 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -25,7 +25,7 @@ class SbbBinarizer: def __init__(self, model_dir, logger=None): self.model_dir = model_dir - self.log = logger if logger else logging.getLogger('SbbBinarizer') + self.logger = logger if logger else logging.getLogger('SbbBinarizer') self.start_new_session() @@ -315,64 +315,46 @@ class SbbBinarizer: prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): - # print(dir_in,'dir_in') - if not dir_in: - if (image is not None and image_path is not None) or \ - (image is None and image_path is None): - raise ValueError("Must pass either a opencv2 image or an image_path") - if image_path is not None: - image = cv2.imread(image_path) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - - res = self.predict(model, image, use_patches) - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin - - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - if output: - cv2.imwrite(output, img_last) - return img_last + def run(self, image_path=None, output=None, dir_in=None, use_patches=False, overwrite=False): + if dir_in: + ls_imgs = [(os.path.join(dir_in, image_filename), + os.path.join(output, os.path.splitext(image_filename)[0] + '.png')) + 
for image_filename in filter(is_image_filename,
+                                                    os.listdir(dir_in))]
         else:
-            ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
-            for image_name in ls_imgs:
-                image_stem = image_name.split('.')[0]
-                print(image_name,'image_name')
-                image = cv2.imread(os.path.join(dir_in,image_name) )
-                img_last = 0
-                for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
-                    self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
+            ls_imgs = [(image_path, output)]
 
-                    res = self.predict(model, image, use_patches)
+        for input_path, output_path in ls_imgs:
+            print(input_path, 'image_name')
+            if os.path.exists(output_path):
+                if overwrite:
+                    self.logger.warning("will overwrite existing output file '%s'", output_path)
+                else:
+                    self.logger.warning("will skip input for existing output file '%s'", output_path)
+            image = cv2.imread(input_path)
+            result = self.run_single(image, use_patches)
+            cv2.imwrite(output_path, result)
 
-                    img_fin = np.zeros((res.shape[0], res.shape[1], 3))
-                    res[:, :][res[:, :] == 0] = 2
-                    res = res - 1
-                    res = res * 255
-                    img_fin[:, :, 0] = res
-                    img_fin[:, :, 1] = res
-                    img_fin[:, :, 2] = res
+    def run_single(self, image: np.ndarray, use_patches=False):
+        img_last = 0
+        for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
+            self.logger.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
 
-                    img_fin = img_fin.astype(np.uint8)
-                    img_fin = (res[:, :] == 0) * 255
-                    img_last = img_last + img_fin
+            res = self.predict(model, image, use_patches)
 
-                    kernel = np.ones((5, 5), np.uint8)
-                    img_last[:, :][img_last[:, :] > 0] = 255
-                    img_last = (img_last[:, :] == 0) * 255
-
-                    cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last)
+            img_fin = np.zeros((res.shape[0], res.shape[1], 3))
+            res[:, :][res[:, :] == 0] = 2
+            res = res - 1
+            res = res * 255
+            img_fin[:, :, 0] = res
+            img_fin[:, :, 1] = res
+            img_fin[:, :, 2] = res
+
+            img_fin = img_fin.astype(np.uint8)
+            img_fin = (res[:, :] == 0) * 255
+            img_last = img_last + img_fin
+
+        kernel = np.ones((5, 5), np.uint8)
+        img_last[:, :][img_last[:, :] > 0] = 255
+        img_last = (img_last[:, :] == 0) * 255
+        return img_last

From 184927fb5488f440948320ca97d716144da5012c Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 20 Oct 2025 13:16:57 +0200
Subject: [PATCH 02/32] `find_num_col`: re-sort peaks when cutting n-best
 `num_col_classifier`

---
 src/eynollah/utils/__init__.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 5ccb2af..7c47407 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -463,22 +463,19 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
         interest_neg_fin = interest_neg[(interest_neg < grenze)]
         peaks_neg_fin = peaks_neg[(interest_neg < grenze)]
 
-        # interest_neg_fin=interest_neg[(interest_neg<grenze)]
         if (num_col_classifier - ((len(interest_neg_fin)) + 1)) >= 3:
-            index_sort_interest_neg_fin= np.argsort(interest_neg_fin)
-            peaks_neg_sorted = np.array(peaks_neg)[index_sort_interest_neg_fin]
-            interest_neg_fin_sorted = np.array(interest_neg_fin)[index_sort_interest_neg_fin]
+            # found too few columns here: ignore 'grenze' and take the deepest N peaks
+            sort_by_height = np.argsort(interest_neg)[:num_col_classifier]
+            peaks_neg_fin = peaks_neg[sort_by_height]
+            interest_neg_fin = interest_neg[sort_by_height]
+            # print(peaks_neg_fin, "peaks_neg[sorted_by_height]")
+            sort_by_pos = np.argsort(peaks_neg_fin)
+            
peaks_neg_fin = peaks_neg_fin[sort_by_pos] + interest_neg_fin = interest_neg_fin[sort_by_pos] - if len(index_sort_interest_neg_fin)>=num_col_classifier: - peaks_neg_fin = list( peaks_neg_sorted[:num_col_classifier] ) - interest_neg_fin = list( interest_neg_fin_sorted[:num_col_classifier] ) - else: - peaks_neg_fin = peaks_neg[:] - interest_neg_fin = interest_neg[:] - - num_col = (len(interest_neg_fin)) + 1 + num_col = len(interest_neg_fin) + 1 # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') From 48761c3e127bfde488cc3ff6dd7edc97eb85bfd0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:20:12 +0200 Subject: [PATCH 03/32] `find_num_col`: simplify, add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 208 +++++++++++++++++---------------- 1 file changed, 108 insertions(+), 100 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7c47407..ce72df4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -396,16 +396,18 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): if not regions_without_separators.any(): return 0, [] - #plt.imshow(regions_without_separators) - #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) - ##plt.plot(regions_without_separators_0) - ##plt.show() + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(regions_without_separators_0) + # plt.show() sigma_ = 35 # 70#35 - meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] + meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero + last_nonzero = last_nonzero - 100 + first_nonzero = first_nonzero + 200 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -416,28 +418,44 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) - peaks_neg, _ = find_peaks(zneg, height=0) - #plt.plot(zneg) - #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') - #plt.show() peaks, _ = find_peaks(z, height=0) + peaks_neg, _ = find_peaks(zneg, height=0) + # _, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.set_title("z") + # ax1.plot(z) + # ax1.scatter(peaks, z[peaks]) + # ax1.axvline(0.06 * len(y), label="first") + # ax1.axvline(0.94 * len(y), label="last") + # ax1.text(0.06 * len(y), 0, "first", rotation=90) + # ax1.text(0.94 * len(y), 0, "last", rotation=90) + # ax1.axhline(10, label="minimum") + # ax1.text(0, 10, "minimum") + # ax2.set_title("zneg") + # ax2.plot(zneg) + # ax2.scatter(peaks_neg, zneg[peaks_neg]) + # ax2.axvline(first_nonzero, label="first nonzero") + # ax2.axvline(last_nonzero, label="last nonzero") + # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) + # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) + # ax2.axvline(370, label="first") + # ax2.axvline(len(y) - 370, label="last") + # ax2.text(370, 0, "first", rotation=90) + # ax2.text(len(y) - 370, 0, "last", rotation=90) + # plt.show() peaks_neg = peaks_neg - 10 - 10 - last_nonzero = last_nonzero - 100 - 
first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & - (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) & - (peaks < 0.94 * regions_without_separators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < (regions_without_separators.shape[1] - 370))] + peaks = peaks[(peaks > 0.06 * len(y)) & + (peaks < 0.94 * len(y))] interest_pos = z[peaks] interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] # plt.plot(z) # plt.show() + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & + (peaks_neg < last_nonzero)] + peaks_neg = peaks_neg[(peaks_neg > 370) & + (peaks_neg < len(y) - 370)] interest_neg = z[peaks_neg] if not interest_neg.any(): return 0, [] @@ -445,21 +463,28 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) - if max_peaks_pos / min_peaks_pos >= 35: + #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax') + if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35: min_peaks_pos = np.mean(interest_pos) min_peaks_neg = 0 # np.min(interest_neg) - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei - # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks_neg, z[peaks_neg]) + # ax2.axhline(grenze, label="grenze") + # ax2.text(0, grenze, "grenze") + # plt.show() interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] @@ -479,46 +504,38 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - if num_col == 3: - if ((peaks_neg_fin[0] > p_g_u and - peaks_neg_fin[1] > p_g_u) or - (peaks_neg_fin[0] < p_g_l and - peaks_neg_fin[1] < p_g_l) or - (peaks_neg_fin[0] + 200 < p_m and - peaks_neg_fin[1] < p_m) or - (peaks_neg_fin[0] - 200 > p_m and - peaks_neg_fin[1] > p_m)): - num_col = 1 - peaks_neg_fin = [] - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u or - peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_fin = [] ##print(len(peaks_neg_fin)) + # filter out peaks that are too close (<400px) to each other: + # among each group, pick the position with 
smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) cut_off = 400 peaks_neg_true = [] forest = [] - # print(len(peaks_neg_fin),'len_') - for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: + else: # print(forest[np.argmin(z[forest]) ] ) if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) @@ -530,68 +547,59 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl peaks_neg_true.append(forest[np.argmin(z[forest])]) num_col = len(peaks_neg_true) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - + #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') - if num_col == 3: - if ((peaks_neg_true[0] > p_g_u and - peaks_neg_true[1] > p_g_u) or - (peaks_neg_true[0] < p_g_l and - peaks_neg_true[1] < p_g_l) or - (peaks_neg_true[0] < p_m and - peaks_neg_true[1] + 200 < p_m) or - (peaks_neg_true[0] - 200 > p_m and - peaks_neg_true[1] > p_m)): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and - peaks_neg_true[0] > p_g_l and - peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and - peaks_neg_true[1] > p_g_l and - peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_true = [] + if (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[0]] + if (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[1]] - if num_col == 2: - if (peaks_neg_true[0] > p_g_u or - peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] + # get rid of too narrow columns (not used) + # if np.count_nonzero(diff_peaks < 360): + # arg_help = np.arange(len(diff_peaks)) + # arg_help_ann = arg_help[diff_peaks < 360] + # peaks_neg_fin_new = [] + # for ii in range(len(peaks_neg_fin)): + # if ii in arg_help_ann: + # if interest_neg_fin[ii] < interest_neg_fin[ii + 1]: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - diff_peaks_abnormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_abnormal) > 0: - arg_help = np.arange(len(diff_peaks)) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 
1) not in arg_help_ann: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin + # elif (ii - 1) not in arg_help_ann: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new = peaks_neg_fin # plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot([0,len(y)], [grenze,grenze]) # plt.show() ##print(len(peaks_neg_true)) + #print(peaks_neg_true, "peaks_neg_true") return len(peaks_neg_true), peaks_neg_true def find_num_col_only_image(regions_without_separators, multiplier=3.8): From c43a825d1d26c36beee3bbc2e038f8c0cda4221b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:26:01 +0200 Subject: [PATCH 04/32] `order_of_regions`: filter out-of-image peaks --- src/eynollah/utils/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ce72df4..677ed53 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1216,15 +1216,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - ##plt.plot(z) - ##plt.show() - cx_main, cy_main = find_center_of_contours(contours_main) - cx_head, cy_head = find_center_of_contours(contours_head) - - peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + peaks_neg_new = np.array([0] + + # peaks can be beyond box due to padding and smoothing + [peak for peak in peaks_neg + if 0 < peak and peak < textline_mask.shape[0]] + + [textline_mask.shape[0]]) # offset from bbox of mask peaks_neg_new += y_ref + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) From d3d599b0108bf17802bda2f9808620e3cd8471db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:27:23 +0200 Subject: [PATCH 05/32] `order_of_regions`: add better plotting (but commented out) --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 13acba6..9412861 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2553,7 +2553,7 @@ class Eynollah: con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 677ed53..f2e3581 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1197,7 +1197,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_head, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##plt.imshow(textline_mask) 
##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1208,6 +1208,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 + ##plt.plot(z) + ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1250,6 +1252,22 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T + # if indexes_in.size: + # img = textline_mask.copy() + # plt.imshow(img) + # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) + # xrange = np.arange(0, img.shape[1], 50) + # yrange = np.arange(0, img.shape[0], 50) + # plt.gca().set_xticks(xrange, xrange + x_ref) + # plt.gca().set_yticks(yrange, yrange + y_ref) + # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): + # cnt = (contours_main if type_ == 1 else contours_head)[idx] + # col = 'red' if type_ == 1 else 'blue' + # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) + # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) + # plt.show() + sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) From 542d38ab432e3089ebc8fefd3caee2915fe6b031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:34:56 +0200 Subject: [PATCH 06/32] =?UTF-8?q?`find=5Fnumber=5Fof=5Fcolumns=5Fin=5Fdocu?= =?UTF-8?q?ment`:=20simplify,=20rename=20`line`=E2=86=92`seps`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/utils/__init__.py | 244 +++++++++++++++------------------ 1 file changed, 109 insertions(+), 135 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f2e3581..168899f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1377,175 +1377,149 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): - t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:]==label_lines))*1 - separators_closeup[0:110,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:]=0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + separators_closeup = 1 * (region_pre_p == label_seps) + separators_closeup[0:110] = 0 + separators_closeup[-150:] = 0 kernel = np.ones((5,5),np.uint8) - separators_closeup=separators_closeup.astype(np.uint8) - separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) - separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) + separators_closeup = separators_closeup.astype(np.uint8) + separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) - separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) - separators_closeup_n=np.copy(separators_closeup) - 
separators_closeup_n=separators_closeup_n.astype(np.uint8) + separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned - separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] - separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 + separators_closeup_n_binary = separators_closeup_n.copy() - _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) - contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ - find_features_of_lines(contours_line_e) - dist_ye = y_max_main - y_min_main - args_e=np.arange(len(contours_line_e)) - args_hor_e=args_e[(dist_ye<=50) & - (dist_xe>=3*dist_ye)] - cnts_hor_e=[] - for ce in args_hor_e: - cnts_hor_e.append(contours_line_e[ce]) + # find horizontal lines by contour properties + contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_hor_e = [] + for cnt in contours_sep_e: + max_xe = cnt[:, 0, 0].max() + min_xe = cnt[:, 0, 0].min() + max_ye = cnt[:, 0, 1].max() + min_ye = cnt[:, 0, 1].min() + dist_xe = max_xe - min_xe + dist_ye = max_ye - min_ye + if dist_ye <= 50 and dist_xe >= 3 * dist_ye: + cnts_hor_e.append(cnt) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - gray = cv2.bitwise_not(separators_closeup_n_binary) - gray=gray.astype(np.uint8) + # delete horizontal contours (leaving only the edges) + separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) + edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, + cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(edges) + vertical = np.copy(edges) - bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ - cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(bw) - vertical = np.copy(bw) - - cols = horizontal.shape[1] - horizontal_size = cols // 30 - # Create structure element for extracting horizontal lines through morphology operations + horizontal_size = horizontal.shape[1] // 30 + # find horizontal lines by morphology horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - # Apply morphology operations - horizontal = cv2.erode(horizontal, horizontalStructure) - horizontal = cv2.dilate(horizontal, horizontalStructure) - - kernel = np.ones((5,5),np.uint8) - horizontal = cv2.dilate(horizontal,kernel,iterations = 2) - horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) + # re-insert deleted horizontal contours horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) - rows = vertical.shape[0] - verticalsize = rows // 30 - # Create structure element for extracting vertical lines through morphology operations - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - # Apply morphology operations - vertical = cv2.erode(vertical, verticalStructure) - vertical = cv2.dilate(vertical, verticalStructure) - vertical = cv2.dilate(vertical,kernel,iterations = 1) + vertical_size = vertical.shape[0] // 30 + # find vertical lines by morphology + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) + vertical = 
cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) + vertical = cv2.dilate(vertical, kernel, iterations=1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - separators_closeup_new[:,:][vertical[:,:]!=0]=1 - separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_line_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_vers) + contours_sep_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_vers) - args=np.arange(len(slope_lines)) - args_ver=args[slope_lines==1] - dist_x_ver=dist_x[slope_lines==1] - y_min_main_ver=y_min_main[slope_lines==1] - y_max_main_ver=y_max_main[slope_lines==1] - x_min_main_ver=x_min_main[slope_lines==1] - x_max_main_ver=x_max_main[slope_lines==1] - cx_main_ver=cx_main[slope_lines==1] - dist_y_ver=y_max_main_ver-y_min_main_ver + args=np.arange(len(slope_seps)) + args_ver=args[slope_seps==1] + dist_x_ver=dist_x[slope_seps==1] + y_min_seps_ver=y_min_seps[slope_seps==1] + y_max_seps_ver=y_max_seps[slope_seps==1] + x_min_seps_ver=x_min_seps[slope_seps==1] + x_max_seps_ver=x_max_seps[slope_seps==1] + cx_seps_ver=cx_seps[slope_seps==1] + dist_y_ver=y_max_seps_ver-y_min_seps_ver len_y=separators_closeup.shape[0]/3.0 _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_hors) + contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_hors) - slope_lines_org_hor=slope_lines_org[slope_lines==0] - args=np.arange(len(slope_lines)) + slope_seps_org_hor=slope_seps_org[slope_seps==0] + args=np.arange(len(slope_seps)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_main-y_min_main) + dist_y=np.abs(y_max_seps-y_min_seps) - args_hor=args[slope_lines==0] - dist_x_hor=dist_x[slope_lines==0] - y_min_main_hor=y_min_main[slope_lines==0] - y_max_main_hor=y_max_main[slope_lines==0] - x_min_main_hor=x_min_main[slope_lines==0] - x_max_main_hor=x_max_main[slope_lines==0] - dist_y_hor=dist_y[slope_lines==0] - cy_main_hor=cy_main[slope_lines==0] + args_hor=args[slope_seps==0] + dist_x_hor=dist_x[slope_seps==0] + y_min_seps_hor=y_min_seps[slope_seps==0] + y_max_seps_hor=y_max_seps[slope_seps==0] + x_min_seps_hor=x_min_seps[slope_seps==0] + x_max_seps_hor=x_max_seps[slope_seps==0] + dist_y_hor=dist_y[slope_seps==0] + cy_seps_hor=cy_seps[slope_seps==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] - x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] - cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] - y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] - y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] + x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] + 
x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] + cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] + y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] + y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] + slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) - matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor - matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver - matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver - matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 - matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 - matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor - matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver - matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor - matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor - matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver - matrix_of_lines_ch[len(cy_main_hor):,9]=1 + matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor + matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver + matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 + matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 + matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor + matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver + matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor + matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor + matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver + matrix_of_seps_ch[len(cy_seps_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ + _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) - matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) - args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + args_head = np.arange(len(cy_head)) + matrix_l_n[:, 0] = args_head + matrix_l_n[:, 2] = x_min_head+30 + matrix_l_n[:, 3] = x_max_head-30 + matrix_l_n[:, 4] = dist_x_head + matrix_l_n[:, 5] = y_min_head-3-8 + matrix_l_n[:, 6] = y_min_head-5-8 + matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 8] = 4 + matrix_of_seps_ch = np.append( + matrix_of_seps_ch, matrix_l_n, axis=0) - matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 - 
matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 - matrix_of_lines_ch=np.copy(matrix_l_n) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & + (x_max_seps_hor>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, special_separators) - cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & - (x_max_main_hor>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) if contours_h is not None: - try: - cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & - (x_max_main_head>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) - except: - pass - args_cy_splitter=np.argsort(cy_main_splitters) - cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] + cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - splitter_y_new=[] - splitter_y_new.append(0) - for i in range(len(cy_main_splitters_sort)): - splitter_y_new.append( cy_main_splitters_sort[i] ) - splitter_y_new.append(region_pre_p.shape[0]) - splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 + cy_seps_splitters = np.sort(cy_seps_splitters) + splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] + splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] @@ -1573,7 +1547,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n + return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, From 5a0e4c3b0f2e089acff0b4fbf058f1d2e6f90f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:36:10 +0200 Subject: [PATCH 07/32] `find_number_of_columns_in_document`: improve splitter rule extend horizontal separators to full img width if they do not overlap any other regions (only as regards to returned `splitter_y` result, but without changing returned separators mask) --- src/eynollah/utils/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 168899f..b930bfd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1378,6 +1378,8 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1398,10 +1400,19 @@ def 
find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, min_xe = cnt[:, 0, 0].min() max_ye = cnt[:, 0, 1].max() min_ye = cnt[:, 0, 1].min() + med_ye = int(np.median(cnt[:, 0, 1])) dist_xe = max_xe - min_xe dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) + labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) + if len(labels) == 1: + # mid line does not intersect with any other region + # so add it as extra splitter line + cnts_hor_e.append(np.array([[[0, med_ye]], + [[ccomps.shape[1], med_ye]], + [[ccomps.shape[1], med_ye + 1]], + [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) From cd35241e816acc7e2083dc31d99f376a8877904b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:41:36 +0200 Subject: [PATCH 08/32] `find_number_of_columns_in_document`: split headings at top+baseline regarding `splitter_y` result, for headings, instead of cutting right through them via center line, add their toplines and baselines as if they were horizontal separators --- src/eynollah/utils/__init__.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b930bfd..0c3e4ae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1506,15 +1506,33 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) + # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + # args_head = np.arange(len(cy_head)) + # matrix_l_n[:, 0] = args_head + # matrix_l_n[:, 2] = x_min_head+30 + # matrix_l_n[:, 3] = x_max_head-30 + # matrix_l_n[:, 4] = dist_x_head + # matrix_l_n[:, 5] = y_min_head-3-8 + # matrix_l_n[:, 6] = y_min_head-5-8 + # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + # matrix_l_n[:, 8] = 4 + # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): + cy_head = np.stack((y_min_head, y_max_head)).T.flatten() + y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), + np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) + x_min_head = np.repeat(x_min_head, 2) + x_max_head = np.repeat(x_max_head, 2) + dist_x_head = np.repeat(dist_x_head, 2) matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - matrix_l_n[:, 2] = x_min_head+30 - matrix_l_n[:, 3] = x_max_head-30 + # +/- 30px to avoid crossing col peaks by accident + matrix_l_n[:, 2] = x_min_head + 30 + matrix_l_n[:, 3] = x_max_head - 30 matrix_l_n[:, 4] = dist_x_head - matrix_l_n[:, 5] = y_min_head-3-8 - matrix_l_n[:, 6] = y_min_head-5-8 - matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 5] = cy_head + matrix_l_n[:, 6] = y_min_head + matrix_l_n[:, 7] = y_max_head matrix_l_n[:, 8] = 4 matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) From 7c3e41858877211c82f5b6c91a02fccfe146cacb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:13:51 +0200 Subject: [PATCH 09/32] `return_boxes_of_images_by_order_of_reading_new`: simplify - enumeration instead of indexing - array instead of list operations - add better plotting (but commented out) --- 
src/eynollah/utils/__init__.py | 349 ++++++++++++++++----------------- 1 file changed, 165 insertions(+), 184 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0c3e4ae..698b0bd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -5,6 +5,7 @@ import math try: import matplotlib.pyplot as plt + import matplotlib.patches as patches except ImportError: plt = None import numpy as np @@ -20,6 +21,7 @@ from .contour import (contours_in_same_horizon, return_contours_of_image, return_parent_contours) + def pairwise(iterable): # pairwise('ABCDEFG') → AB BC CD DE EF FG @@ -205,15 +207,15 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(x_end,'x_end') #print(len_sep) - deleted=[] + deleted = set() for i in range(len(x_start)-1): nodes_i=set(range(x_start[i],x_end[i]+1)) for j in range(i+1,len(x_start)): if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.append(j) + deleted.add(j) #print(np.unique(deleted)) - remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) + remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') mother=[]#if it has mother child=[] @@ -262,7 +264,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] - reading_orther_type=0 + reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] @@ -278,12 +280,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end[remained_sep_indexes_without_mother[j]] # + 1 )) - set_diff = nodes_i - nodes_j - if set_diff != nodes_i: - reading_orther_type = 1 + if nodes_i - nodes_j != nodes_i: + reading_order_type = 1 else: - reading_orther_type = 0 - #print(reading_orther_type,'javab') + reading_order_type = 0 + #print(reading_order_type,'javab') #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') @@ -297,7 +298,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(all_args_uniq,'all_args_uniq') #print(args_to_be_unified,'args_to_be_unified') - return (reading_orther_type, + return (reading_order_type, x_start_returned, x_end_returned, y_sep_returned, @@ -1590,77 +1591,90 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_plt(box=None, title=None): + # if box is None: + # box = [None, None, None, None] + # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + # plt.imshow(img) + # xrange = np.arange(0, img.shape[1], 100) + # yrange = np.arange(0, img.shape[0], 100) + # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) + # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # if title: + # plt.title(title) + # plt.show() + # dbg_plt() boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - for i in range(len(splitter_y_new)-1): - #print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new = 
matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & - (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] + width_tot = regions_without_separators.shape[1] + for top, bot in pairwise(splitter_y_new): + # print("%d:%d" % (top, bot), 'i') + # dbg_plt([None, None, top, bot], + # "image cut for y split %d:%d" % ( + # top, bot)) + matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & + (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= - # 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): + # 0.1 * (np.abs(bot-top))): if True: try: num_col, peaks_neg_fin = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], + regions_without_separators[top:bot], num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) except: peaks_neg_fin=[] num_col = 0 try: if (len(peaks_neg_fin)+1)=len(peaks_neg_fin2): - peaks_neg_fin=list(np.copy(peaks_neg_fin1)) + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin=list(np.copy(peaks_neg_fin2)) - peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - - if i_n!=(len(peaks_neg_fin_early)-2): - peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) + peaks_neg_fin = peaks_neg_fin2 + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin + + if right < peaks_neg_fin_early[-1]: + peaks_neg_fin_rev.append(right) + peaks_neg_fin_rev.extend(peaks_neg_fin) if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) @@ -1673,21 +1687,20 @@ def return_boxes_of_images_by_order_of_reading_new( except: logger.exception("cannot find peaks consistent with columns") #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], + # regions_without_separators[top:bot,:], # multiplier=7.0) x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] - arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] if right2left_readingorder: - x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some - x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) + peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] peaks_neg_tot_tables.append(peaks_neg_tot) reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ @@ -1697,26 +1710,27 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) all_columns = set(range(len(peaks_neg_tot) - 1)) - if ((reading_order_type==1) or - (reading_order_type==0 and - (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): + # print("all_columns", all_columns) + if (reading_order_type == 1 or + 
len(y_lines_without_mother) >= 2 or + there_is_sep_with_child == 1): try: - y_grenze = splitter_y_new[i] + 300 + y_grenze = top + 300 #check if there is a big separator in this y_mains_sep_ohne_grenzen args_early_ys=np.arange(len(y_type_2)) #print(args_early_ys,'args_early_ys') - #print(splitter_y_new[i], splitter_y_new[i+1]) + #print(top, bot) - x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up = x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up = x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up = y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & + args_up = args_early_ys[(y_type_2 > top) & (y_type_2 <= y_grenze)] if len(y_type_2_up) > 0: y_main_separator_up = y_type_2_up [(x_starting_up==0) & @@ -1730,27 +1744,28 @@ def return_boxes_of_images_by_order_of_reading_new( args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], y_diff_main_separator_up.max()]) - splitter_y_new[i] = y_diff_main_separator_up.max() + top, y_diff_main_separator_up.max()]) + # dbg_plt(boxes[-1], "first box") + top = y_diff_main_separator_up.max() - #print(splitter_y_new[i],'splitter_y_new[i]') + #print(top,'top') y_type_2 = y_type_2[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] y_diff_type_2 = y_diff_type_2[args_to_be_kept] #print('galdiha') - y_grenze = splitter_y_new[i] + 200 + y_grenze = top + 200 args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up=y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up=x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up=x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & + args_up2=args_early_ys2[(y_type_2 > top) & (y_type_2 <= y_grenze)] #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = set() @@ -1804,13 +1819,14 @@ def return_boxes_of_images_by_order_of_reading_new( pass #print('burdaydikh2') - #int(splitter_y_new[i]) + #int(top) y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] - if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: - if reading_order_type==1: - y_lines_by_order.append(splitter_y_new[i]) + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + y_lines_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: @@ -1823,8 +1839,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, 
[splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1839,22 +1855,15 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() @@ -1864,8 +1873,8 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1888,25 +1897,24 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: + #print("i_s_nc", i_s_nc) x_end_biggest_column = \ x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & (x_ending==x_end_biggest_column)] y_column_nc = y_type_2[args_all_biggest_lines] - x_start_column_nc = x_starting[args_all_biggest_lines] - x_end_column_nc = x_ending[args_all_biggest_lines] + #x_start_column_nc = x_starting[args_all_biggest_lines] + #x_end_column_nc = x_ending[args_all_biggest_lines] y_column_nc = np.sort(y_column_nc) for i_c in range(len(y_column_nc)): - if i_c==(len(y_column_nc)-1): - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - else: - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] + #print("i_c", i_c) + ind_all_lines_between_nm_wc = \ + ind_args[(y_type_2 > y_column_nc[i_c]) & + (y_type_2 < (y_column_nc[i_c+1] + if i_c < len(y_column_nc)-1 + else bot)) & + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] y_all_between_nm_wc = 
y_type_2[ind_all_lines_between_nm_wc] x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] @@ -1965,78 +1973,58 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(column,'column') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') + #print(il, "il") + y_itself = y_lines_by_order[il] + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print('burda') + y_down = y_in_cols.min(initial=bot) #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot 
assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], splitter_y_new[i+1]]) + top, bot]) + # dbg_plt(boxes[-1], "fallback box") else: y_lines_by_order=[] x_start_by_order=[] @@ -2050,8 +2038,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2064,8 +2052,8 @@ def return_boxes_of_images_by_order_of_reading_new( else: columns_not_covered = list(all_columns) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2075,71 +2063,64 @@ def return_boxes_of_images_by_order_of_reading_new( for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - + #print(il, "il") + y_itself = y_lines_by_order[il] + #print(y_itself,'y_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + 
(column <= x_end_by_order)] #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') + y_down = y_in_cols.min(initial=bot) + #print(y_down,'y_down') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = regions_without_separators.shape[1] - boxes[i][1] - x_end_new = regions_without_separators.shape[1] - boxes[i][0] + x_start_new = width_tot - boxes[i][1] + x_end_new = width_tot - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new peaks_neg_tot_tables = peaks_neg_tot_tables_new + # show final xy-cut + # plt.imshow(regions_without_separators) + # for xmin, xmax, ymin, ymax in boxes: + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.show() + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 0fc4b2535dc005612406cd4ffbf2471a5b4e1485 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:47:35 +0200 Subject: [PATCH 10/32] `return_boxes_of_images_by_order_of_reading_new`: fix no-mother case - when handling lines without mother, and biggest line already accounts for all columns, but some are too close to the top and therefore must be removed, avoid invalidating `biggest` index, causing `IndexError` - remove try-catch (now unnecessary) - array instead of list operations --- src/eynollah/utils/__init__.py | 62 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 698b0bd..b331cab 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1919,54 +1919,50 @@ def return_boxes_of_images_by_order_of_reading_new( x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc - if len(x_diff_all_between_nm_wc)>0: - biggest=np.argmax(x_diff_all_between_nm_wc) - columns_covered_by_mothers = set() - for dj in range(len(x_starting_all_between_nm_wc)): + for dj in range(len(ind_all_lines_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) - should_longest_line_be_extended=0 - if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != child_columns): - should_longest_line_be_extended=1 - 
index_lines_so_close_to_top_separator = \ - np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & - (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] - if len(index_lines_so_close_to_top_separator) > 0: - indexes_remained_after_deleting_closed_lines= \ - np.array(list(set(list(range(len(y_all_between_nm_wc)))) - - set(list(index_lines_so_close_to_top_separator)))) - if len(indexes_remained_after_deleting_closed_lines) > 0: + if len(ind_all_lines_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # biggest accounts for all columns alone, + # longest line should be extended + lines_so_close_to_top_separator = \ + ((y_all_between_nm_wc > y_column_nc[i_c]) & + (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) + if (np.count_nonzero(lines_so_close_to_top_separator) and + np.count_nonzero(lines_so_close_to_top_separator) < + len(ind_all_lines_between_nm_wc)): y_all_between_nm_wc = \ - y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + y_all_between_nm_wc[~lines_so_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) - - if len(x_diff_all_between_nm_wc) > 0: - try: + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + else: y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - except: - logger.exception("cannot append") - y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + if len(columns_not_covered): + y_all_between_nm_wc = np.append( + y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(int(i_s_nc), int(x_end_biggest_column)): From e2dfec75fbefe3e5aeffd71a7a61eab6092f6c92 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:19:20 +0200 Subject: [PATCH 11/32] `return_x_start_end_mothers_childs_and_type_of_reading_order`: simplify and document MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - simplify - rename identifiers to make readable: - `y_sep` → `y_mid` (because the cy gets passed) - `y_diff` → `y_max` (because the ymax gets passed) - array instead of list operations - add docstring and in-line comments - return (zero-length) numpy array instead of empty list --- src/eynollah/eynollah.py | 10 +- src/eynollah/utils/__init__.py | 378 +++++++++++++++++---------------- 2 files changed, 198 insertions(+), 190 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9412861..08ffed7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2507,6 +2507,7 @@ class Eynollah: My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True + #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) @@ -2514,6 +2515,7 @@ class Eynollah: (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min + #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) @@ -2531,6 +2533,7 @@ class Eynollah: My_head[ii] < box[3])): arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True + #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) @@ -2538,6 +2541,7 @@ class Eynollah: (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min + #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) @@ -2587,7 +2591,7 @@ class Eynollah: try: results = match_boxes(False) except Exception as why: - self.logger.error(why) + self.logger.exception(why) results = match_boxes(True) self.logger.debug("exit do_order_of_regions") @@ -2976,7 +2980,7 @@ class Eynollah: max(self.num_col_lower or num_col_classifier, num_col_classifier)) except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, @@ -3044,7 +3048,7 @@ class Eynollah: if not num_column_is_classified: num_col_classifier = num_col + 1 except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b331cab..f1a8aae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,226 +33,229 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, 
cy_hor_some, peak_points, cy_hor_diff): + x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + """ + Analyse which separators overlap multiple column candidates, + and how they overlap each other. + + Ignore separators not spanning multiple columns. + + For the separators to be returned, try to join them when they are directly + adjacent horizontally but nearby vertically (and thus mutually compatible). + Also, mark any separators that already span the full width. + + Furthermore, identify which pairs of (unjoined) separators span subsets of columns + of each other (disregarding vertical positions). Referring, respectively, to the + superset separators as "mothers" and to the subset separators as "children", + retrieve information on which columns are spanned by separators with no mother, + and which columns are spanned by their children (if any). + + Moreover, determine if there is any (column) overlap among the multi-span separators + with no mother, specifically (and thus, no simple box separation is possible). + + Arguments: + * the x start column index of the raw separators + * the x end column index of the raw separators + * the y center coordinate of the raw separators + * the x column coordinates + * the y end coordinate of the raw separators + + Returns: + a tuple of: + * whether any top-level (no-mother) multi-span separators overlap each other + * the x start column index of the resulting multi-span separators + * the x end column index of the resulting multi-span separators + * the y center coordinate of the resulting multi-span separators + * the y end coordinate of the resulting multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + * the x start column index of the top-level (no-mother) multi-span separators + * the x end column index of the top-level (no-mother) multi-span separators + * whether any multi-span separators have super-spans of other (child) multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + which have super-spans of other (child) multi-span separators + * the x start column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * the x end column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * indexes of multi-span separators with full-width span + """ x_start=[] x_end=[] - kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0 len_sep=[] - y_sep=[] - y_diff=[] + y_mid=[] + y_max=[] new_main_sep_y=[] - indexer=0 for i in range(len(x_min_hor_some)): - starting=x_min_hor_some[i]-peak_points - starting=starting[starting>=0] - min_start=np.argmin(starting) - ending=peak_points-x_max_hor_some[i] - len_ending_neg=len(ending[ending<=0]) - - ending=ending[ending>0] - max_end=np.argmin(ending)+len_ending_neg + #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) + starting = x_min_hor_some[i] - peak_points + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = x_max_hor_some[i] - peak_points + max_end = np.flatnonzero(ending < 0)[0] # first right-of + #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: + # column range of separator spans more than one column candidate if (max_end-min_start)==(len(peak_points)-1): + # all columns (i.e. 
could be true new y splitter) new_main_sep_y.append(indexer) #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_sep.append(cy_hor_some[i]) - y_diff.append(cy_hor_diff[i]) + y_mid.append(cy_hor_some[i]) + y_max.append(y_max_hor_some[i]) x_end.append(max_end) - - x_start.append( min_start) - + x_start.append(min_start) len_sep.append(max_end-min_start) - if max_end==min_start+1: - kind.append(0) - else: - kind.append(1) - indexer+=1 + #print(x_start,'x_start') + #print(x_end,'x_end') x_start_returned = np.array(x_start, dtype=int) x_end_returned = np.array(x_end, dtype=int) - y_sep_returned = np.array(y_sep, dtype=int) - y_diff_returned = np.array(y_diff, dtype=int) - - all_args_uniq = contours_in_same_horizon(y_sep_returned) - args_to_be_unified=[] - y_unified=[] - y_diff_unified=[] - x_s_unified=[] - x_e_unified=[] - if len(all_args_uniq)>0: - #print('burda') - if type(all_args_uniq[0]) is list: - for dd in range(len(all_args_uniq)): - if len(all_args_uniq[dd])==2: - x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] - x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] - y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] - y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] - #print('burda2') - if (x_s_same_hor[0]==x_e_same_hor[1]-1 or - x_s_same_hor[1]==x_e_same_hor[0]-1 and - x_s_same_hor[0]!=x_s_same_hor[1] and - x_e_same_hor[0]!=x_e_same_hor[1]): - #print('burda3') - for arg_in in all_args_uniq[dd]: - #print(arg_in,'arg_in') - args_to_be_unified.append(arg_in) - y_selected=np.min(y_sep_same_hor) - y_diff_selected=np.max(y_diff_same_hor) - x_s_selected=np.min(x_s_same_hor) - x_e_selected=np.max(x_e_same_hor) - - x_s_unified.append(x_s_selected) - x_e_unified.append(x_e_selected) - y_unified.append(y_selected) - y_diff_unified.append(y_diff_selected) - #print(x_s_same_hor,'x_s_same_hor') - #print(x_e_same_hor[:]-1,'x_e_same_hor') - #print('#############################') - #print(x_s_unified,'y_selected') - #print(x_e_unified,'x_s_selected') - #print(y_unified,'x_e_same_hor') - - args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(args_lines_not_unified,'args_lines_not_unified') - - x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) - x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) - y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) - y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - - for dv in range(len(y_unified)): - y_sep_returned_not_unified.append(y_unified[dv]) - y_diff_returned_not_unified.append(y_diff_unified[dv]) - x_start_returned_not_unified.append(x_s_unified[dv]) - x_end_returned_not_unified.append(x_e_unified[dv]) - - #print(y_sep_returned,'y_sep_returned') + y_mid_returned = np.array(y_mid, dtype=int) + y_max_returned = np.array(y_max, dtype=int) + #print(y_mid_returned,'y_mid_returned') #print(x_start_returned,'x_start_returned') #print(x_end_returned,'x_end_returned') - x_start_returned = np.array(x_start_returned_not_unified, dtype=int) - x_end_returned = np.array(x_end_returned_not_unified, dtype=int) - y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) - y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) + # join/elongate separators if follow-up x and similar y + sep_pairs = contours_in_same_horizon(y_mid_returned) + if len(sep_pairs): + #print('burda') + args_to_be_unified = set() + y_mid_unified = 
[] + y_max_unified = [] + x_start_unified = [] + x_end_unified = [] + for pair in sep_pairs: + if (not np.array_equal(*x_start_returned[pair]) and + not np.array_equal(*x_end_returned[pair]) and + # immediately adjacent columns? + np.diff(x_end_returned[pair] - + x_start_returned[pair])[0] in [1, -1]): - #print(y_sep_returned,'y_sep_returned2') + args_to_be_unified.union(set(pair)) + y_mid_unified.append(np.min(y_mid_returned[pair])) + y_max_unified.append(np.max(y_max_returned[pair])) + x_start_unified.append(np.min(x_start_returned[pair])) + x_end_unified.append(np.max(x_end_returned[pair])) + #print(pair,'pair') + #print(x_start_returned[pair],'x_s_same_hor') + #print(x_end_returned[pair],'x_e_same_hor') + #print(y_mid_unified,'y_mid_unified') + #print(y_max_unified,'y_max_unified') + #print(x_start_unified,'x_s_unified') + #print(x_end_unified,'x_e_selected') + #print('#############################') + + if len(y_mid_unified): + args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), + list(args_to_be_unified), assume_unique=True) + #print(args_lines_not_unified,'args_lines_not_unified') + x_start_returned = np.append(x_start_returned[args_lines_not_unified], + x_start_unified, axis=0) + x_end_returned = np.append(x_end_returned[args_lines_not_unified], + x_end_unified, axis=0) + y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], + y_mid_unified, axis=0) + y_max_returned = np.append(y_max_returned[args_lines_not_unified], + y_max_unified, axis=0) + #print(y_mid_returned,'y_mid_returned2') #print(x_start_returned,'x_start_returned2') #print(x_end_returned,'x_end_returned2') - #print(new_main_sep_y,'new_main_sep_y') + #print(new_main_sep_y,'new_main_sep_y') #print(x_start,'x_start') #print(x_end,'x_end') - if len(new_main_sep_y)>0: + x_start = np.array(x_start) + x_end = np.array(x_end) + y_mid = np.array(y_mid) + if len(new_main_sep_y): + # some full-width multi-span separators exist, so + # restrict the y range of separators to search for + # mutual overlaps to only those within the largest + # y strip between adjacent multi-span separators + # that involve at least one such full-width seps. 
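A minimal standalone sketch of the unification step above: two separators detected at roughly the same y level whose column spans are immediately adjacent get merged into one span, taking the smaller x start, the larger x end, the smaller y_mid and the larger y_max. The helper name and the toy values below are made up for illustration only.

    def merge_adjacent_pair(x_start, x_end, y_mid, y_max, i, j):
        # i, j index two separators whose column spans touch, e.g. 0:2 and 2:4
        assert x_end[i] == x_start[j] or x_end[j] == x_start[i]
        return (min(x_start[i], x_start[j]),
                max(x_end[i], x_end[j]),
                min(y_mid[i], y_mid[j]),
                max(y_max[i], y_max[j]))

    x_start = [0, 2]
    x_end   = [2, 4]
    y_mid   = [105, 98]
    y_max   = [110, 102]
    print(merge_adjacent_pair(x_start, x_end, y_mid, y_max, 0, 1))
    # (0, 4, 98, 110): one separator spanning columns 0..4
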
+ # (does not affect the separators to be returned) + min_ys=np.min(y_mid) + max_ys=np.max(y_mid) + #print(min_ys,'min_ys') + #print(max_ys,'max_ys') - min_ys=np.min(y_sep) - max_ys=np.max(y_sep) + y_mains0 = list(y_mid[new_main_sep_y]) + y_mains = [min_ys] + y_mains0 + [max_ys] - y_mains=[] - y_mains.append(min_ys) - y_mains_sep_ohne_grenzen=[] + y_mains = np.sort(y_mains) + argm = np.argmax(np.diff(y_mains)) + y_mid_new = y_mains[argm] + y_mid_next_new = y_mains[argm + 1] - for ii in range(len(new_main_sep_y)): - y_mains.append(y_sep[new_main_sep_y[ii]]) - y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]]) - - y_mains.append(max_ys) - - y_mains_sorted=np.sort(y_mains) - diff=np.diff(y_mains_sorted) - argm=np.argmax(diff) - - y_min_new=y_mains_sorted[argm] - y_max_new=y_mains_sorted[argm+1] - - #print(y_min_new,'y_min_new') - #print(y_max_new,'y_max_new') - #print(y_sep[new_main_sep_y[0]],y_sep,'yseps') + #print(y_mid_new,argm,'y_mid_new') + #print(y_mid_next_new,argm+1,'y_mid_next_new') + #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') x_start=np.array(x_start) x_end=np.array(x_end) - kind=np.array(kind) - y_sep=np.array(y_sep) - if (y_min_new in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>y_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sep<=y_max_new)] - #print('burda1') - x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)] - #print('burda2') - kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)] - elif (y_min_new not in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep y_mid_new else: - x_start=x_start[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + where = y_mid >= y_mid_new + if y_mid_next_new in y_mains0: + where &= y_mid < y_mid_next_new + else: + where &= y_mid <= y_mid_next_new + x_start = x_start[where] + x_end = x_end[where] + y_mid = y_mid[where] #print(x_start,'x_start') #print(x_end,'x_end') - #print(len_sep) + # remove redundant separators that span the same columns + # (keeping only 1 representative each) deleted = set() - for i in range(len(x_start)-1): - nodes_i=set(range(x_start[i],x_end[i]+1)) - for j in range(i+1,len(x_start)): - if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.add(j) - #print(np.unique(deleted)) - + for index_i in range(len(x_start) - 1): + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) + #print(nodes_i, "nodes_i") + for index_j in range(index_i + 1, len(x_start)): + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) + #print(nodes_j, "nodes_j") + if nodes_i == nodes_j: + deleted.add(index_j) + #print(deleted,"deleted") remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') - mother=[]#if it has mother - child=[] + + # determine which separators span which columns + mother = [] # whether the respective separator has a mother separator + child = [] # whether the respective separator has a child separator for index_i in remained_sep_indexes: have_mother=0 have_child=0 - nodes_ind=set(range(x_start[index_i],x_end[index_i]+1)) + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) for index_j in remained_sep_indexes: - 
nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1)) - if nodes_indnodes_ind_j: + if nodes_i > nodes_j: have_child=1 mother.append(have_mother) child.append(have_child) - - #print(mother,'mother') - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens') - y_lines_without_mother=[] - x_start_without_mother=[] - x_end_without_mother=[] - - y_lines_with_child_without_mother=[] - x_start_with_child_without_mother=[] - x_end_with_child_without_mother=[] + #print(mother, "mother") + #print(child, "child") mother = np.array(mother) child = np.array(child) #print(mother,'mother') #print(child,'child') remained_sep_indexes = np.array(list(remained_sep_indexes)) - x_start = np.array(x_start) - x_end = np.array(x_end) - y_sep = np.array(y_sep) + #print(len(remained_sep_indexes)) + #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - if len(remained_sep_indexes)>1: + reading_order_type = 0 + if len(remained_sep_indexes): #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') #print(np.array(mother),'mother') remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] @@ -262,52 +265,53 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] + y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] + y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): - nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]], - x_end[remained_sep_indexes_without_mother[i]] - # + 1 - )) - for j in range(i+1,len(remained_sep_indexes_without_mother)): - nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]], - x_end[remained_sep_indexes_without_mother[j]] - # + 1 - )) + index_i = remained_sep_indexes_without_mother[i] + nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + #print(index_i, nodes_i, "nodes_i without mother") + for j in range(i + 1, len(remained_sep_indexes_without_mother)): + index_j = remained_sep_indexes_without_mother[j] + nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: + #print("type=1") reading_order_type = 1 else: - reading_order_type = 0 - #print(reading_order_type,'javab') - #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') + y_mid_without_mother = np.zeros(0, int) + x_start_without_mother = np.zeros(0, int) + x_end_without_mother = np.zeros(0, int) + y_mid_with_child_without_mother = np.zeros(0, int) + x_start_with_child_without_mother = np.zeros(0, int) + x_end_with_child_without_mother = np.zeros(0, int) + + #print(reading_order_type,'reading_order_type') + #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') 
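The mother/child bookkeeping above reduces to plain set containment over column spans: a separator has a mother if some other separator's span is a strict superset of its own, and has a child in the converse case. A small self-contained sketch (the function name and the sample spans are illustrative only):

    def mothers_and_childs(x_start, x_end):
        spans = [set(range(s, e + 1)) for s, e in zip(x_start, x_end)]
        has_mother = [any(span < other for other in spans) for span in spans]
        has_child  = [any(span > other for other in spans) for span in spans]
        return has_mother, has_child

    # separator 0 spans columns 0..3, separator 1 spans columns 1..2
    print(mothers_and_childs([0, 1], [3, 2]))
    # ([False, True], [True, False]): sep 1 has a mother, sep 0 has a child
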
#print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') there_is_sep_with_child = 0 if len_sep_with_child >= 1: there_is_sep_with_child = 1 - #print(all_args_uniq,'all_args_uniq') - #print(args_to_be_unified,'args_to_be_unified') return (reading_order_type, x_start_returned, x_end_returned, - y_sep_returned, - y_diff_returned, - y_lines_without_mother, + y_mid_returned, + y_max_returned, + y_mid_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, - y_lines_with_child_without_mother, + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, new_main_sep_y) From b2a79cc6ed766cef5074629fcb76ae1c6846f084 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:31:52 +0200 Subject: [PATCH 12/32] `return_x_start_end_mothers_childs_and_type_of_reading_order`: fix+1 when calculating `reading_order_type`, upper limit on column range (`x_end`) needs to be `+1` here as well --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f1a8aae..3a383e9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -274,11 +274,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) #print(index_i, nodes_i, "nodes_i without mother") for j in range(i + 1, len(remained_sep_indexes_without_mother)): index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: #print("type=1") From acee4c1bfe227055194050935f1868d1fb156701 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:43:41 +0200 Subject: [PATCH 13/32] `find_number_of_columns_in_document`: simplify --- src/eynollah/utils/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 3a383e9..f948de2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1551,23 +1551,23 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, (x_max_head>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - cy_seps_splitters = np.sort(cy_seps_splitters) + cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] - splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 - - args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] + big_part = 22 * region_pre_p.shape[0] // 100 # percent height regions_without_separators=return_regions_without_separators(region_pre_p) - length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - for itiles in args_big_parts: - regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - 
int(splitter_y_new[itiles+1]),:] + num_big_parts = 0 + for top, bot in pairwise(splitter_y_new): + if bot - top < big_part: + continue + num_big_parts += 1 try: - num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, + num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) + #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1575,7 +1575,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_col_fin=num_col peaks_neg_fin_fin=peaks_neg_fin - if len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)=500] peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] From 5d15941b350841a4490e002c92ff89a5f6113905 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:51:59 +0200 Subject: [PATCH 14/32] `contours_in_same_horizon`: simplify - array instead of list operations - return array of index pairs instead of list objects --- src/eynollah/utils/__init__.py | 73 ++++++++++++++++------------------ src/eynollah/utils/contour.py | 25 +++++------- 2 files changed, 44 insertions(+), 54 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f948de2..10987ad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1315,47 +1315,42 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( float(num_col_classifier)) if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10: args_hor=np.arange(len(slope_lines_hor)) - all_args_uniq=contours_in_same_horizon(cy_main_hor) - #print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq)>0: - if type(all_args_uniq[0]) is list: - special_separators=[] - contours_new=[] - for dd in range(len(all_args_uniq)): - merged_all=None - some_args=args_hor[all_args_uniq[dd]] - some_cy=cy_main_hor[all_args_uniq[dd]] - some_x_min=x_min_main_hor[all_args_uniq[dd]] - some_x_max=x_max_main_hor[all_args_uniq[dd]] + sep_pairs=contours_in_same_horizon(cy_main_hor) + if len(sep_pairs): + special_separators=[] + contours_new=[] + for pair in sep_pairs: + merged_all=None + some_args=args_hor[pair] + some_cy=cy_main_hor[pair] + some_x_min=x_min_main_hor[pair] + some_x_max=x_max_main_hor[pair] - #img_in=np.zeros(separators_closeup_n[:,:,2].shape) - #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + #img_in=np.zeros(separators_closeup_n[:,:,2].shape) + #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') + diff_x_some=some_x_max-some_x_min + for jv in range(len(some_args)): + img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) + if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): + img_p_in[int(np.mean(some_cy))-5: + int(np.mean(some_cy))+5, + int(np.min(some_x_min)): + int(np.max(some_x_max)) ]=1 + sum_dis=dist_x_hor[some_args].sum() + 
diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args], - # dist_x_hor[some_args].sum(), - # np.min(x_min_main_hor[some_args]), - # np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ), - # np.std( dist_x_hor[some_args] ), - # np.var( dist_x_hor[some_args] ),'jalibdiha') - special_separators.append(np.mean(cy_main_hor[some_args])) - else: - img_p_in=img_in_hor - special_separators=[] + if (diff_max_min_uniques > sum_dis and + sum_dis / float(diff_max_min_uniques) > 0.85 and + diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and + np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + # print(dist_x_hor[some_args], + # dist_x_hor[some_args].sum(), + # np.min(x_min_main_hor[some_args]), + # np.max(x_max_main_hor[some_args]),'jalibdi') + # print(np.mean( dist_x_hor[some_args] ), + # np.std( dist_x_hor[some_args] ), + # np.var( dist_x_hor[some_args] ),'jalibdiha') + special_separators.append(np.mean(cy_main_hor[some_args])) else: img_p_in=img_in_hor special_separators=[] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f304db2..052688c 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -14,21 +14,16 @@ from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new def contours_in_same_horizon(cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(np.array(all_args, dtype=object)) + """ + Takes an array of y coords, identifies all pairs among them + which are close to each other, and returns all such pairs + by index into the array. 
+ """ + sort = np.argsort(cy_main_hor) + same = np.diff(cy_main_hor[sort] <= 20) + # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) + same = np.flatnonzero(same) + return np.stack((sort[:-1][same], sort[1:][same])).T def find_contours_mean_y_diff(contours_main): M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] From 6cc5900943d5395adbbbea737871413bf10b9ccf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:55:07 +0200 Subject: [PATCH 15/32] `find_num_col`: add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 10987ad..4046396 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -485,9 +485,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) # ax1.imshow(regions_without_separators, aspect="auto") - # ax2.plot(z) - # ax2.scatter(peaks_neg, z[peaks_neg]) - # ax2.axhline(grenze, label="grenze") + # ax2.plot(z, color='red', label='z') + # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.scatter(peaks_neg, z[peaks_neg], color='red') + # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') + # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") + # ax2.axhline(grenze, color='blue', label="grenze") # ax2.text(0, grenze, "grenze") # plt.show() @@ -816,6 +819,12 @@ def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): peaks, _ = find_peaks(z, height=0) # print(peaks,'peaksnew') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True, suptitle='find_num_col_by_vertical_lines') + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks, z[peaks]) + # ax2.set_title('find_peaks(regions_without_separators.sum(axis=0), height=0)') + # plt.show() return peaks def return_regions_without_separators(regions_pre): From 6fbb5f8a12185192f7d9db7b008c3ef8b5f24d33 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:02:39 +0200 Subject: [PATCH 16/32] `return_boxes_of_images_by_order_of_reading_new`: simplify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - array instead of list operations - add better plotting (but commented out) - add more debug printing (but commented out) - add more inline comments for documentation - rename identifiers to make more readable: - `cy_hor_diff` → `y_max_hor_some` (because the ymax gets passed) - `lines` → `seps` - `y_type_2` → `y_mid` - `y_diff_type_2` → `y_max` - `y_lines_by_order` → `y_mid_by_order` - `y_lines_without_mother` → `y_mid_without_mother` - `y_lines_with_child_without_mother` → `y_mid_with_child_without_mother` - `y_column` → `y_mid_column` - `y_column_nc` → `y_mid_column_nc` - `y_all_between_nm_wc` → `y_mid_between_nm_wc` - `lines_so_close_to_top_separator` → `seps_too_close_to_top_separator` - `y_in_cols` and `y_down` → `y_mid_next` - use `pairwise()` `nc_top:nc_bot` instead of `i_c` indexing --- src/eynollah/utils/__init__.py | 480 +++++++++++++++++---------------- 1 file changed, 247 insertions(+), 233 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4046396..eca96f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1599,19 +1599,31 @@ def 
return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') - # def dbg_plt(box=None, title=None): - # if box is None: - # box = [None, None, None, None] - # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + + # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): + # minx, maxx, miny, maxy = box or (0, None, 0, None) + # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) # xrange = np.arange(0, img.shape[1], 100) # yrange = np.arange(0, img.shape[0], 100) - # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) - # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # ax = plt.gca() + # ax.set_xticks(xrange) + # ax.set_yticks(yrange) + # ax.set_xticklabels(xrange + minx) + # ax.set_yticklabels(yrange + miny) + # def format_coord(x, y): + # return 'x={:g}, y={:g}'.format(x + minx, y + miny) + # ax.format_coord = format_coord # if title: # plt.title(title) + # if rectangles: + # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): + # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # if rectangles_showidx: + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') # plt.show() - # dbg_plt() + # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") boxes=[] peaks_neg_tot_tables = [] @@ -1619,9 +1631,7 @@ def return_boxes_of_images_by_order_of_reading_new( width_tot = regions_without_separators.shape[1] for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') - # dbg_plt([None, None, top, bot], - # "image cut for y split %d:%d" % ( - # top, bot)) + # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) @@ -1677,20 +1687,21 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - if right < peaks_neg_fin_early[-1]: - peaks_neg_fin_rev.append(right) peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) - + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) #print(peaks_neg_fin,'peaks_neg_fin') except: logger.exception("cannot find peaks consistent with columns") @@ -1700,7 +1711,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] if right2left_readingorder: x_max_hor_some_new = width_tot - x_min_hor_some @@ -1708,136 +1719,121 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some 
=list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) - reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ - y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ - y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) - all_columns = set(range(len(peaks_neg_tot) - 1)) - # print("all_columns", all_columns) + #print("all_columns", all_columns) + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) + if (reading_order_type == 1 or - len(y_lines_without_mother) >= 2 or + len(y_mid_without_mother) >= 2 or there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with try: y_grenze = top + 300 - #check if there is a big separator in this y_mains_sep_ohne_grenzen + up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys=np.arange(len(y_type_2)) + args_early_ys=np.arange(len(y_mid)) #print(args_early_ys,'args_early_ys') - #print(top, bot) + #print(y_mid,'y_mid') - x_starting_up = x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - if len(y_type_2_up) > 0: - y_main_separator_up = y_type_2_up [(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - args_main_to_deleted = args_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_diff_main_separator_up) > 0: + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
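The full-width check that follows is just a pair of boolean masks over column indices: with peaks_neg_tot holding the column boundaries (0 and the page width included), a separator covers the whole width iff it starts at column 0 and ends at the last boundary index. A toy example with invented numbers:

    import numpy as np

    peaks_neg_tot = np.array([0, 400, 800, 1200])   # boundaries of 3 columns
    x_starting = np.array([0, 0, 1])                # start column of each sep
    x_ending   = np.array([3, 2, 3])                # end column of each sep
    y_mid      = np.array([120, 640, 150])

    top = 100
    y_grenze = top + 300
    up = (y_mid > top) & (y_mid <= y_grenze)
    full_width = (x_starting == 0) & (x_ending == len(peaks_neg_tot) - 1)
    print(np.flatnonzero(up & full_width))          # [0]
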
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, y_diff_main_separator_up.max()]) - # dbg_plt(boxes[-1], "first box") - top = y_diff_main_separator_up.max() + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() #print(top,'top') - y_type_2 = y_type_2[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] - y_diff_type_2 = y_diff_type_2[args_to_be_kept] + y_max = y_max[args_to_be_kept] #print('galdiha') y_grenze = top + 200 - args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') + else: + args_early_ys2 = args_early_ys + args_up2 = args_up - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - elif len(y_diff_main_separator_up)==0: - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in2') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - #print(args_early_ys,'args_early_ys') - #print(args_up,'args_up') - args_to_be_kept2=np.array(list( set(args_early_ys) - 
set(args_up) )) - - #print(args_to_be_kept2,'args_to_be_kept2') - #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2)) - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] #int(top) - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if (reading_order_type == 1 or len(x_end_with_child_without_mother) == 0): if reading_order_type == 1: - y_lines_by_order.append(top) + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1845,31 +1841,32 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - ind_args=np.arange(len(y_type_2)) - #ind_args=np.array(ind_args) + ind_args=np.arange(len(y_mid)) #print(ind_args,'ind_args') for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: @@ -1880,93 +1877,113 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = 
np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_with_child_no_mothers = set() + columns_covered_by_mothers_with_child = set() for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers.update( + columns_covered_by_mothers_with_child.update( range(x_start_with_child_without_mother[dj], x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list( - all_columns - columns_covered_by_with_child_no_mothers) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) - ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) - for i_s_nc in columns_not_covered_child_no_mother: + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") #print("i_s_nc", i_s_nc) x_end_biggest_column = \ - x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] - args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & - (x_ending==x_end_biggest_column)] - y_column_nc = y_type_2[args_all_biggest_lines] - #x_start_column_nc = x_starting[args_all_biggest_lines] - #x_end_column_nc = x_ending[args_all_biggest_lines] - y_column_nc = np.sort(y_column_nc) - for i_c in range(len(y_column_nc)): + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): #print("i_c", i_c) - 
ind_all_lines_between_nm_wc = \ - ind_args[(y_type_2 > y_column_nc[i_c]) & - (y_type_2 < (y_column_nc[i_c+1] - if i_c < len(y_column_nc)-1 - else bot)) & + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & (x_starting >= i_s_nc) & (x_ending <= x_end_biggest_column)] - y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] columns_covered_by_mothers = set() - for dj in range(len(ind_all_lines_between_nm_wc)): + for dj in range(len(ind_all_seps_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") - if len(ind_all_lines_between_nm_wc): + if len(ind_all_seps_between_nm_wc): biggest = np.argmax(x_ending_all_between_nm_wc - x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") if columns_covered_by_mothers == set( range(x_starting_all_between_nm_wc[biggest], x_ending_all_between_nm_wc[biggest])): - # biggest accounts for all columns alone, - # longest line should be extended - lines_so_close_to_top_separator = \ - ((y_all_between_nm_wc > y_column_nc[i_c]) & - (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) - if (np.count_nonzero(lines_so_close_to_top_separator) and - np.count_nonzero(lines_so_close_to_top_separator) < - len(ind_all_lines_between_nm_wc)): - y_all_between_nm_wc = \ - y_all_between_nm_wc[~lines_so_close_to_top_separator] + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, 
x_end_biggest_column) else: - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) if len(columns_not_covered): - y_all_between_nm_wc = np.append( - y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) x_starting_all_between_nm_wc = np.append( x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) x_ending_all_between_nm_wc = np.append( @@ -1977,52 +1994,53 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_all_between_nm_wc[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - #print(column,'column') + #print(i_s_nc,'column not covered by mothers with child') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] + y_mid_itself = y_mid_by_order[il] x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda') - y_down = y_in_cols.min(initial=bot) #print('burda2') - #print(y_in_cols,'y_in_cols') - #print(y_itself,'y_itself') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + y_mid_next = 
y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot assign boxes") @@ -2030,20 +2048,21 @@ def return_boxes_of_images_by_order_of_reading_new( top, bot]) # dbg_plt(boxes[-1], "fallback box") else: - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - columns_covered_by_lines_covered_more_than_2col = set() + columns_covered_by_seps_covered_more_than_2col = set() for dj in range(len(x_starting)): if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_lines_covered_more_than_2col.update( + columns_covered_by_seps_covered_more_than_2col.update( range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2055,53 +2074,52 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, x_ending[0]) else: columns_not_covered = list(all_columns) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - ind_args = np.arange(len(y_type_2)) - + ind_args = np.arange(len(y_mid)) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = 
y_lines_by_order[il] - #print(y_itself,'y_itself') + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda2') - #print(y_in_cols,'y_in_cols') - y_down = y_in_cols.min(initial=bot) - #print(y_down,'y_down') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] @@ -2119,11 +2137,7 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_tot_tables = peaks_neg_tot_tables_new # show final xy-cut - # plt.imshow(regions_without_separators) - # for xmin, xmax, ymin, ymax in boxes: - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.show() + # dbg_plt(None, "final XY-Cut", boxes, True) logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 66a0e55e49e4224e38c9792d06d2468c7fe8fe90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:15:13 +0200 Subject: [PATCH 17/32] `return_boxes_of_images_by_order_of_reading_new`: avoid oversplits when y slice (`top:bot`) is not a significant part of the page, viz. less than 22% (as in `find_number_of_columns_in_document`), avoid forcing `find_num_col` to reach `num_col_classifier` (allows large headers not to be split up and thus better ordered) --- src/eynollah/utils/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index eca96f3..2017cea 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1628,7 +1628,8 @@ def return_boxes_of_images_by_order_of_reading_new( boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - width_tot = regions_without_separators.shape[1] + height_tot, width_tot = regions_without_separators.shape + big_part = 22 * height_tot // 100 # percent height for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1644,12 +1645,17 @@ def return_boxes_of_images_by_order_of_reading_new( try: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) 
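# Editor's note: a minimal standalone sketch (not part of the patch) of the
# slice-height heuristic described in the PATCH 17 commit message above. The
# function name `expected_num_col` and the example numbers are invented; the 22%
# threshold and the fall-back to a single expected column come from the
# `big_part` lines in the hunk just above.
def expected_num_col(top, bot, height_tot, num_col_classifier):
    big_part = 22 * height_tot // 100   # same integer arithmetic as `big_part` above
    if bot - top >= big_part:
        return num_col_classifier       # large slice: expect the classifier's column count
    return 1                            # small slice (heading etc.): do not force extra splits

# e.g. a 300 px heading strip on a 3000 px page classified as 4 columns:
# expected_num_col(0, 300, 3000, 4) == 1, so find_num_col is not pushed to oversplit it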
except: peaks_neg_fin=[] num_col = 0 try: - if (len(peaks_neg_fin)+1)= big_part): # found too few columns here #print('burda') peaks_neg_fin_org = np.copy(peaks_neg_fin) From 3ebbc2d693ae14a640c3cb478b6a01cd1e42efb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:30:39 +0200 Subject: [PATCH 18/32] `return_boxes_of_images_by_order_of_reading_new`: indent (by removing unnecessary conditional) --- src/eynollah/utils/__init__.py | 843 ++++++++++++++++----------------- 1 file changed, 421 insertions(+), 422 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 2017cea..f30d55e 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1641,241 +1641,204 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - if True: - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) - except: - peaks_neg_fin=[] - num_col = 0 - try: - if ((len(peaks_neg_fin) + 1 < num_col_classifier or - num_col_classifier == 6) and - # we do not expect to get all columns in small parts (headings etc.): - bot - top >= big_part): - # found too few columns here - #print('burda') - peaks_neg_fin_org = np.copy(peaks_neg_fin) - #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) - #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] + try: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) + except: + peaks_neg_fin=[] + num_col = 0 + try: + if ((len(peaks_neg_fin) + 1 < num_col_classifier or + num_col_classifier == 6) and + # we do not expect to get all columns in small parts (headings etc.): + bot - top >= big_part): + # found too few columns here + #print('burda') + peaks_neg_fin_org = np.copy(peaks_neg_fin) + #print("peaks_neg_fin_org", peaks_neg_fin_org) + if len(peaks_neg_fin)==0: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + num_col_classifier, tables, multiplier=3.) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] - #print(peaks_neg_fin_early,'burda2') - peaks_neg_fin_rev=[] - for left, right in pairwise(peaks_neg_fin_early): - # print("%d:%d" % (left, right), 'i_n') - # dbg_plt([left, right, top, bot], - # "image cut for y split %d:%d / x gap %d:%d" % ( - # top, bot, left, right)) - # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) - # plt.title("vertical projection (sum over y)") - # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) 
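# Editor's note: a small self-contained sketch (not part of the patch) of how the
# per-gap re-detection loop above walks the preliminary column boundaries. Here
# `pairwise` is itertools.pairwise (Python 3.10+); the original code presumably
# imports or defines an equivalent helper. The boundary values are invented.
from itertools import pairwise

peaks_neg_fin_early = [0, 350, 700, 1049]           # x boundaries incl. both page edges
for left, right in pairwise(peaks_neg_fin_early):   # (0, 350), (350, 700), (700, 1049)
    # each gap gets two find_num_col passes (multiplier 7. and 5.); whichever pass
    # yields more peaks is kept, and `left` is added to map them back to page coordinates
    print("re-scan x range %d:%d" % (left, right))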
- except: - peaks_neg_fin2 = [] - if len(peaks_neg_fin1) >= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 - else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') - - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) + except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev + else: + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - all_columns = set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) + if right2left_readingorder: + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some + x_min_hor_some 
=list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
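# Editor's note: a standalone illustration (values invented, not part of the patch)
# of the boolean mask assigned to `main_separator` just below: a horizontal separator
# counts as full-width iff it starts at column 0 and its end index equals
# len(peaks_neg_tot) - 1, i.e. it reaches the last column boundary.
import numpy as np

peaks_neg_tot = np.array([0, 400, 800, 1200])  # 4 boundaries -> 3 columns, last index is 3
x_starting_up = np.array([0, 1, 0])            # start column of each candidate separator
x_ending_up   = np.array([3, 2, 2])            # end column index of each candidate separator
main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1)
print(main_separator)                          # -> [ True False False]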
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() + if (reading_order_type == 1 or + len(y_mid_without_mother) >= 2 or + there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with + try: + y_grenze = top + 300 + up = (y_mid > top) & (y_mid <= y_grenze) - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] + args_early_ys=np.arange(len(y_mid)) + #print(args_early_ys,'args_early_ys') + #print(y_mid,'y_mid') - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): + args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) + #print(args_to_be_kept,'args_to_be_kept') + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + #print(top,'top') + y_mid = y_mid[args_to_be_kept] + x_starting = x_starting[args_to_be_kept] + x_ending = x_ending[args_to_be_kept] + y_max = y_max[args_to_be_kept] - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - 
ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + #print('galdiha') + y_grenze = top + 200 + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] + + #int(top) + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) + x_start_by_order.append(0) + x_end_by_order.append(len(peaks_neg_tot)-2) + else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1895,212 +1858,170 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = 
y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - 
ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + ind_args=np.arange(len(y_mid)) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print('babali2') + 
#print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') + columns_covered_by_mothers = set() + for dj in range(len(x_start_without_mother)): + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * top) ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + x_starting = np.append(x_starting, x_start_without_mother) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, x_end_without_mother) - ind_args = np.arange(len(y_mid)) + columns_covered_by_mothers_with_child = set() + for dj in range(len(x_end_with_child_without_mother)): + columns_covered_by_mothers_with_child.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) + #indexes_to_be_spanned=[] + for i_s in range(len(x_end_with_child_without_mother)): + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: + if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") + #print("i_s_nc", i_s_nc) + x_end_biggest_column = \ + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): + #print("i_c", i_c) + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) 
& + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] + columns_covered_by_mothers = set() + for dj in range(len(ind_all_seps_between_nm_wc)): + columns_covered_by_mothers.update( + range(x_starting_all_between_nm_wc[dj], + x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + child_columns = set(range(i_s_nc, x_end_biggest_column)) + columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + if len(ind_all_seps_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] + x_starting_all_between_nm_wc = \ + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] + x_ending_all_between_nm_wc = \ + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] + + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) + else: + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + + if len(columns_not_covered): + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + + ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) + for column in range(int(i_s_nc), int(x_end_biggest_column)): + 
ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] + x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + else: + #print(i_s_nc,'column not covered by mothers with child') + ind_args_in_col=ind_args[x_starting==i_s_nc] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) # create single-column boxes from multi-column separators y_mid_by_order = np.array(y_mid_by_order) @@ -2109,23 +2030,101 @@ def return_boxes_of_images_by_order_of_reading_new( for il in range(len(y_mid_by_order)): #print(il, "il") y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') + #print('burda') #print('burda2') y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & (column >= x_start_by_order) & (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') y_mid_next = y_mid_next.min(initial=bot) #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_mid_itself, y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + except: + logger.exception("cannot assign boxes") + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + top, bot]) + # dbg_plt(boxes[-1], "fallback box") + else: + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + columns_covered_by_seps_covered_more_than_2col = set() + for dj in range(len(x_starting)): + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_seps_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + if len(new_main_sep_y) > 0: + x_starting = np.append(x_starting, 0) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + else: + x_starting = np.append(x_starting, x_starting[0]) + x_ending = np.append(x_ending, x_ending[0]) + else: + columns_not_covered = list(all_columns) + y_mid = np.append(y_mid, 
np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + + ind_args = np.arange(len(y_mid)) + + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) + for il in range(len(y_mid_by_order)): + #print(il, "il") + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] + for column in range(x_start_itself, x_end_itself+1): + #print(column,'cols') + #print('burda2') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[column+1], + y_mid_itself, + y_mid_next]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) if right2left_readingorder: peaks_neg_tot_tables_new = [] From a2a9fe51175cfd11bc62d1e917bf79b299a7846e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:35:04 +0200 Subject: [PATCH 19/32] `delete_separator_around`: simplify, eynollah: identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use array instead of list operations - rename identifiers: - `pixel` → `label` - `line` → `sep` --- src/eynollah/eynollah.py | 104 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 08ffed7..eee3777 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2669,45 +2669,35 @@ class Eynollah: return layout_org, contours_new - def delete_separator_around(self, spliter_y,peaks_neg,image_by_region, pixel_line, pixel_table): + def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table): # format of subboxes: box=[x1, x2 , y1, y2] pix_del = 100 - if len(image_by_region.shape)==3: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_line] = 0 - - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_table] = 0 - 
else: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_line] = 0 - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_table] = 0 + for i in range(len(splitter_y)-1): + for j in range(1,len(peaks_neg[i])-1): + where = np.index_exp[splitter_y[i]: + splitter_y[i+1], + peaks_neg[i][j] - pix_del: + peaks_neg[i][j] + pix_del, + :] + if image_by_region.ndim < 3: + where = where[:2] + else: + print("image_by_region ndim is 3!") # rs + image_by_region[where][image_by_region[where] == label_seps] = 0 + image_by_region[where][image_by_region[where] == label_table] = 0 return image_by_region def add_tables_heuristic_to_layout( self, image_regions_eraly_p, boxes, - slope_mean_hor, spliter_y, peaks_neg_tot, image_revised, - num_col_classifier, min_area, pixel_line): + slope_mean_hor, splitter_y, peaks_neg_tot, image_revised, + num_col_classifier, min_area, label_seps): - pixel_table =10 - image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table) + label_table =10 + image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table) try: - image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0 - image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0 + image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0 + image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0 except: pass boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2718,7 +2708,7 @@ class Eynollah: _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - if indiv==pixel_table: + if indiv==label_table: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.001) else: @@ -2734,11 +2724,11 @@ class Eynollah: box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1 = (image_box == pixel_table) * 1 + image_box_tabels_1 = (image_box == label_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 - image_box_tabels_and_m_text = ( (image_box == pixel_table) | + image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( (image_box == label_table) | (image_box == 1) ).astype(np.uint8) * 1 image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) @@ -2800,7 +2790,7 @@ class Eynollah: y_up_tabs=[] for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2811,14 +2801,14 @@ class Eynollah: image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - 
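The rewritten delete_separator_around above collapses the duplicated 2-D/3-D branches into a single np.index_exp slice whose channel axis is dropped when the input has none. A minimal, self-contained sketch of that trick (the helper name, label values and toy data are illustrative, not code from the module):

import numpy as np

def zero_labels_around(image, splitter_y, peaks_neg, labels=(3, 10), pix_del=100):
    # Erase the given label values in a +/- pix_del strip around every inner
    # column peak, within each horizontal band between consecutive splitters.
    # Handles 2-D label maps and 3-D (H, W, C) images with the same loop body.
    for i in range(len(splitter_y) - 1):
        for j in range(1, len(peaks_neg[i]) - 1):
            where = np.index_exp[int(splitter_y[i]):int(splitter_y[i + 1]),
                                 peaks_neg[i][j] - pix_del:peaks_neg[i][j] + pix_del,
                                 :]
            if image.ndim < 3:
                where = where[:2]        # drop the channel slice for 2-D input
            view = image[where]          # basic slicing yields a writable view
            for label in labels:
                view[view == label] = 0
    return image

img = np.zeros((200, 100), dtype=np.uint8)
img[:, 48:52] = 3                        # a separator label near the peak at x=50
zero_labels_around(img, [0, 200], [[0, 50, 100]], pix_del=5)
assert not img.any()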
image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table return image_revised_last def get_tables_from_model(self, img, num_col_classifier): @@ -3153,14 +3143,14 @@ class Eynollah: text_regions_p_1_n = None textline_mask_tot_d = None regions_without_separators_d = None - pixel_lines = 3 + label_seps = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3175,7 +3165,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3187,17 +3177,17 @@ class Eynollah: else: text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[(table_prediction == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) #print(time.time()-t_0_box,'time box in 3.2') img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3210,11 +3200,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3333,14 +3323,14 @@ class Eynollah: regions_without_separators = (text_regions_p[:,:] == 1)*1 regions_without_separators[table_prediction == 1] = 1 - pixel_lines=3 + label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = 
find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3359,10 +3349,10 @@ class Eynollah: num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) @@ -3374,11 +3364,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -4721,12 +4711,12 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + boxes, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) From 3367462d181bca16316e84957299e0abb08ec0d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:46:46 +0200 Subject: [PATCH 20/32] `return_boxes_of_images_by_order_of_reading_new`: change arg order --- src/eynollah/utils/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f30d55e..a163fad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,7 +33,7 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. @@ -54,10 +54,10 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( with no mother, specifically (and thus, no simple box separation is possible). 
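Concretely, the overlap analysis above maps every raw separator's x extent to a pair of column indices via the column boundary coordinates (peak_points); only spans of two or more columns are kept. A small illustrative sketch of that mapping (the function name and the boundary example are assumptions, not code from the module):

import numpy as np

def column_span(x_min, x_max, peak_points):
    # peak_points: sorted x coordinates of the column boundaries,
    # including the outer page edges (0 and the page width)
    min_start = np.flatnonzero(x_min - peak_points >= 0)[-1]   # last boundary left of x_min
    max_end = np.flatnonzero(x_max - peak_points <= 0)[0]      # first boundary right of x_max
    return min_start, max_end        # separator covers columns min_start .. max_end-1

peaks = np.array([0, 300, 600, 900])            # a 3-column page
print(column_span(50, 280, peaks))              # (0, 1): single column, ignored
print(column_span(50, 580, peaks))              # (0, 2): spans columns 0 and 1, kept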
Arguments: + * the x column coordinates * the x start column index of the raw separators * the x end column index of the raw separators * the y center coordinate of the raw separators - * the x column coordinates * the y end coordinate of the raw separators Returns: @@ -1736,7 +1736,7 @@ def return_boxes_of_images_by_order_of_reading_new( there_is_sep_with_child, \ y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) # show multi-column separators # dbg_plt([0, None, top, bot], "multi-column separators in current split", From 19b2c3fa424f8750e093a2fb88d7e6e381daeaab Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:51:19 +0200 Subject: [PATCH 21/32] reading order: improve handling of headings and horizontal seps - drop connected components analysis to test overlaps between horizontal separators and (horizontal) neighbours (introduced in ab17a927) - instead of converting headings to topline and baseline during `find_number_of_columns_in_document` (introduced in 9f1595d7), add them to the matrix unchanged, but mark as extra type (besides horizontal and vertical separtors) - convert headings to toplines and baselines no earlier than in `return_boxes_of_images_by_order_of_reading_new` - for both headings and horizontal separators, if they already span multiple columns, check if they would overlap (horizontal) neighbours by looking at successively larger (left and right) intervals of columns (and pick the largest elongation which does not introduce any overlaps) --- src/eynollah/utils/__init__.py | 127 +++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 47 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index a163fad..f3dbae2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1387,8 +1387,6 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): - ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) - separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1414,14 +1412,6 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) - labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) - if len(labels) == 1: - # mid line does not intersect with any other region - # so add it as extra splitter line - cnts_hor_e.append(np.array([[[0, med_ye]], - [[ccomps.shape[1], med_ye]], - [[ccomps.shape[1], med_ye + 1]], - [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) @@ -1493,7 +1483,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int) 
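For reference, the consumer of this matrix later splits every heading row (type 2 in column 9) back into a topline and a baseline separator, shrunk by 30 px horizontally so they cannot cross a column peak by accident (see the change to return_boxes_of_images_by_order_of_reading_new further below). A condensed sketch of that split, assuming the column layout 2=x_min, 3=x_max, 5=cy, 6=y_min, 7=y_max, 9=type as read from this diff:

import numpy as np

def split_headings(matrix_of_seps):
    # rows: [idx, cx, x_min, x_max, dx, cy, y_min, y_max, dy, type]
    # type: 0 = horizontal separator, 1 = vertical separator, 2 = heading
    hor = matrix_of_seps[matrix_of_seps[:, 9] == 0]
    head = matrix_of_seps[matrix_of_seps[:, 9] == 2]
    x_min = np.append(hor[:, 2], np.tile(head[:, 2] + 30, 2))
    x_max = np.append(hor[:, 3], np.tile(head[:, 3] - 30, 2))
    y_mid = np.append(hor[:, 5], np.concatenate((head[:, 6] - 2, head[:, 7] - 2)))
    y_max = np.append(hor[:, 7], np.concatenate((head[:, 6] + 2, head[:, 7] + 2)))
    return x_min, x_max, y_mid, y_max

mat = np.array([[0, 0, 100, 900, 800, 400, 395, 405, 10, 0],    # plain horizontal separator
                [1, 0, 120, 880, 760, 200, 180, 220, 40, 2]])   # heading
print(split_headings(mat))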
matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver @@ -1515,34 +1505,17 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) - # args_head = np.arange(len(cy_head)) - # matrix_l_n[:, 0] = args_head - # matrix_l_n[:, 2] = x_min_head+30 - # matrix_l_n[:, 3] = x_max_head-30 - # matrix_l_n[:, 4] = dist_x_head - # matrix_l_n[:, 5] = y_min_head-3-8 - # matrix_l_n[:, 6] = y_min_head-5-8 - # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 - # matrix_l_n[:, 8] = 4 - # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): - cy_head = np.stack((y_min_head, y_max_head)).T.flatten() - y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), - np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) - x_min_head = np.repeat(x_min_head, 2) - x_max_head = np.repeat(x_max_head, 2) - dist_x_head = np.repeat(dist_x_head, 2) - matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - # +/- 30px to avoid crossing col peaks by accident - matrix_l_n[:, 2] = x_min_head + 30 - matrix_l_n[:, 3] = x_max_head - 30 + matrix_l_n[:, 2] = x_min_head + matrix_l_n[:, 3] = x_max_head matrix_l_n[:, 4] = dist_x_head matrix_l_n[:, 5] = cy_head matrix_l_n[:, 6] = y_min_head matrix_l_n[:, 7] = y_max_head - matrix_l_n[:, 8] = 4 + matrix_l_n[:, 8] = y_max_head - y_min_head + matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) @@ -1551,9 +1524,12 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cy_seps_splitters = np.append(cy_seps_splitters, special_separators) if contours_h is not None: - cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) + y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) + cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head) cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] @@ -1713,17 +1689,6 @@ def return_boxes_of_images_by_order_of_reading_new( #num_col, peaks_neg_fin = find_num_col( # regions_without_separators[top:bot,:], # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = np.array([0] + 
peaks_neg_fin + [width_tot]) #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) @@ -1731,6 +1696,74 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = set(range(len(peaks_neg_tot) - 1)) #print("all_columns", all_columns) + # elongate horizontal separators+headings as much as possible without overlap + args_nonver = matrix_new[:, 9] != 1 + regions_with_separators = np.copy(regions_without_separators[top:bot]) + for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: + regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() + for i in np.flatnonzero(args_nonver): + xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] + cut = regions_with_separators[ymin - top: ymax - top] + # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) + starting = xmin - peaks_neg_tot + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = xmax - peaks_neg_tot + max_end = np.flatnonzero(ending < 0)[0] # first right-of + # skip elongation unless this is already a multi-column separator/heading: + if not max_end - min_start > 1: + continue + # is there anything left of min_start? + for j in range(min_start): + # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) + if not np.any(cut[:, peaks_neg_tot[j]: xmin]): + # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) + matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column + break + # is there anything right of max_end? 
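The elongation loop introduced here widens each multi-column separator or heading over any neighbouring columns that are empty within its y band, trying the widest candidate first and stopping at the first obstacle. A self-contained sketch of that test (names and the toy page are illustrative; x_min/x_max are assumed to lie strictly between the outermost peaks):

import numpy as np

def elongate_separator(occupied, peaks, x_min, x_max, y_min, y_max):
    # occupied: 2-D mask of everything the separator must not cross
    # peaks: sorted column boundary x coordinates (incl. 0 and the page width)
    band = occupied[y_min:y_max]
    min_start = np.flatnonzero(x_min - peaks >= 0)[-1]
    max_end = np.flatnonzero(x_max - peaks < 0)[0]
    if max_end - min_start <= 1:
        return x_min, x_max                        # single-column: leave as is
    for j in range(min_start):                     # widest left extension first
        if not band[:, peaks[j]:x_min].any():
            x_min = peaks[j] + 1
            break
    for j in range(len(peaks) - 1, max_end, -1):   # widest right extension first
        if not band[:, x_max:peaks[j]].any():
            x_max = peaks[j] - 1
            break
    return x_min, x_max

occupied = np.zeros((100, 900), dtype=bool)
occupied[0:30, 700:800] = True                     # text above the band only
peaks = np.array([0, 300, 600, 900])
print(elongate_separator(occupied, peaks, 20, 580, 40, 60))   # -> (20, 899)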
+ for j in range(len(peaks_neg_tot) - 1, max_end, -1): + # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j)) + if not np.any(cut[:, xmax: peaks_neg_tot[j]]): + # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j]) + matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column + break + + args_hor = matrix_new[:, 9] == 0 + x_min_hor_some = matrix_new[:, 2][args_hor] + x_max_hor_some = matrix_new[:, 3][args_hor] + y_max_hor_some = matrix_new[:, 7][args_hor] + cy_hor_some = matrix_new[:, 5][args_hor] + + args_head = matrix_new[:, 9] == 2 + x_min_hor_head = matrix_new[:, 2][args_head] + x_max_hor_head = matrix_new[:, 3][args_head] + y_min_hor_head = matrix_new[:, 6][args_head] + y_max_hor_head = matrix_new[:, 7][args_head] + cy_hor_head = matrix_new[:, 5][args_head] + + # split headings at toplines (y_min_head) and baselines (y_max_head) + # instead of merely adding their center (cy_head) as horizontal separator + # (x +/- 30px to avoid crossing col peaks by accident) + x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) + x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) + y_max_hor_some = np.append(y_max_hor_some, # baselines + np.concatenate((y_min_hor_head + 2, + y_max_hor_head + 2))) + cy_hor_some = np.append(cy_hor_some, # toplines + np.concatenate((y_min_hor_head - 2, + y_max_hor_head - 2))) + + if right2left_readingorder: + x_max_hor_some = width_tot - x_min_hor_some + x_min_hor_some = width_tot - x_max_hor_some + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ there_is_sep_with_child, \ From 1a76ce177dba69aa711b74e6c69022e4a5ebf27f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 02:07:20 +0100 Subject: [PATCH 22/32] do_order_of_regions: round contour centers (so we can be sure they do not fall through the "pixel cracks": bboxes are delimited by integers, and we do not want to assign contours between boxes) --- src/eynollah/eynollah.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index eee3777..35b0a37 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2491,11 +2491,15 @@ class Eynollah: contours_only_text_parent) cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) + cx_main = np.array(cx_main, dtype=int) + cy_main = np.array(cy_main, dtype=int) + cx_head = np.array(cx_head, dtype=int) + cy_head = np.array(cy_head, dtype=int) def match_boxes(only_centers: bool): arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_main[ii] >= box[0] and cx_main[ii] < box[1] and @@ -2506,22 +2510,23 @@ class Eynollah: my_main[ii] >= box[2] and My_main[ii] < box[3])): arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) + box_found = True + # print("main/matched ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - 
np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min - #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) + # print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_head[ii] >= box[0] and cx_head[ii] < box[1] and @@ -2532,16 +2537,17 @@ class Eynollah: my_head[ii] >= box[2] and My_head[ii] < box[3])): arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) + box_found = True + # print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min - #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) + # print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) From 95f76081d1de4611d3007ef14a342d7dbb530584 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 02:22:39 +0100 Subject: [PATCH 23/32] rename some more identifiers: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `lines` → `seps` (to distinguish from textlines) - `text_regions_p_1_n` → `text_regions_p_d` (because all other deskewed variables are called like this) - `pixel` → `label` --- src/eynollah/eynollah.py | 178 +++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 89 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 35b0a37..2bdb2c7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2091,19 +2091,19 @@ class Eynollah: prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, 
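The match_boxes fallback above (patch 22) assigns a region that is not fully inside any reading-order box to the nearest box that at least contains its rounded center; masking the distance array keeps the argmin restricted to containing boxes. A small illustrative sketch (the helper and the toy boxes are assumptions, not code from the module):

import numpy as np

def assign_to_box(cx, cy, boxes, box_centers):
    # boxes: (N, 4) int array of [x_start, x_end, y_start, y_end]
    # box_centers: (2, N) array of the box centers as (y, x)
    contained = ((boxes[:, 2] <= cy) & (cy < boxes[:, 3]) &
                 (boxes[:, 0] <= cx) & (cx < boxes[:, 1]))
    assert contained.any(), (cx, cy)   # rounded centers cannot fall between integer box edges
    dists = np.linalg.norm(box_centers - np.array([[cy], [cx]]), axis=0)
    return int(np.argmin(np.ma.masked_array(dists, ~contained)))

boxes = np.array([[0, 300, 0, 500], [300, 600, 0, 500]])
centers = np.stack([(boxes[:, 2] + boxes[:, 3]) / 2,
                    (boxes[:, 0] + boxes[:, 1]) / 2])
print(assign_to_box(310, 100, boxes, centers))     # -> 1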
hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1,1,1)) @@ -2282,7 +2282,7 @@ class Eynollah: img_bin = resize_image(img_bin, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_texts_only = mask_texts_only.astype('uint8') @@ -2293,7 +2293,7 @@ class Eynollah: mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) @@ -2307,7 +2307,7 @@ class Eynollah: #plt.show() polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) @@ -2318,10 +2318,10 @@ class Eynollah: polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2377,7 +2377,7 @@ class Eynollah: prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) - mask_lines2 = (prediction_regions_org2[:,:,0] == 3) + mask_seps2 = (prediction_regions_org2[:,:,0] == 3) text_sume_early = (prediction_regions_org[:,:] == 1).sum() prediction_regions_org_copy = np.copy(prediction_regions_org) prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 @@ -2388,8 +2388,8 @@ 
class Eynollah: if not(is_image_enhanced and rate_two_models < RATIO_OF_TWO_MODEL_THRESHOLD): prediction_regions_org = np.copy(prediction_regions_org_copy) - prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + prediction_regions_org[(mask_seps2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 prediction_regions_org = cv2.erode(prediction_regions_org[:,:], KERNEL, iterations=2) prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], KERNEL, iterations=2) @@ -2411,20 +2411,20 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only, 1, 0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_lines, color=(3, 3, 3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_seps, color=(3, 3, 3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) @@ -2449,7 +2449,7 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - #mask_lines_only=(prediction_regions_org[:,:]==3)*1 + #mask_seps_only=(prediction_regions_org[:,:]==3)*1 #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) #prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -2457,19 +2457,19 @@ class Eynollah: #prediction_regions_org = prediction_regions_org[:,:,0] #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 - mask_lines_only = (prediction_regions_org == 3)*1 + mask_seps_only = (prediction_regions_org == 3)*1 mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = 
return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2952,8 +2952,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -2979,7 +2979,7 @@ class Eynollah: self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light) def run_graphics_and_columns_without_layout(self, textline_mask_tot_ea, img_bin_light): @@ -3029,8 +3029,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -3046,7 +3046,7 @@ class Eynollah: except Exception as why: self.logger.exception(why) num_col = None - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction) def run_enhancement(self, light_version): @@ -3101,13 +3101,13 @@ class Eynollah: return slope_deskew def run_marginals( - self, textline_mask_tot_ea, mask_images, mask_lines, + self, textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction): textline_mask_tot = textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p_1[mask_seps[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) if num_col_classifier in (1, 2): @@ -3131,12 +3131,12 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = rotation_not_90_func( + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], 
text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -3146,7 +3146,7 @@ class Eynollah: if self.tables: regions_without_separators[table_prediction ==1 ] = 1 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None label_seps = 3 @@ -3156,7 +3156,7 @@ class Eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + text_regions_p_d, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3171,7 +3171,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_seps_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3193,7 +3193,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3202,7 +3202,7 @@ class Eynollah: if self.light_version: pass else: - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3245,22 +3245,22 @@ class Eynollah: else: polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) #print(time.time()-t_0_box,'time box in 
5') self.logger.debug('exit run_boxes_no_full_layout') - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables) @@ -3276,13 +3276,13 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3290,10 +3290,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None # regions_without_separators = ( text_regions_p[:,:]==1 | text_regions_p[:,:]==2 )*1 @@ -3303,13 +3303,13 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3317,10 +3317,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None @@ -3331,12 +3331,12 @@ class Eynollah: label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3351,7 +3351,7 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, 
regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 @@ -3364,9 +3364,9 @@ class Eynollah: img_revised_tab2, table_prediction, 10, num_col_classifier) else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3399,20 +3399,20 @@ class Eynollah: text_regions_p[img_revised_tab == 10] = 10 #img_revised_tab[img_revised_tab2 == 10] = 10 - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) # set first model with second model text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 @@ -3465,16 +3465,16 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout( + _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) if not self.tables: - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None if not self.tables: @@ -3484,7 +3484,7 @@ class Eynollah: self.logger.debug('exit run_boxes_full_layout') #print("full inside 3", time.time()- t_full0) - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, 
contours_tables) @@ -4301,7 +4301,7 @@ class Eynollah: slope_deskew = self.run_deskew(textline_mask_tot_ea) #print("text region early -2,5 in %.1fs", time.time() - t0) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, @@ -4318,7 +4318,7 @@ class Eynollah: confidence_matrix = np.zeros((text_regions_p_1.shape[:2])) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) @@ -4356,12 +4356,12 @@ class Eynollah: image_page = resize_image(image_page,img_h_new, img_w_new ) textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) mask_images = resize_image(mask_images,img_h_new, img_w_new ) - mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) + mask_seps = resize_image(mask_seps, img_h_new, img_w_new) text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) textline_mask_tot, text_regions_p = \ - self.run_marginals(textline_mask_tot_ea, mask_images, mask_lines, + self.run_marginals(textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) @@ -4398,14 +4398,14 @@ class Eynollah: ## birdan sora chock chakir t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, @@ -4419,7 +4419,7 @@ class Eynollah: text_only = (img_revised_tab[:, :] == 1) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = (text_regions_p_1_n[:, :] == 1) * 1 + text_only_d = ((text_regions_p_d[:, :] == 1)) * 1 #print("text region early 2 in %.1fs", time.time() - t0) ###min_con_area = 0.000005 @@ -4695,18 +4695,18 @@ class Eynollah: label_seps = 6 if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps, 
contours_only_text_parent_h) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -4718,12 +4718,12 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: From 4abc2ff57249e634c70cda665abc5d99429595d2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:05:02 +0100 Subject: [PATCH 24/32] rewrite/simplify manual reading order using recursive algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rename `return_x_start_end_mothers_childs_and_type_of_reading_order` → `return_multicol_separators_x_start_end`, and drop all the analysis pertaining to mother/child relationships and full-span separators, also drop the separator unification rules; instead of the latter, try to combine neighbouring separators more generally: join column spans iff there is nothing in between (which also necessitates passing the region mask), and keep only one of every such redundant pair; add the top (of each page part) as full-span separator up front, and return separators already ordered by y - `return_boxes_of_images_by_order_of_reading_new`: - also pass regions with separators, so they do not have to be reconstructed from the separator coordinates, and also contain images and other non-text region types, when trying to elongate separators to maximize their span (without introducing overlaps) - determine connected components of the region mask, i.e. labels and their respective bboxes, in order to 1. gain additional multi-column separators, if possible 2. 
avoid cutting through regions which do cross column boundaries later on - whenever adding a new bbox, first look up the label map to see if there are any multi-column regions extending to the right of the current column; if there are, then advance not just one column to the right, but as many as necessary to avoid cutting through these regions - new core algorithm: iterate separators sorted by y and then column by column, but whenever the next separator ends in the same column as the current one or even further left, recurse (i.e. finish that span first before continuing with the top iteration) --- src/eynollah/utils/__init__.py | 935 ++++++++++----------------------- 1 file changed, 277 insertions(+), 658 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f3dbae2..e00004f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -32,289 +32,132 @@ def pairwise(iterable): yield a, b a = b -def return_x_start_end_mothers_childs_and_type_of_reading_order( - peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): +def return_multicol_separators_x_start_end( + regions_without_separators, peak_points, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. Ignore separators not spanning multiple columns. - For the separators to be returned, try to join them when they are directly - adjacent horizontally but nearby vertically (and thus mutually compatible). - Also, mark any separators that already span the full width. - - Furthermore, identify which pairs of (unjoined) separators span subsets of columns - of each other (disregarding vertical positions). Referring, respectively, to the - superset separators as "mothers" and to the subset separators as "children", - retrieve information on which columns are spanned by separators with no mother, - and which columns are spanned by their children (if any). - - Moreover, determine if there is any (column) overlap among the multi-span separators - with no mother, specifically (and thus, no simple box separation is possible). + For the separators to be returned, try to remove or unify them when there + is no region between them (vertically) and their neighbours. 
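The "remove or unify" criterion described here can be pictured as follows: two multi-column separators are redundant if no region pixel lies vertically between them over the union of their column spans. A simplified sketch (the helper, its tuple layout and the toy page are illustrative; the real code works on indices into peak_points and also decides which of the pair to keep):

import numpy as np

def can_unify(region_mask, peaks, sep_above, sep_below):
    # each separator: (x_start_col, x_end_col, y_min, y_max)
    x_start = min(sep_above[0], sep_below[0])
    x_end = max(sep_above[1], sep_below[1])
    y_top, y_bot = sep_above[3], sep_below[2]
    if y_bot <= y_top:
        return True                    # they already touch or overlap vertically
    gap = region_mask[y_top:y_bot, peaks[x_start]:peaks[x_end]]
    return not gap.any()               # nothing in between: one of them is redundant

mask = np.zeros((1000, 900), dtype=np.uint8)
mask[400:450, 320:580] = 1             # a text region between the two candidates
peaks = np.array([0, 300, 600, 900])
print(can_unify(mask, peaks, (0, 2, 380, 390), (0, 3, 460, 470)))   # False
print(can_unify(mask, peaks, (0, 2, 380, 390), (0, 3, 395, 398)))   # True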
Arguments: + * the text mask (with all separators suppressed) * the x column coordinates - * the x start column index of the raw separators - * the x end column index of the raw separators - * the y center coordinate of the raw separators - * the y end coordinate of the raw separators + * the y start coordinate to consider in total + * the y end coordinate to consider in total + * the x start coordinate of the horizontal separators + * the x end coordinate of the horizontal separators + * the y start coordinate of the horizontal separators + * the y center coordinate of the horizontal separators + * the y end coordinate of the horizontal separators Returns: a tuple of: - * whether any top-level (no-mother) multi-span separators overlap each other * the x start column index of the resulting multi-span separators * the x end column index of the resulting multi-span separators + * the y start coordinate of the resulting multi-span separators * the y center coordinate of the resulting multi-span separators * the y end coordinate of the resulting multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - * the x start column index of the top-level (no-mother) multi-span separators - * the x end column index of the top-level (no-mother) multi-span separators - * whether any multi-span separators have super-spans of other (child) multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - which have super-spans of other (child) multi-span separators - * the x start column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * the x end column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * indexes of multi-span separators with full-width span """ - x_start=[] - x_end=[] - len_sep=[] - y_mid=[] - y_max=[] - new_main_sep_y=[] - indexer=0 + x_start = [0] + x_end = [len(peak_points) - 1] + y_min = [top] + y_mid = [top] + y_max = [top + 2] + indexer = 1 for i in range(len(x_min_hor_some)): #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) starting = x_min_hor_some[i] - peak_points min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = x_max_hor_some[i] - peak_points - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: # column range of separator spans more than one column candidate - if (max_end-min_start)==(len(peak_points)-1): - # all columns (i.e. 
could be true new y splitter) - new_main_sep_y.append(indexer) - #print((max_end-min_start),len(peak_points),'(max_end-min_start)') + y_min.append(y_min_hor_some[i]) y_mid.append(cy_hor_some[i]) y_max.append(y_max_hor_some[i]) x_end.append(max_end) x_start.append(min_start) - len_sep.append(max_end-min_start) indexer+=1 #print(x_start,'x_start') #print(x_end,'x_end') - x_start_returned = np.array(x_start, dtype=int) - x_end_returned = np.array(x_end, dtype=int) - y_mid_returned = np.array(y_mid, dtype=int) - y_max_returned = np.array(y_max, dtype=int) - #print(y_mid_returned,'y_mid_returned') - #print(x_start_returned,'x_start_returned') - #print(x_end_returned,'x_end_returned') - - # join/elongate separators if follow-up x and similar y - sep_pairs = contours_in_same_horizon(y_mid_returned) - if len(sep_pairs): - #print('burda') - args_to_be_unified = set() - y_mid_unified = [] - y_max_unified = [] - x_start_unified = [] - x_end_unified = [] - for pair in sep_pairs: - if (not np.array_equal(*x_start_returned[pair]) and - not np.array_equal(*x_end_returned[pair]) and - # immediately adjacent columns? - np.diff(x_end_returned[pair] - - x_start_returned[pair])[0] in [1, -1]): - - args_to_be_unified.union(set(pair)) - y_mid_unified.append(np.min(y_mid_returned[pair])) - y_max_unified.append(np.max(y_max_returned[pair])) - x_start_unified.append(np.min(x_start_returned[pair])) - x_end_unified.append(np.max(x_end_returned[pair])) - #print(pair,'pair') - #print(x_start_returned[pair],'x_s_same_hor') - #print(x_end_returned[pair],'x_e_same_hor') - #print(y_mid_unified,'y_mid_unified') - #print(y_max_unified,'y_max_unified') - #print(x_start_unified,'x_s_unified') - #print(x_end_unified,'x_e_selected') - #print('#############################') - - if len(y_mid_unified): - args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), - list(args_to_be_unified), assume_unique=True) - #print(args_lines_not_unified,'args_lines_not_unified') - x_start_returned = np.append(x_start_returned[args_lines_not_unified], - x_start_unified, axis=0) - x_end_returned = np.append(x_end_returned[args_lines_not_unified], - x_end_unified, axis=0) - y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], - y_mid_unified, axis=0) - y_max_returned = np.append(y_max_returned[args_lines_not_unified], - y_max_unified, axis=0) - #print(y_mid_returned,'y_mid_returned2') - #print(x_start_returned,'x_start_returned2') - #print(x_end_returned,'x_end_returned2') - - #print(new_main_sep_y,'new_main_sep_y') - #print(x_start,'x_start') - #print(x_end,'x_end') - x_start = np.array(x_start) - x_end = np.array(x_end) - y_mid = np.array(y_mid) - if len(new_main_sep_y): - # some full-width multi-span separators exist, so - # restrict the y range of separators to search for - # mutual overlaps to only those within the largest - # y strip between adjacent multi-span separators - # that involve at least one such full-width seps. 
- # (does not affect the separators to be returned) - min_ys=np.min(y_mid) - max_ys=np.max(y_mid) - #print(min_ys,'min_ys') - #print(max_ys,'max_ys') - - y_mains0 = list(y_mid[new_main_sep_y]) - y_mains = [min_ys] + y_mains0 + [max_ys] - - y_mains = np.sort(y_mains) - argm = np.argmax(np.diff(y_mains)) - y_mid_new = y_mains[argm] - y_mid_next_new = y_mains[argm + 1] - - #print(y_mid_new,argm,'y_mid_new') - #print(y_mid_next_new,argm+1,'y_mid_next_new') - #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') - x_start=np.array(x_start) - x_end=np.array(x_end) - y_mid=np.array(y_mid) - # iff either boundary is itself not a full-width separator, - # then include it in the range of separators to be kept - if y_mid_new in y_mains0: - where = y_mid > y_mid_new - else: - where = y_mid >= y_mid_new - if y_mid_next_new in y_mains0: - where &= y_mid < y_mid_next_new - else: - where &= y_mid <= y_mid_next_new - x_start = x_start[where] - x_end = x_end[where] - y_mid = y_mid[where] + x_start = np.array(x_start, dtype=int) + x_end = np.array(x_end, dtype=int) + y_min = np.array(y_min, dtype=int) + y_mid = np.array(y_mid, dtype=int) + y_max = np.array(y_max, dtype=int) + #print(y_mid,'y_mid') #print(x_start,'x_start') #print(x_end,'x_end') - # remove redundant separators that span the same columns - # (keeping only 1 representative each) - deleted = set() - for index_i in range(len(x_start) - 1): - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(nodes_i, "nodes_i") - for index_j in range(index_i + 1, len(x_start)): - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(nodes_j, "nodes_j") - if nodes_i == nodes_j: - deleted.add(index_j) - #print(deleted,"deleted") - remained_sep_indexes = set(range(len(x_start))) - deleted - #print(remained_sep_indexes,'remained_sep_indexes') + # remove redundant separators (with nothing in between) + args_emptysep = set() + args_ysorted = np.argsort(y_mid) + for i in range(len(y_mid)): + # find nearest neighbours above with nothing in between + prev = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] >= y_mid) & + # complete subsumption: + # (x_start[i] >= x_start) & + # (x_end[i] <= x_end) + # partial overlap + (x_start[i] < x_end) & + (x_end[i] > x_start) + ) + prev[list(args_emptysep)] = False # but no pair we already saw + if not prev.any(): + continue + prev = np.flatnonzero(prev[args_ysorted]) + j = args_ysorted[prev[-1]] + if not np.any(regions_without_separators[y_max[j]: y_min[i], + peak_points[min(x_start[i], x_start[j])]: + peak_points[max(x_end[i], x_end[j])]]): + args_emptysep.add(i) + if x_start[j] > x_start[i]: + # print(j, "now starts at", x_start[i]) + x_start[j] = x_start[i] + if x_end[j] < x_end[i]: + x_end[j] = x_end[i] + # print(j, "now ends at", x_end[i]) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty prev sep") + continue + # find nearest neighbours below with nothing in between + nExt = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] <= y_mid) & + (x_start[i] >= x_start) & + (x_end[i] <= x_end)) + nExt[list(args_emptysep)] = False # but no pair we already saw + if not nExt.any(): + continue + nExt = np.flatnonzero(nExt[args_ysorted]) + j = args_ysorted[nExt[0]] + if not np.any(regions_without_separators[y_max[i]: y_min[j], + peak_points[x_start[i]]: + peak_points[x_end[i]]]): + args_emptysep.add(i) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") + args_to_be_kept = [arg for arg in args_ysorted + if not arg in 
args_emptysep] + x_start = x_start[args_to_be_kept] + x_end = x_end[args_to_be_kept] + y_min = y_min[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] + y_max = y_max[args_to_be_kept] - # determine which separators span which columns - mother = [] # whether the respective separator has a mother separator - child = [] # whether the respective separator has a child separator - for index_i in remained_sep_indexes: - have_mother=0 - have_child=0 - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - for index_j in remained_sep_indexes: - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - if nodes_i < nodes_j: - have_mother=1 - if nodes_i > nodes_j: - have_child=1 - mother.append(have_mother) - child.append(have_child) - #print(mother, "mother") - #print(child, "child") - - mother = np.array(mother) - child = np.array(child) - #print(mother,'mother') - #print(child,'child') - remained_sep_indexes = np.array(list(remained_sep_indexes)) - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - - reading_order_type = 0 - if len(remained_sep_indexes): - #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') - #print(np.array(mother),'mother') - remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] - remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)] - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - - x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] - x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - - x_end_without_mother = x_end[remained_sep_indexes_without_mother] - x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] - - if len(remained_sep_indexes_without_mother)>=2: - for i in range(len(remained_sep_indexes_without_mother)-1): - index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(index_i, nodes_i, "nodes_i without mother") - for j in range(i + 1, len(remained_sep_indexes_without_mother)): - index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(index_j, nodes_j, "nodes_j without mother") - if nodes_i - nodes_j != nodes_i: - #print("type=1") - reading_order_type = 1 - else: - y_mid_without_mother = np.zeros(0, int) - x_start_without_mother = np.zeros(0, int) - x_end_without_mother = np.zeros(0, int) - y_mid_with_child_without_mother = np.zeros(0, int) - x_start_with_child_without_mother = np.zeros(0, int) - x_end_with_child_without_mother = np.zeros(0, int) - - #print(reading_order_type,'reading_order_type') - #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') - #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') - #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') - - len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') - there_is_sep_with_child = 0 - if len_sep_with_child >= 1: - there_is_sep_with_child = 1 - - return (reading_order_type, - x_start_returned, - x_end_returned, - y_mid_returned, - y_max_returned, - y_mid_without_mother, 
- x_start_without_mother, - x_end_without_mother, - there_is_sep_with_child, - y_mid_with_child_without_mother, - x_start_with_child_without_mother, - x_end_with_child_without_mother, - new_main_sep_y) + return (x_start, + x_end, + y_min, + y_mid, + y_max) def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]: return (box[1], box[1] + box[3], @@ -1212,6 +1055,25 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) return textlines_con_changed def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): + """ + Order text region contours within a single column bbox in a top-down-left-right way. + + First, determine the vertical gaps. Then iterate over each vertical segment, + identifying the contours centered in that segment. Order them by their + horizontal center, and add them to the overall order. + + Arguments: + * textline_mask: the mask of the textline segmentation, cropped for that box + * contours_main: the paragraph text region contours expected to be here + * contours_head: the heading text region contours expected to be here + * y_ref: the vertical offset of that box within the page + * x_ref: the horizontal offset of that box within the page + + Returns: a tuple of + * the array of contour indexes overall within this box (i.e. into main+head) + * the array of types (1 for paragraph, 2 for heading) + * the array of contour indexes for the respective type (i.e. into contours_main or contours_head) + """ ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1547,7 +1409,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) - #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) + # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1564,11 +1426,36 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, - matrix_of_lines_ch, + splitter_y_new, + regions_without_separators, + regions_with_separators, + matrix_of_seps_ch, num_col_classifier, erosion_hurts, tables, right2left_readingorder, logger=None): + """ + Iterate through the vertical parts of a page, each with its own set of columns, + and from the matrix of horizontal separators for that part, find an ordered + list of bounding boxes through all columns and regions. 
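The core of the new ordering can be reduced to a few lines. A simplified sketch of the recursive traversal described in the commit message: separators are given as `(y, first_column, one_past_last_column)` tuples, pre-sorted by y, with a full-width separator for the top of the part prepended; the label-map logic that widens boxes across columns is omitted here, so each box covers exactly one column:

    def boxes_from_separators(seps, bottom):
        """Return reading-ordered (col_from, col_to, y_from, y_to) boxes."""
        args = list(range(len(seps)))
        boxes = []

        def add_sep(cur):
            y_cur, start_cur, end_cur = seps[cur]
            column = start_cur
            while column < end_cur:
                # nearest separator below the current one that covers this column
                nxt = next((i for i in range(len(seps))
                            if seps[i][0] > y_cur and seps[i][1] <= column < seps[i][2]),
                           None)
                if nxt is None:
                    # no separator below: fill the column down to the bottom
                    boxes.append((column, column + 1, y_cur, bottom))
                    column += 1
                    continue
                y_nxt, _, end_nxt = seps[nxt]
                last = min(column + 1, end_nxt, end_cur)
                boxes.append((column, last, y_cur, y_nxt))
                column = last
                if last == end_nxt and end_nxt <= end_cur and nxt in args:
                    # next separator ends no further right than the current span:
                    # finish that nested span first, then continue here
                    args.remove(nxt)
                    add_sep(nxt)

        while args:
            add_sep(args.pop(0))
        return boxes

    # Two columns with one full-width separator at y=500 on a part of height 1000:
    print(boxes_from_separators([(0, 0, 2), (500, 0, 2)], bottom=1000))
    # [(0, 1, 0, 500), (1, 2, 0, 500), (0, 1, 500, 1000), (1, 2, 500, 1000)]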
+ + Arguments: + * splitter_y_new: the y coordinates separating the parts + * regions_without_separators: (text) region mask with separators suppressed; + (needed to find per-part columns and to combine separators if possible) + * regions_with_separators: (full) region map with separators suppressed; + (needed to elongate separators if possible) + * matrix_of_seps: type and coordinates of horizontal and vertical separators, + as well as headings + * num_col_classifier: predicted number of columns for the entire page + * erosion_hurts: bool + * tables: bool + * right2left_readingorder: whether to invert the default left-to-right order + + Returns: a tuple of + * the ordered list of bounding boxes + * a list of arrays: the x coordinates delimiting the columns for every page part + (according to splitter) + """ if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) @@ -1576,12 +1463,20 @@ def return_boxes_of_images_by_order_of_reading_new( logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators) #, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): # minx, maxx, miny, maxy = box or (0, None, 0, None) # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) - # xrange = np.arange(0, img.shape[1], 100) - # yrange = np.arange(0, img.shape[0], 100) + # step = max(img.shape) // 10 + # xrange = np.arange(0, img.shape[1], step) + # yrange = np.arange(0, img.shape[0], step) # ax = plt.gca() # ax.set_xticks(xrange) # ax.set_yticks(yrange) @@ -1597,7 +1492,7 @@ def return_boxes_of_images_by_order_of_reading_new( # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, # fill=False, linewidth=1, edgecolor='r')) # if rectangles_showidx: - # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i), c='r') # plt.show() # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") @@ -1606,11 +1501,12 @@ def return_boxes_of_images_by_order_of_reading_new( splitter_y_new = np.array(splitter_y_new, dtype=int) height_tot, width_tot = regions_without_separators.shape big_part = 22 * height_tot // 100 # percent height + _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) - matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & - (matrix_of_lines_ch[:,7] < bot)] + matrix_new = matrix_of_seps_ch[(matrix_of_seps_ch[:,6] >= top) & + (matrix_of_seps_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. 
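The label bookkeeping set up above can be tried on a toy example. A self-contained sketch (mask contents, thresholds and column boundaries are made up) of how each connected component's bounding box is mapped to a span of column indices, which is the basis for both the extra separators and the per-column label map:

    import numpy as np
    import cv2

    mask = np.zeros((200, 300), dtype=np.uint8)
    mask[50:120, 40:260] = 1                    # one region crossing two column boundaries
    peaks = np.array([0, 100, 200, 299])        # x coordinates of column boundaries

    n, labels, stats, _ = cv2.connectedComponentsWithStats(mask)
    for label in range(1, n):                   # label 0 is the background
        left, top, width, height, area = stats[label]
        start = np.flatnonzero(peaks > left)[0] - 1      # last boundary left of the label
        end = np.flatnonzero(peaks >= left + width)[0]   # first boundary right of it
        print("label", label, "covers columns", start, "to", end - 1)
        # -> label 1 covers columns 0 to 2

A label covering two or more columns either contributes two additional separators (along its top and bottom edge) or, via the label map, forces a box that starts in one of its columns to extend to the label's right edge instead of stopping at the next column boundary.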
@@ -1698,19 +1594,9 @@ def return_boxes_of_images_by_order_of_reading_new( # elongate horizontal separators+headings as much as possible without overlap args_nonver = matrix_new[:, 9] != 1 - regions_with_separators = np.copy(regions_without_separators[top:bot]) - for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: - regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 - # def dbg_imshow(box, title): - # xmin, xmax, ymin, ymax = box - # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) - # plt.show() for i in np.flatnonzero(args_nonver): xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] - cut = regions_with_separators[ymin - top: ymax - top] + cut = regions_with_separators[ymin: ymax] # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) starting = xmin - peaks_neg_tot min_start = np.flatnonzero(starting >= 0)[-1] # last left-of @@ -1737,6 +1623,7 @@ def return_boxes_of_images_by_order_of_reading_new( args_hor = matrix_new[:, 9] == 0 x_min_hor_some = matrix_new[:, 2][args_hor] x_max_hor_some = matrix_new[:, 3][args_hor] + y_min_hor_some = matrix_new[:, 6][args_hor] y_max_hor_some = matrix_new[:, 7][args_hor] cy_hor_some = matrix_new[:, 5][args_hor] @@ -1752,412 +1639,144 @@ def return_boxes_of_images_by_order_of_reading_new( # (x +/- 30px to avoid crossing col peaks by accident) x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) + y_min_hor_some = np.append(y_min_hor_some, # toplines + np.concatenate((y_min_hor_head - 2, + y_max_hor_head - 0))) y_max_hor_some = np.append(y_max_hor_some, # baselines - np.concatenate((y_min_hor_head + 2, + np.concatenate((y_min_hor_head + 0, y_max_hor_head + 2))) - cy_hor_some = np.append(cy_hor_some, # toplines - np.concatenate((y_min_hor_head - 2, - y_max_hor_head - 2))) + cy_hor_some = np.append(cy_hor_some, # centerlines + np.concatenate((y_min_hor_head - 1, + y_max_hor_head + 1))) + + # analyse connected components of regions to gain additional separators + # and prepare a map for cross-column boxes + ccounts = np.bincount(ccomps[top: bot].flatten()) + col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(), + minlength=ccounts.size) + for left, right in pairwise(peaks_neg_tot)]) + labelcolmap = dict() + for label, label_count in enumerate(ccounts): + if not label: + continue + label_left, label_top, label_width, label_height, label_area = cstats[label] + # if label_count < 0.9 * label_area: + # # mostly not in this part of the page + # continue + if label_count < 0.01 * (top - bot) * width_tot: + continue + #assert np.sum(col_ccounts[:, label]) == label_count + label_right = label_left + label_width + label_bot = label_top + label_height + label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1 + label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0] + # store as dict for multi-column boxes: + for start in range(label_start, label_end): + labelcolmap.setdefault(start, list()).append( + (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label]))) + # make additional separators: + if label_end - label_start < 2: + continue + if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2: + continue + x_min_hor_some = 
np.append(x_min_hor_some, [label_left] * 2) + x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2) + y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot]) + y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2]) + cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1]) if right2left_readingorder: x_max_hor_some = width_tot - x_min_hor_some x_min_hor_some = width_tot - x_max_hor_some - - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) - - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", + x_starting, x_ending, y_min, y_mid, y_max = return_multicol_separators_x_start_end( + regions_without_separators, peaks_neg_tot, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some) + # dbg_plt([0, None, top, bot], "non-empty multi-column separators in current split", # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + # y_min - top, y_max - top)), True) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) - - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') - - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() - - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] - - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up - - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - 
ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = 
y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with 
child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') + # core algorithm: + # 1. iterate through multi-column separators, pre-ordered by their y coord + # 2. for each separator, iterate from its starting to its ending column + # 3. in each starting column, determine the next downwards separator, + # 4. if there is none, then fill up the column to the bottom; + # otherwise, fill up to that next separator + # 5. moreover, determine the next rightward column that would not cut through + # any regions, advancing to that column, and storing a new in-order bbox + # for that down/right span + # 6. if there was a next separator, and it ends no further than the current one, + # then recurse on that separator from step 1, then continue (with the next + # column for the current separator) at step 2, or (with the next separator + # in order) at step 1 + args = list(range(len(y_mid))) + while len(args): + cur = args[0] + args = args[1:] + # print("iter", cur, y_mid[cur], "%d:%d" % (x_starting[cur], x_ending[cur])) + def get_span(start, y_top, y_bot): + # for last, l_top, l_bot, l_count in labelcolmap.get(start, []): + # if y_top < l_bot and y_bot > l_top and last > start + 1: + # width = (peaks_neg_tot[last] - peaks_neg_tot[start]) + # print("span", start, last, l_top, l_bot, l_count, + # "box area", (y_bot - y_top) * width, + # "label area", (min(y_bot, l_bot) - max(y_top, l_top)) * width, + # "box height", (y_bot - y_top), + # "label height", sum(regions_without_separators[ + # y_top: y_bot, peaks_neg_tot[start + 1]])) + return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) + # yield the right-most column that does not cut through + # any regions in this horizontal span + if y_top < l_bot and y_bot > l_top + # Ignore if it ends here, anyway + and last > start + 1 + # Ensure this is not just a tiny region near larger regions + and l_count > 0.1 * max(l_count2 for _, l_top2, l_bot2, l_count2 in labelcolmap[start] + if y_top < l_bot2 and y_bot > l_top2) + # or just a small cut of the respective region + # (i.e. box should cover at least 10% of the label). + and ((min(y_bot, l_bot) - max(y_top, l_top)) * + (peaks_neg_tot[last] - peaks_neg_tot[start])) > 0.1 * l_count + # But do allow cutting tiny passages with less 10% of height + # (i.e. 
label is already almost separated by columns) + and sum(regions_without_separators[ + y_top: y_bot, peaks_neg_tot[start + 1]]) > 0.1 * (y_bot - y_top)), + # Otherwise advance only 1 column. + default=start + 1) + def add_sep(cur): + column = x_starting[cur] + while column < x_ending[cur]: + nxt = np.flatnonzero((y_mid[cur] < y_mid) & + (column >= x_starting) & + (column < x_ending)) + if len(nxt): + nxt = nxt[0] + # print("column", column) + last = get_span(column, y_max[cur], y_min[nxt]) + last = min(last, x_ending[nxt], x_ending[cur]) + # print("nxt", nxt, y_mid[nxt], "%d:%d" % (column, last)) boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) - else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - - ind_args = np.arange(len(y_mid)) - - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): - #print(column,'cols') - #print('burda2') - 
y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + peaks_neg_tot[last], + y_mid[cur], + y_mid[nxt]]) + # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) + column = last + if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + # child – recur + # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) + args.remove(nxt) + add_sep(nxt) + else: + # print("column", column) + last = get_span(column, y_max[cur], bot) + # print("bot", bot, "%d:%d" % (column, last)) + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[last], + y_mid[cur], + bot]) + # dbg_plt(boxes[-1], "non-recursive column %d box [%d]" % (column, len(boxes))) + column = last + add_sep(cur) if right2left_readingorder: peaks_neg_tot_tables_new = [] From 4475183f08d2c25eb90deb04bda552930abd4ba0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:39:36 +0100 Subject: [PATCH 25/32] improve rules governing column split - reduce `sigma` for smoothing of input to `find_peaks` (so we get deeper gaps between columns) - allow column boundaries closer to the margins (50 instead of 100 or 200 px, 170 instead of 370 px) - allow column boundaries closer to each other (300 instead of 400 px) - add a secondary `grenze` criterion for depth of gap (relative to lowest minimum, if that is smaller than the old criterion relative to lowest maximum) - for calls to `find_num_col` within parts of a page, do allow unbalanced column boundaries --- src/eynollah/utils/__init__.py | 113 +++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index e00004f..570eefe 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -241,7 +241,7 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_separators_0, sigma_) return np.std(z) -def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): +def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False): if not regions_without_separators.any(): return 0, [] regions_without_separators_0 = regions_without_separators.sum(axis=0) @@ -249,13 +249,15 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax1.imshow(regions_without_separators, aspect="auto") # ax2.plot(regions_without_separators_0) # plt.show() - sigma_ = 35 # 70#35 + sigma_ = 25 # 70#35 meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 + last_nonzero = last_nonzero - 50 #- 100 + first_nonzero = first_nonzero + 50 #+ 200 + last_offmargin = len(regions_without_separators_0) - 170 #370 + first_offmargin = 170 #370 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ 
-285,26 +287,34 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax2.axvline(last_nonzero, label="last nonzero") # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) - # ax2.axvline(370, label="first") - # ax2.axvline(len(y) - 370, label="last") - # ax2.text(370, 0, "first", rotation=90) - # ax2.text(len(y) - 370, 0, "last", rotation=90) + # ax2.axvline(first_offmargin, label="first offmargin") + # ax2.axvline(last_offmargin, label="last offmargin") + # ax2.text(first_offmargin, 0, "first offmargin", rotation=90) + # ax2.text(last_offmargin, 0, "last offmargin", rotation=90) # plt.show() peaks_neg = peaks_neg - 10 - 10 + # print("raw peaks", peaks) peaks = peaks[(peaks > 0.06 * len(y)) & (peaks < 0.94 * len(y))] + # print("non-marginal peaks", peaks) interest_pos = z[peaks] + # print("interest_pos", interest_pos) interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] + # plt.plot(z) # plt.show() + #print("raw peaks_neg", peaks_neg) peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < len(y) - 370)] + #print("non-zero peaks_neg", peaks_neg) + peaks_neg = peaks_neg[(peaks_neg > first_offmargin) & + (peaks_neg < last_offmargin)] + #print("non-marginal peaks_neg", peaks_neg) interest_neg = z[peaks_neg] + #print("interest_neg", interest_neg) if not interest_neg.any(): return 0, [] @@ -317,10 +327,14 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_neg = 0 # np.min(interest_neg) + # cutoff criterion: fixed fraction of lowest column height dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 + # extra criterion: fixed multiple of lowest gap height + grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) + # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') @@ -356,18 +370,20 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_fin[0] > 0.75 * len(y) and - peaks_neg_fin[1] > 0.75 * len(y)) or - (peaks_neg_fin[0] < 0.25 * len(y) and - peaks_neg_fin[1] < 0.25 * len(y)) or - (peaks_neg_fin[0] < 0.5 * len(y) - 200 and - peaks_neg_fin[1] < 0.5 * len(y)) or - (peaks_neg_fin[0] > 0.5 * len(y) + 200 and - peaks_neg_fin[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_fin[0] > 0.75 * len(y) or - peaks_neg_fin[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_fin = [] @@ -376,7 +392,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # filter out peaks that are too close (<400px) to each other: # among each group, pick the position with smallest amount of text diff_peaks = 
np.abs(np.diff(peaks_neg_fin)) - cut_off = 400 + cut_off = 300 #400 peaks_neg_true = [] forest = [] # print(len(peaks_neg_fin),'len_') @@ -401,30 +417,32 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_true[0] > 0.75 * len(y) and - peaks_neg_true[1] > 0.75 * len(y)) or - (peaks_neg_true[0] < 0.25 * len(y) and - peaks_neg_true[1] < 0.25 * len(y)) or - (peaks_neg_true[0] < 0.5 * len(y) - 200 and - peaks_neg_true[1] < 0.5 * len(y)) or - (peaks_neg_true[0] > 0.5 * len(y) + 200 and - peaks_neg_true[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_true[0] > 0.75 * len(y) or - peaks_neg_true[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_true = [] - if (num_col == 3 and - (peaks_neg_true[0] < 0.75 * len(y) and - peaks_neg_true[0] > 0.25 * len(y) and - peaks_neg_true[1] > 0.80 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[0]] - if (num_col == 3 and - (peaks_neg_true[1] < 0.75 * len(y) and - peaks_neg_true[1] > 0.25 * len(y) and - peaks_neg_true[0] < 0.20 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[1]] @@ -1151,8 +1169,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) - # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) + assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head) + assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head))) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) @@ -1518,7 +1536,8 @@ def return_boxes_of_images_by_order_of_reading_new( regions_without_separators[top:bot], # we do not expect to get all columns in small parts (headings etc.): num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) + tables, multiplier=6. if erosion_hurts else 7., + unbalanced=True) except: peaks_neg_fin=[] num_col = 0 @@ -1534,7 +1553,7 @@ def return_boxes_of_images_by_order_of_reading_new( if len(peaks_neg_fin)==0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) 
+ num_col_classifier, tables, multiplier=3., unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] From 3c15c4f7d4bf03fee11c54da82ba7d29f09ada5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:29:41 +0100 Subject: [PATCH 26/32] back to `rotate_image` instead of `rotation_image_new` for deskewing (because the latter does not preserve coordinates; it scales, even when resizing the image; this caused coordinate problems when matching deskewed contours) --- src/eynollah/eynollah.py | 58 +++++++++------------------------------- 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2bdb2c7..efd67d5 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -88,12 +88,7 @@ from .utils.contour import ( join_polygons, make_intersection, ) -from .utils.rotate import ( - rotate_image, - rotation_not_90_func, - rotation_not_90_func_full_layout, - rotation_image_new -) +from .utils.rotate import rotate_image from .utils.utils_ocr import ( return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, @@ -3131,11 +3126,9 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( - image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 @@ -3276,20 +3269,9 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3303,20 +3285,9 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - 
text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3465,12 +3436,9 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( - image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + regions_fully_n = rotate_image(regions_fully, slope_deskew) if not self.tables: regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: From 5a778003fde3cc540f3b8b1c00bc6eebee1f9295 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:32:22 +0100 Subject: [PATCH 27/32] contour matching for deskewed image: ensure matches for both sides --- src/eynollah/eynollah.py | 42 +++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index efd67d5..b7c6ddf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4461,42 +4461,42 @@ class Eynollah: dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) corresp = np.zeros(dists.shape, dtype=bool) # keep searching next-closest until at least one correspondence on each side - while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)): idx = np.nanargmin(dists) i, j = np.unravel_index(idx, dists.shape) dists[i, j] = np.nan corresp[i, j] = True - #print("original/deskewed adjacency", corresp.nonzero()) + # print("original/deskewed adjacency", corresp.nonzero()) contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.subplot(1, 4, 1, title="direct corresp contours") # plt.imshow(img1) # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # join deskewed regions mapping to single original ones for i in range(len(contours_only_text_parent)): if np.count_nonzero(corresp[i]) > 1: indices = np.flatnonzero(corresp[i]) - #print("joining", indices) + # print("joining", indices) polygons_d = [contour2polygon(contour) for contour in contours_only_text_parent_d[indices]] contour_d = polygon2contour(join_polygons(polygons_d)) 
contours_only_text_parent_d_ordered[i] = contour_d # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) - # plt.subplot(2, 2, 3, title="joined contours") + # plt.subplot(1, 4, 2, title="joined contours") # plt.imshow(img2) # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # split deskewed regions mapping to multiple original ones def deskew(polygon): polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) - polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + #polygon = shapely.affinity.translate(polygon, *offset.squeeze()) return polygon for j in range(len(contours_only_text_parent_d)): if np.count_nonzero(corresp[:, j]) > 1: indices = np.flatnonzero(corresp[:, j]) - #print("splitting along", indices) + # print("splitting along", indices) polygons = [deskew(contour2polygon(contour)) for contour in contours_only_text_parent[indices]] polygon_d = contour2polygon(contours_only_text_parent_d[j]) @@ -4509,14 +4509,38 @@ class Eynollah: if polygon_d] contours_only_text_parent_d_ordered[indices] = contours_d # cv2.fillPoly(img3, pts=contours_d, color=j + 1) - # plt.subplot(2, 2, 4, title="split contours") + # plt.subplot(1, 4, 3, title="split contours") # plt.imshow(img3) # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 2, title="result contours") + # plt.subplot(1, 4, 4, title="result contours") # plt.imshow(img4) # plt.show() + # from matplotlib import patches as ptchs + # plt.subplot(1, 2, 1, title="undeskewed") + # plt.imshow(text_only) + # centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # ctr = centers[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='blue', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='blue') + # plt.subplot(1, 2, 2, title="deskewed") + # plt.imshow(text_only_d) + # centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d_ordered)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # cnt = polygon2contour(deskew(contour2polygon(cnt))) + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # for i in range(len(contours_only_text_parent_d_ordered)): + # cnt = contours_only_text_parent_d_ordered[i] + # ctr = centers_d[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='red')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='red', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='red') + # plt.show() if not len(contours_only_text_parent): # stop early From 72d059f3c973b942945b62d4463a6ea031043efc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:34:12 +0100 Subject: [PATCH 28/32] reading order: simplify assignment / counting - `do_order_of_regions`: simplify aggregating per-box orders for paragraphs and headings to overall order passed to `xml_reading_order`; no need for `order_and_id_of_texts`, no need to return `id_of_texts_tot` - `do_order_of_regions_with_model`: no need to return `region_ids` - writer: no need to pass `id_of_texts_tot` in `build_pagexml` --- src/eynollah/eynollah.py | 70 +++++++++++++--------------------- src/eynollah/utils/__init__.py | 1 + src/eynollah/writer.py | 6 +-- 3 
files changed, 30 insertions(+), 47 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b7c6ddf..6024646 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -134,7 +134,6 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new ) from .utils.pil_cv2 import check_dpi, pil2cv -from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -2546,9 +2545,7 @@ class Eynollah: args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] + idx = 0 for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) @@ -2557,37 +2554,25 @@ class Eynollah: con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + _, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted): + if kind == 1: + # print(iij, "main", args_contours_box_main[tidx], "becomes", idx) + order_by_con_main[args_contours_box_main[tidx]] = idx + else: + # print(iij, "head", args_contours_box_head[tidx], "becomes", idx) + order_by_con_head[args_contours_box_head[tidx]] = idx + idx += 1 - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) - return order_text_new, id_of_texts_tot + # xml writer will create region ids in order of + # - contours_only_text_parent (main text), followed by + # - contours_only_text_parent (headings), + # and then create regionrefs into these ordered by order_text_new + order_text_new = np.argsort(np.concatenate((order_by_con_main, + order_by_con_head))) + return order_text_new try: results = match_boxes(False) @@ -3600,7 +3585,7 @@ class Eynollah: co_text_all = contours_only_text_parent if not len(co_text_all): - return [], [] + return [] labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] @@ -3683,11 +3668,9 @@ class Eynollah: else: org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) - 
region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return org_contours_indexes, region_ids + return org_contours_indexes else: - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return ordered, region_ids + return ordered def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): width = np.shape(textline_image)[1] @@ -4222,7 +4205,6 @@ class Eynollah: order_text_new = [0] slopes =[0] - id_of_texts_tot =['region_0001'] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4234,7 +4216,7 @@ class Eynollah: ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( - cont_page, page_coord, order_text_new, id_of_texts_tot, + cont_page, page_coord, order_text_new, all_found_textline_polygons, page_coord, [], [], [], [], [], [], [], slopes, [], [], @@ -4736,14 +4718,14 @@ class Eynollah: self.logger.info("Headers ignored in reading order") if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + order_text_new = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") @@ -4840,7 +4822,7 @@ class Eynollah: if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, @@ -4853,7 +4835,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 570eefe..20766a8 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1158,6 +1158,7 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): # cnt = (contours_main if type_ == 1 else contours_head)[idx] # col = 'red' if type_ == 1 else 'blue' # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.text(cx - x_ref, cy - y_ref, str(idx), c=col) # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) # plt.show() diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 9c3456a..f8aff62 100644 --- a/src/eynollah/writer.py +++ 
b/src/eynollah/writer.py @@ -89,7 +89,7 @@ class EynollahXmlWriter: def build_pagexml_no_full_layout( self, found_polygons_text_region, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, @@ -102,7 +102,7 @@ class EynollahXmlWriter: **kwargs): return self.build_pagexml_full_layout( found_polygons_text_region, [], - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, [], all_box_coord, [], found_polygons_text_region_img, found_polygons_tables, [], @@ -116,7 +116,7 @@ class EynollahXmlWriter: def build_pagexml_full_layout( self, found_polygons_text_region, found_polygons_text_region_h, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, From 49ab269e085505940a17c355905795d91777a451 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 15:46:08 +0100 Subject: [PATCH 29/32] fix typos found by ruff --- src/eynollah/sbb_binarize.py | 2 +- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 0eab2ae..b81f45e 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -328,7 +328,7 @@ class SbbBinarizer: print(input_path, 'image_name') if os.path.exists(output_path): if overwrite: - self.logger.warning("will overwrite existing output file '%s'", output_ptah) + self.logger.warning("will overwrite existing output file '%s'", output_path) else: self.logger.warning("will skip input for existing output file '%s'", output_path) image = cv2.imread(input_path) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 20766a8..7be1fd0 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -146,7 +146,7 @@ def return_multicol_separators_x_start_end( args_emptysep.add(i) # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") args_to_be_kept = [arg for arg in args_ysorted - if not arg in args_emptysep] + if arg not in args_emptysep] x_start = x_start[args_to_be_kept] x_end = x_end[args_to_be_kept] y_min = y_min[args_to_be_kept] From 028ed169212df4a1048b26d691e1edc53592f230 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 17:17:37 +0100 Subject: [PATCH 30/32] adapt ocrd-sbb-binarize --- src/eynollah/ocrd_cli_binarization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 848bbac..6289517 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -70,7 +70,7 @@ class SbbBinarizeProcessor(Processor): if oplevel == 'page': self.logger.info("Binarizing on 'page' level in page '%s'", page_id) - page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + page_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(page_image), use_patches=True)) # update PAGE (reference the image file): page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') page.add_AlternativeImage(page_image_ref) @@ -83,7 +83,7 @@ class SbbBinarizeProcessor(Processor): for region in regions: region_image, region_xywh = 
self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True)) + region_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(region_image), use_patches=True)) # update PAGE (reference the image file): region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') region.add_AlternativeImage(region_image_ref) @@ -95,7 +95,7 @@ class SbbBinarizeProcessor(Processor): self.logger.warning("Page '%s' contains no text lines", page_id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True)) + line_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(line_image), use_patches=True)) # update PAGE (reference the image file): line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') line.add_AlternativeImage(region_image_ref) From 406288b1fed020c2a68e20114ec51fe4d7f580f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 20:13:58 +0100 Subject: [PATCH 31/32] fixup 72d059f3: forgot to update other writer calls --- src/eynollah/eynollah.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6024646..46a1704 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4164,7 +4164,7 @@ class Eynollah: image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, [], [], [], [], [], [], [], [], [], cont_page, [], []) if self.plotter: @@ -4282,7 +4282,7 @@ class Eynollah: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], cont_page, [], []) return pcgts @@ -4529,7 +4529,7 @@ class Eynollah: empty_marginals = [[]] * len(polygons_of_marginals) if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - [], [], page_coord, [], [], [], [], [], [], + [], [], page_coord, [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, @@ -4538,7 +4538,7 @@ class Eynollah: cont_page, polygons_seplines) else: pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, From e428e7ad78629d9d4a39fa9c49f88aa4c6244139 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 16 Nov 2025 12:17:29 +0100 Subject: [PATCH 32/32] ensure separators stay within image bounds --- src/eynollah/utils/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7be1fd0..307d8f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1400,6 +1400,14 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) + # 
ensure no seps are out of bounds + matrix_of_seps_ch[:, 1] = np.maximum(np.minimum(matrix_of_seps_ch[:, 1], region_pre_p.shape[1]), 0) + matrix_of_seps_ch[:, 2] = np.maximum(matrix_of_seps_ch[:, 2], 0) + matrix_of_seps_ch[:, 3] = np.minimum(matrix_of_seps_ch[:, 3], region_pre_p.shape[1]) + matrix_of_seps_ch[:, 5] = np.maximum(np.minimum(matrix_of_seps_ch[:, 5], region_pre_p.shape[0]), 0) + matrix_of_seps_ch[:, 6] = np.maximum(matrix_of_seps_ch[:, 6], 0) + matrix_of_seps_ch[:, 7] = np.minimum(matrix_of_seps_ch[:, 7], region_pre_p.shape[0]) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & (x_max_seps_hor>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, special_separators) @@ -1621,7 +1629,7 @@ def return_boxes_of_images_by_order_of_reading_new( starting = xmin - peaks_neg_tot min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = xmax - peaks_neg_tot - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of # skip elongation unless this is already a multi-column separator/heading: if not max_end - min_start > 1: continue
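
Illustration for [PATCH 27/32] (contour matching for deskewed image): the hunk builds a pairwise distance matrix between the centers of the original and the deskewed region contours and greedily marks the globally nearest pair until each side is covered; the fix is switching the loop condition from `and` to `or`, so that every row and every column is guaranteed at least one correspondence. A minimal runnable sketch of that greedy loop, with synthetic centers (the names `centers`, `centers_d`, `dists`, `corresp` follow the hunk; the coordinate values are made up):

    import numpy as np

    centers   = np.array([[10., 50., 90.], [20., 20., 20.]])   # 2 x N, original regions
    centers_d = np.array([[12., 52., 91.], [22., 21., 19.]])   # 2 x M, deskewed regions

    N, M = centers.shape[1], centers_d.shape[1]
    dists = np.zeros((N, M))
    for i in range(N):
        dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0)

    corresp = np.zeros(dists.shape, dtype=bool)
    # 'or' (not 'and'): keep taking the next-closest pair until every original
    # region (row) AND every deskewed region (column) has a correspondence
    while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)):
        i, j = np.unravel_index(np.nanargmin(dists), dists.shape)
        dists[i, j] = np.nan        # exclude this pair from further searches
        corresp[i, j] = True

    print(corresp.nonzero())        # adjacency between original and deskewed regions

The subsequent steps in the hunk then join deskewed contours that map to a single original region and split deskewed contours that map to several.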
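Illustration for [PATCH 28/32] (reading order simplification): while walking the boxes, the new `do_order_of_regions` assigns one global running index `idx` per contour, written into `order_by_con_main` or `order_by_con_head` depending on its kind; a single `np.argsort` over the concatenation then yields the reading order across main regions followed by headings, matching the order in which the XML writer enumerates them. A toy example of that final step (the position values are made up):

    import numpy as np

    # global positions assigned while walking the boxes in reading order
    order_by_con_main = np.array([2, 0, 3])   # main-text regions 0..2
    order_by_con_head = np.array([1, 4])      # heading regions 0..1

    # regions are concatenated main-first; argsort maps each reading-order
    # rank to an index into that combined list
    order_text_new = np.argsort(np.concatenate((order_by_con_main, order_by_con_head)))
    print(order_text_new)   # [1 3 0 2 4] -> main#1, head#0, main#0, main#2, head#1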
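Illustration for [PATCH 32/32] (separator bounds): the hunk clips the x-related columns (1-3) of the separator matrix against the image width and the y-related columns (5-7) against the image height, so later slicing cannot index outside the page. A self-contained sketch of the same clamping, with a made-up separator row; `np.clip` stands in for the equivalent `np.maximum(np.minimum(...), 0)` pattern, and the comments only mirror which bound the hunk applies to each column, not its exact meaning:

    import numpy as np

    h, w = 1000, 800                  # region_pre_p.shape[:2]
    seps = np.array([[0, -5, -5, 820, 0, 990, 990, 1010, 0]])   # one separator row

    seps[:, 1] = np.clip(seps[:, 1], 0, w)      # clamped to [0, width] in the hunk
    seps[:, 2] = np.maximum(seps[:, 2], 0)      # lower bound only
    seps[:, 3] = np.minimum(seps[:, 3], w)      # upper bound only
    seps[:, 5] = np.clip(seps[:, 5], 0, h)      # clamped to [0, height]
    seps[:, 6] = np.maximum(seps[:, 6], 0)
    seps[:, 7] = np.minimum(seps[:, 7], h)
    print(seps)   # x values now within [0, 800], y values within [0, 1000]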